From 7c6cad2da3256cfd006e6a584358420e3cb1343e Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Feb 2021 23:37:26 +0900 Subject: [PATCH 001/155] CI: Upgrade sccache to 0.2.15 --- .github/workflows/rav1e.yml | 7 ++++--- .travis/install-sccache.sh | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index 6b9186e02f..d1a9578ba3 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -105,11 +105,12 @@ jobs: - name: Install sccache env: LINK: https://github.com/mozilla/sccache/releases/download - SCCACHE_VERSION: 0.2.13 + SCCACHE_VERSION: 0.2.15 run: | - SCCACHE_FILE=sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl + SCCACHE_FILE=sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl mkdir -p $HOME/.local/bin - curl -L "$LINK/$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz + curl -L "$LINK/v$SCCACHE_VERSION/$SCCACHE_FILE.tar.gz" | tar xz + chmod +x $SCCACHE_FILE/sccache mv -f $SCCACHE_FILE/sccache $HOME/.local/bin/sccache echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Add nasm diff --git a/.travis/install-sccache.sh b/.travis/install-sccache.sh index 0fb9032282..2fe613a673 100755 --- a/.travis/install-sccache.sh +++ b/.travis/install-sccache.sh @@ -1,15 +1,16 @@ #!/bin/bash set -ex -SCCACHE_VERSION="0.2.13" +SCCACHE_VERSION="0.2.15" export RUSTC_WRAPPER=sccache if [ "$(sccache --version)" = "sccache $SCCACHE_VERSION" ]; then echo "Using cached directory." elif [ "$ARCH" = "x86_64" ]; then - curl -L "https://github.com/mozilla/sccache/releases/download/$SCCACHE_VERSION/sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz" | tar xz - mv -f "sccache-$SCCACHE_VERSION-x86_64-unknown-linux-musl/sccache" "$DEPS_DIR/bin/sccache" + curl -L "https://github.com/mozilla/sccache/releases/download/v$SCCACHE_VERSION/sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl.tar.gz" | tar xz + mv -f "sccache-v$SCCACHE_VERSION-x86_64-unknown-linux-musl/sccache" "$DEPS_DIR/bin/sccache" + chmod +x "$DEPS_DIR/bin/sccache" else RUSTC_WRAPPER='' cargo install --version "$SCCACHE_VERSION" --root "$DEPS_DIR" --no-default-features sccache fi From 61133e3434ea113ab4bb31c526a170ca594fccaf Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Feb 2021 18:05:24 +0900 Subject: [PATCH 002/155] Add missing macro calls for symbol_with_update --- src/context/block_unit.rs | 43 +++++++------------- src/context/mod.rs | 85 +++++++++++++++++++-------------------- 2 files changed, 56 insertions(+), 72 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index d5299f190a..b3dd167caf 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -1702,20 +1702,10 @@ impl<'a> ContextWriter<'a> { ); if mv_joint_vertical(j) { - encode_mv_component( - w, - diff.row as i32, - &mut self.fc.nmv_context.comps[0], - mv_precision, - ); + self.encode_mv_component(w, diff.row as i32, 0, mv_precision); } if mv_joint_horizontal(j) { - encode_mv_component( - w, - diff.col as i32, - &mut self.fc.nmv_context.comps[1], - mv_precision, - ); + self.encode_mv_component(w, diff.col as i32, 1, mv_precision); } } @@ -1723,10 +1713,20 @@ impl<'a> ContextWriter<'a> { &mut self, w: &mut dyn Writer, bo: TileBlockOffset, multi: bool, planes: usize, ) { - fn write_block_delta(w: &mut dyn Writer, cdf: &mut [u16], delta: i8) { + let block = &self.bc.blocks[bo]; + let deltas_count = if multi { FRAME_LF_COUNT + planes - 3 } else { 1 }; + let deltas = &block.deblock_deltas[..deltas_count]; + let cdf1 = &mut 
[self.fc.deblock_delta_cdf]; + let cdfs = if multi { + &mut self.fc.deblock_delta_multi_cdf[..deltas_count] + } else { + cdf1 + }; + + for (&delta, cdf) in deltas.iter().zip(cdfs.iter_mut()) { let abs = delta.abs() as u32; - w.symbol_with_update(cmp::min(abs, DELTA_LF_SMALL), cdf); + symbol_with_update!(self, w, cmp::min(abs, DELTA_LF_SMALL), cdf); if abs >= DELTA_LF_SMALL { let bits = msb(abs as i32 - 1) as u32; @@ -1737,21 +1737,6 @@ impl<'a> ContextWriter<'a> { w.bool(delta < 0, 16384); } } - - let block = &self.bc.blocks[bo]; - if multi { - let deltas_count = FRAME_LF_COUNT + planes - 3; - let deltas = &block.deblock_deltas[..deltas_count]; - let cdfs = &mut self.fc.deblock_delta_multi_cdf[..deltas_count]; - - for (&delta, cdf) in deltas.iter().zip(cdfs.iter_mut()) { - write_block_delta(w, cdf, delta); - } - } else { - let delta = block.deblock_deltas[0]; - let cdf = &mut self.fc.deblock_delta_cdf; - write_block_delta(w, cdf, delta); - } } pub fn write_is_inter( diff --git a/src/context/mod.rs b/src/context/mod.rs index 2460d368ef..bb183a969e 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -183,56 +183,55 @@ pub fn get_mv_class(z: u32, offset: &mut u32) -> usize { c } -pub fn encode_mv_component( - w: &mut dyn Writer, comp: i32, mvcomp: &mut NMVComponent, - precision: MvSubpelPrecision, -) { - assert!(comp != 0); - assert!(MV_LOW <= comp && comp <= MV_UPP); - let mut offset: u32 = 0; - let sign: u32 = if comp < 0 { 1 } else { 0 }; - let mag: u32 = if sign == 1 { -comp as u32 } else { comp as u32 }; - let mv_class = get_mv_class(mag - 1, &mut offset); - let d = offset >> 3; // int mv data - let fr = (offset >> 1) & 3; // fractional mv data - let hp = offset & 1; // high precision mv data - - // Sign - w.symbol_with_update(sign, &mut mvcomp.sign_cdf); - - // Class - w.symbol_with_update(mv_class as u32, &mut mvcomp.classes_cdf); - - // Integer bits - if mv_class == MV_CLASS_0 { - w.symbol_with_update(d, &mut mvcomp.class0_cdf); - } else { - let n = mv_class + CLASS0_BITS - 1; // number of bits - for i in 0..n { - w.symbol_with_update((d >> i) & 1, &mut mvcomp.bits_cdf[i]); +impl<'a> ContextWriter<'a> { + pub fn encode_mv_component( + &mut self, w: &mut dyn Writer, comp: i32, axis: usize, + precision: MvSubpelPrecision, + ) { + assert!(comp != 0); + assert!(MV_LOW <= comp && comp <= MV_UPP); + let mvcomp = &mut self.fc.nmv_context.comps[axis]; + let mut offset: u32 = 0; + let sign: u32 = if comp < 0 { 1 } else { 0 }; + let mag: u32 = if sign == 1 { -comp as u32 } else { comp as u32 }; + let mv_class = get_mv_class(mag - 1, &mut offset); + let d = offset >> 3; // int mv data + let fr = (offset >> 1) & 3; // fractional mv data + let hp = offset & 1; // high precision mv data + + // Sign + symbol_with_update!(self, w, sign, &mut mvcomp.sign_cdf); + + // Class + symbol_with_update!(self, w, mv_class as u32, &mut mvcomp.classes_cdf); + + // Integer bits + if mv_class == MV_CLASS_0 { + symbol_with_update!(self, w, d, &mut mvcomp.class0_cdf); + } else { + let n = mv_class + CLASS0_BITS - 1; // number of bits + for i in 0..n { + symbol_with_update!(self, w, (d >> i) & 1, &mut mvcomp.bits_cdf[i]); + } } - } - // Fractional bits - if precision > MvSubpelPrecision::MV_SUBPEL_NONE { - w.symbol_with_update( - fr, - if mv_class == MV_CLASS_0 { + // Fractional bits + if precision > MvSubpelPrecision::MV_SUBPEL_NONE { + let cdf = if mv_class == MV_CLASS_0 { &mut mvcomp.class0_fp_cdf[d as usize] } else { &mut mvcomp.fp_cdf - }, - ); - } + }; + symbol_with_update!(self, w, fr, cdf); + } - // 
High precision bit - if precision > MvSubpelPrecision::MV_SUBPEL_LOW_PRECISION { - w.symbol_with_update( - hp, - if mv_class == MV_CLASS_0 { + // High precision bit + if precision > MvSubpelPrecision::MV_SUBPEL_LOW_PRECISION { + let cdf = if mv_class == MV_CLASS_0 { &mut mvcomp.class0_hp_cdf } else { &mut mvcomp.hp_cdf - }, - ); + }; + symbol_with_update!(self, w, hp, cdf); + } } } From f9c026f0ef63c22452013abe18f9f15e4283869c Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Feb 2021 15:29:37 +0900 Subject: [PATCH 003/155] Implement a CDF log for rollback Co-Authored-By: Luca Barbato --- src/context/cdf_context.rs | 55 ++++++++++++++++++++++++++++++++++---- src/ec.rs | 11 ++++++-- src/encoder.rs | 2 ++ 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 6d42beb0c7..9bde0595b9 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -498,7 +498,7 @@ impl fmt::Debug for CDFContext { #[macro_use] macro_rules! symbol_with_update { ($self:ident, $w:ident, $s:expr, $cdf:expr) => { - $w.symbol_with_update($s, $cdf); + $w.symbol_with_update($s, $cdf, &mut $self.fc_log); #[cfg(feature = "desync_finder")] { let cdf: &[_] = $cdf; @@ -511,13 +511,53 @@ macro_rules! symbol_with_update { #[derive(Clone)] pub struct ContextWriterCheckpoint { - pub fc: CDFContext, + pub fc: usize, pub bc: BlockContextCheckpoint, } +pub struct CDFContextLog { + base: usize, + data: Vec, +} + +impl CDFContextLog { + fn new(fc: &CDFContext) -> Self { + Self { base: fc as *const _ as usize, data: Vec::with_capacity(256 * 1024) } + } + fn checkpoint(&self) -> usize { + self.data.len() + } + pub fn push(&mut self, cdf: &[u16]) { + let offset = cdf.as_ptr() as usize - self.base; + debug_assert!(offset <= u16::MAX.into()); + self.data.extend_from_slice(cdf); + self.data.extend_from_slice(&[offset as u16, cdf.len() as u16]); + } + pub fn rollback(&mut self, fc: &mut CDFContext, checkpoint: usize) { + let base = fc as *mut _ as *mut u8; + while self.data.len() > checkpoint { + if let Some(len) = self.data.pop() { + if let Some(offset) = self.data.pop() { + let len = len as usize; + let src = &self.data[self.data.len() - len]; + unsafe { + let dst = base.add(offset as usize) as *mut u16; + dst.copy_from_nonoverlapping(src, len); + } + self.data.truncate(self.data.len() - len); + } + } + } + } + pub fn clear(&mut self) { + self.data.clear(); + } +} + pub struct ContextWriter<'a> { pub bc: BlockContext<'a>, pub fc: &'a mut CDFContext, + pub fc_log: CDFContextLog, #[cfg(feature = "desync_finder")] pub fc_map: Option, // For debugging purposes } @@ -525,10 +565,12 @@ pub struct ContextWriter<'a> { impl<'a> ContextWriter<'a> { #[allow(clippy::let_and_return)] pub fn new(fc: &'a mut CDFContext, bc: BlockContext<'a>) -> Self { + let fc_log = CDFContextLog::new(fc); #[allow(unused_mut)] let mut cw = ContextWriter { fc, bc, + fc_log, #[cfg(feature = "desync_finder")] fc_map: Default::default(), }; @@ -546,12 +588,15 @@ impl<'a> ContextWriter<'a> { (if element > 0 { cdf[element - 1] } else { 32768 }) - cdf[element] } - pub const fn checkpoint(&self) -> ContextWriterCheckpoint { - ContextWriterCheckpoint { fc: *self.fc, bc: self.bc.checkpoint() } + pub fn checkpoint(&self) -> ContextWriterCheckpoint { + ContextWriterCheckpoint { + fc: self.fc_log.checkpoint(), + bc: self.bc.checkpoint(), + } } pub fn rollback(&mut self, checkpoint: &ContextWriterCheckpoint) { - *self.fc = checkpoint.fc; + self.fc_log.rollback(&mut self.fc, checkpoint.fc); 
self.bc.rollback(&checkpoint.bc); #[cfg(feature = "desync_finder")] { diff --git a/src/ec.rs b/src/ec.rs index a6e0d7f592..9fdd24398a 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -42,7 +42,10 @@ pub trait Writer { /// leaves cdf unchanged fn symbol_bits(&self, s: u32, cdf: &[u16]) -> u32; /// Write a symbol s, using the passed in cdf reference; updates the referenced cdf. - fn symbol_with_update(&mut self, s: u32, cdf: &mut [u16]); + fn symbol_with_update( + &mut self, s: u32, cdf: &mut [u16], + log: &mut crate::context::CDFContextLog, + ); /// Write a bool using passed in probability fn bool(&mut self, val: bool, f: u16); /// Write a single bit with flat proability @@ -535,7 +538,10 @@ where /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value /// must be exactly 32768. There should be at most 16 values. - fn symbol_with_update(&mut self, s: u32, cdf: &mut [u16]) { + fn symbol_with_update( + &mut self, s: u32, cdf: &mut [u16], + log: &mut crate::context::CDFContextLog, + ) { let nsymbs = cdf.len() - 1; #[cfg(feature = "desync_finder")] { @@ -543,6 +549,7 @@ where self.print_backtrace(s); } } + log.push(cdf); self.symbol(s, &cdf[..nsymbs]); update_cdf(cdf, s); diff --git a/src/encoder.rs b/src/encoder.rs index ba6de68b3a..756f82bed0 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -3207,6 +3207,8 @@ fn encode_tile<'a, T: Pixel>( cw.bc.reset_left_contexts(planes); for sbx in 0..ts.sb_width { + cw.fc_log.clear(); + let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); let mut sbs_qe = SBSQueueEntry { sbo: tile_sbo, From 351f7c8a002695ccbd3307d931d2ba3a8d5a3913 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Feb 2021 23:22:48 +0900 Subject: [PATCH 004/155] Use fixed-size reads and writes for CDF log --- src/context/cdf_context.rs | 48 ++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 9bde0595b9..a56108cf9b 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -9,7 +9,10 @@ use super::*; +const CDF_LEN_MAX: usize = 16 + 1; + #[derive(Clone, Copy)] +#[repr(C)] pub struct CDFContext { pub partition_cdf: [[u16; EXT_PARTITION_TYPES + 1]; PARTITION_CONTEXTS], pub kf_y_cdf: [[[u16; INTRA_MODES + 1]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], @@ -70,6 +73,8 @@ pub struct CDFContext { [[[[u16; 4 + 1]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], pub coeff_br_cdf: [[[[u16; BR_CDF_SIZE + 1]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + + padding: [u16; CDF_LEN_MAX], } impl CDFContext { @@ -131,6 +136,8 @@ impl CDFContext { coeff_base_eob_cdf: av1_default_coeff_base_eob_multi_cdfs[qctx], coeff_base_cdf: av1_default_coeff_base_multi_cdfs[qctx], coeff_br_cdf: av1_default_coeff_lps_multi_cdfs[qctx], + + padding: [0; CDF_LEN_MAX], } } @@ -517,36 +524,47 @@ pub struct ContextWriterCheckpoint { pub struct CDFContextLog { base: usize, - data: Vec, + data: Vec<[u16; CDF_LEN_MAX + 1]>, } impl CDFContextLog { fn new(fc: &CDFContext) -> Self { - Self { base: fc as *const _ as usize, data: Vec::with_capacity(256 * 1024) } + Self { base: fc as *const _ as usize, data: Vec::with_capacity(1 << 15) } } fn checkpoint(&self) -> usize { self.data.len() } + #[inline(always)] pub fn push(&mut self, cdf: &[u16]) { let offset = cdf.as_ptr() as usize - self.base; debug_assert!(offset <= u16::MAX.into()); - self.data.extend_from_slice(cdf); - self.data.extend_from_slice(&[offset as u16, 
cdf.len() as u16]); + unsafe { + // Maintain an invariant of non-zero spare capacity, so that branching + // may be deferred until writes are issued. Benchmarks indicate this is + // faster than first testing capacity and possibly reallocating. + let len = self.data.len(); + debug_assert!(len < self.data.capacity()); + let entry = self.data.get_unchecked_mut(len); + let dst = entry.as_mut_ptr(); + dst.copy_from_nonoverlapping(cdf.as_ptr(), CDF_LEN_MAX); + entry[CDF_LEN_MAX] = offset as u16; + self.data.set_len(len + 1); + self.data.reserve(1); + } } + #[inline(always)] pub fn rollback(&mut self, fc: &mut CDFContext, checkpoint: usize) { let base = fc as *mut _ as *mut u8; - while self.data.len() > checkpoint { - if let Some(len) = self.data.pop() { - if let Some(offset) = self.data.pop() { - let len = len as usize; - let src = &self.data[self.data.len() - len]; - unsafe { - let dst = base.add(offset as usize) as *mut u16; - dst.copy_from_nonoverlapping(src, len); - } - self.data.truncate(self.data.len() - len); - } + let mut len = self.data.len(); + unsafe { + while len > checkpoint { + len -= 1; + let src = self.data.get_unchecked_mut(len); + let offset = src[CDF_LEN_MAX] as usize; + let dst = base.add(offset) as *mut u16; + dst.copy_from_nonoverlapping(src.as_ptr(), CDF_LEN_MAX); } + self.data.set_len(len); } } pub fn clear(&mut self) { From 211781dab078b07444f4c0d93ae313e949905fa0 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sat, 6 Feb 2021 15:06:40 +0900 Subject: [PATCH 005/155] Move CDF counter to position of last symbol The lower bits of the last symbol's value may hold the counter, without affecting the entropy coding arithmetic. --- src/asm/x86/ec.rs | 5 +++-- src/context/cdf_context.rs | 34 +++++++++++++++++----------------- src/ec.rs | 19 +++++++++++-------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/asm/x86/ec.rs b/src/asm/x86/ec.rs index 0b60acd7a2..546006ab6b 100644 --- a/src/asm/x86/ec.rs +++ b/src/asm/x86/ec.rs @@ -26,8 +26,8 @@ pub fn update_cdf(cdf: &mut [u16], val: u32) { #[inline] unsafe fn update_cdf_4_sse2(cdf: &mut [u16], val: u32) { let nsymbs = 4; - let rate = 5 + (cdf[nsymbs] >> 4) as usize; - cdf[nsymbs] += (cdf[nsymbs] < 32) as u16; + let rate = 5 + (cdf[nsymbs - 1] >> 4) as usize; + let count = cdf[nsymbs - 1] + (cdf[nsymbs - 1] < 32) as u16; // A bit of explanation of what is happening down here. First of all, let's look at the simple // implementation: @@ -86,6 +86,7 @@ unsafe fn update_cdf_4_sse2(cdf: &mut [u16], val: u32) { let result = _mm_sub_epi16(cdf_simd, fixed_if_lt_val); _mm_storel_epi64(cdf.as_mut_ptr() as *mut __m128i, result); + cdf[nsymbs - 1] = count; } #[cfg(test)] diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index a56108cf9b..016881d583 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -143,10 +143,9 @@ impl CDFContext { pub fn reset_counts(&mut self) { macro_rules! reset_1d { - ($field:expr) => { - let r = $field.last_mut().unwrap(); - *r = 0; - }; + ($field:expr) => {{ + $field[$field.len() - 2] = 0; + }}; } macro_rules! 
reset_2d { ($field:expr) => { @@ -171,21 +170,21 @@ impl CDFContext { } for i in 0..4 { - self.partition_cdf[i][4] = 0; + self.partition_cdf[i][3] = 0; } for i in 4..16 { - self.partition_cdf[i][10] = 0; + self.partition_cdf[i][9] = 0; } for i in 16..20 { - self.partition_cdf[i][8] = 0; + self.partition_cdf[i][7] = 0; } reset_3d!(self.kf_y_cdf); reset_2d!(self.y_mode_cdf); for i in 0..INTRA_MODES { - self.uv_mode_cdf[0][i][UV_INTRA_MODES - 1] = 0; - self.uv_mode_cdf[1][i][UV_INTRA_MODES] = 0; + self.uv_mode_cdf[0][i][UV_INTRA_MODES - 2] = 0; + self.uv_mode_cdf[1][i][UV_INTRA_MODES - 1] = 0; } reset_1d!(self.cfl_sign_cdf); reset_2d!(self.cfl_alpha_cdf); @@ -195,23 +194,23 @@ impl CDFContext { for i in 0..TX_SIZE_SQR_CONTEXTS { for j in 0..INTRA_MODES { - self.intra_tx_cdf[1][i][j][7] = 0; - self.intra_tx_cdf[2][i][j][5] = 0; + self.intra_tx_cdf[1][i][j][6] = 0; + self.intra_tx_cdf[2][i][j][4] = 0; } - self.inter_tx_cdf[1][i][16] = 0; - self.inter_tx_cdf[2][i][12] = 0; - self.inter_tx_cdf[3][i][2] = 0; + self.inter_tx_cdf[1][i][15] = 0; + self.inter_tx_cdf[2][i][11] = 0; + self.inter_tx_cdf[3][i][1] = 0; } for i in 0..TX_SIZE_CONTEXTS { - self.tx_size_cdf[0][i][MAX_TX_DEPTH] = 0; + self.tx_size_cdf[0][i][MAX_TX_DEPTH - 1] = 0; } reset_2d!(self.tx_size_cdf[1]); reset_2d!(self.tx_size_cdf[2]); reset_2d!(self.tx_size_cdf[3]); for i in 0..TXFM_PARTITION_CONTEXTS { - self.txfm_partition_cdf[i][2] = 0; + self.txfm_partition_cdf[i][1] = 0; } reset_2d!(self.skip_cdfs); @@ -603,7 +602,8 @@ impl<'a> ContextWriter<'a> { } pub fn cdf_element_prob(cdf: &[u16], element: usize) -> u16 { - (if element > 0 { cdf[element - 1] } else { 32768 }) - cdf[element] + (if element > 0 { cdf[element - 1] } else { 32768 }) + - (if element + 2 < cdf.len() { cdf[element] } else { 0 }) } pub fn checkpoint(&self) -> ContextWriterCheckpoint { diff --git a/src/ec.rs b/src/ec.rs index 9fdd24398a..43a16df589 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -519,14 +519,15 @@ where /// `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value - /// must be exactly 32768. There should be at most 16 values. + /// must be greater than 32704. There should be at most 16 values. + /// The lower 6 bits of the last value hold the count. #[inline(always)] fn symbol(&mut self, s: u32, cdf: &[u16]) { - debug_assert!(cdf[cdf.len() - 1] == 0); + debug_assert!(cdf[cdf.len() - 1] < (1 << EC_PROB_SHIFT)); let nms = cdf.len() - s as usize; let fl = if s > 0 { cdf[s as usize - 1] } else { 32768 }; let fh = cdf[s as usize]; - debug_assert!(fh <= fl); + debug_assert!((fh >> EC_PROB_SHIFT) <= (fl >> EC_PROB_SHIFT)); debug_assert!(fl <= 32768); self.store(fl, fh, nms as u16); } @@ -537,7 +538,8 @@ where /// `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value - /// must be exactly 32768. There should be at most 16 values. + /// must be greater 32704. There should be at most 16 values. + /// The lower 6 bits of the last value hold the count. fn symbol_with_update( &mut self, s: u32, cdf: &mut [u16], log: &mut crate::context::CDFContextLog, @@ -560,10 +562,11 @@ where /// `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value - /// must be exactly 32768. There should be at most 16 values. + /// must be greater than 32704. 
There should be at most 16 values. + /// The lower 6 bits of the last value hold the count. fn symbol_bits(&self, s: u32, cdf: &[u16]) -> u32 { let mut bits = 0; - debug_assert!(cdf[cdf.len() - 1] == 0); + debug_assert!(cdf[cdf.len() - 1] < (1 << EC_PROB_SHIFT)); debug_assert!(32768 <= self.rng); let rng = (self.rng >> 8) as u32; let fh = cdf[s as usize] as u32 >> EC_PROB_SHIFT; @@ -888,8 +891,8 @@ pub(crate) mod rust { // Function to update the CDF for Writer calls that do so. pub fn update_cdf(cdf: &mut [u16], val: u32) { let nsymbs = cdf.len() - 1; - let rate = 3 + (nsymbs >> 1).min(2) + (cdf[nsymbs] >> 4) as usize; - cdf[nsymbs] += 1 - (cdf[nsymbs] >> 5); + let rate = 3 + (nsymbs >> 1).min(2) + (cdf[nsymbs - 1] >> 4) as usize; + cdf[nsymbs - 1] += 1 - (cdf[nsymbs - 1] >> 5); // Single loop (faster) for (i, v) in cdf[..nsymbs - 1].iter_mut().enumerate() { From d0e25a481cee8e915ae454d758313a1eea321304 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sat, 6 Feb 2021 23:26:12 +0900 Subject: [PATCH 006/155] Remove trailing zero from CDFs Now the counter has moved, this position is never read nor modified. --- src/asm/x86/ec.rs | 8 +-- src/context/block_unit.rs | 26 ++++---- src/context/cdf_context.rs | 108 +++++++++++++++++----------------- src/context/frame_header.rs | 7 +-- src/context/partition_unit.rs | 2 +- src/context/transform_unit.rs | 6 +- src/ec.rs | 5 +- src/util/cdf.rs | 4 +- 8 files changed, 79 insertions(+), 87 deletions(-) diff --git a/src/asm/x86/ec.rs b/src/asm/x86/ec.rs index 546006ab6b..f6db6a5f3f 100644 --- a/src/asm/x86/ec.rs +++ b/src/asm/x86/ec.rs @@ -13,7 +13,7 @@ use std::arch::x86_64::*; #[inline(always)] pub fn update_cdf(cdf: &mut [u16], val: u32) { - if cdf.len() == 5 { + if cdf.len() == 4 { return unsafe { update_cdf_4_sse2(cdf, val); }; @@ -95,8 +95,8 @@ mod test { #[test] fn update_cdf_4_sse2() { - let mut cdf = [7296, 3819, 1616, 0, 0]; - let mut cdf2 = [7296, 3819, 1616, 0, 0]; + let mut cdf = [7296, 3819, 1616, 0]; + let mut cdf2 = [7296, 3819, 1616, 0]; for i in 0..4 { rust::update_cdf(&mut cdf, i); unsafe { @@ -105,7 +105,7 @@ mod test { assert_eq!(cdf, cdf2); } - let mut cdf = [7297, 3820, 1617, 0, 0]; + let mut cdf = [7297, 3820, 1617, 0]; let mut cdf2 = cdf.clone(); for i in 0..4 { rust::update_cdf(&mut cdf, i); diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index b3dd167caf..f629ed5f7b 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -485,19 +485,19 @@ impl<'a> BlockContext<'a> { #[derive(Clone, Copy)] pub struct NMVComponent { - pub classes_cdf: [u16; MV_CLASSES + 1], - pub class0_fp_cdf: [[u16; MV_FP_SIZE + 1]; CLASS0_SIZE], - pub fp_cdf: [u16; MV_FP_SIZE + 1], - pub sign_cdf: [u16; 2 + 1], - pub class0_hp_cdf: [u16; 2 + 1], - pub hp_cdf: [u16; 2 + 1], - pub class0_cdf: [u16; CLASS0_SIZE + 1], - pub bits_cdf: [[u16; 2 + 1]; MV_OFFSET_BITS], + pub classes_cdf: [u16; MV_CLASSES], + pub class0_fp_cdf: [[u16; MV_FP_SIZE]; CLASS0_SIZE], + pub fp_cdf: [u16; MV_FP_SIZE], + pub sign_cdf: [u16; 2], + pub class0_hp_cdf: [u16; 2], + pub hp_cdf: [u16; 2], + pub class0_cdf: [u16; CLASS0_SIZE], + pub bits_cdf: [[u16; 2]; MV_OFFSET_BITS], } #[derive(Clone, Copy)] pub struct NMVContext { - pub joints_cdf: [u16; MV_JOINTS + 1], + pub joints_cdf: [u16; MV_JOINTS], pub comps: [NMVComponent; 2], } @@ -625,7 +625,7 @@ impl IndexMut for FrameBlocks { impl<'a> ContextWriter<'a> { pub fn get_cdf_intra_mode_kf( &self, bo: TileBlockOffset, - ) -> &[u16; INTRA_MODES + 1] { + ) -> &[u16; INTRA_MODES] { static 
intra_mode_context: [usize; INTRA_MODES] = [0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0]; let above_mode = if bo.0.y > 0 { @@ -664,9 +664,7 @@ impl<'a> ContextWriter<'a> { symbol_with_update!(self, w, mode as u32, cdf); } - pub fn get_cdf_intra_mode( - &self, bsize: BlockSize, - ) -> &[u16; INTRA_MODES + 1] { + pub fn get_cdf_intra_mode(&self, bsize: BlockSize) -> &[u16; INTRA_MODES] { &self.fc.y_mode_cdf[size_group_lookup[bsize as usize] as usize] } @@ -689,7 +687,7 @@ impl<'a> ContextWriter<'a> { if bs.cfl_allowed() { symbol_with_update!(self, w, uv_mode as u32, cdf); } else { - symbol_with_update!(self, w, uv_mode as u32, &mut cdf[..UV_INTRA_MODES]); + symbol_with_update!(self, w, uv_mode as u32, &mut cdf[..INTRA_MODES]); } } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 016881d583..201bf65b96 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -9,70 +9,67 @@ use super::*; -const CDF_LEN_MAX: usize = 16 + 1; +const CDF_LEN_MAX: usize = 16; #[derive(Clone, Copy)] #[repr(C)] pub struct CDFContext { - pub partition_cdf: [[u16; EXT_PARTITION_TYPES + 1]; PARTITION_CONTEXTS], - pub kf_y_cdf: [[[u16; INTRA_MODES + 1]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], - pub y_mode_cdf: [[u16; INTRA_MODES + 1]; BLOCK_SIZE_GROUPS], - pub uv_mode_cdf: [[[u16; UV_INTRA_MODES + 1]; INTRA_MODES]; 2], - pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS + 1], - pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE + 1]; CFL_ALPHA_CONTEXTS], - pub newmv_cdf: [[u16; 2 + 1]; NEWMV_MODE_CONTEXTS], - pub zeromv_cdf: [[u16; 2 + 1]; GLOBALMV_MODE_CONTEXTS], - pub refmv_cdf: [[u16; 2 + 1]; REFMV_MODE_CONTEXTS], - pub intra_tx_cdf: [[[[u16; TX_TYPES + 1]; INTRA_MODES]; - TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], - pub inter_tx_cdf: - [[[u16; TX_TYPES + 1]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], - pub tx_size_cdf: - [[[u16; MAX_TX_DEPTH + 1 + 1]; TX_SIZE_CONTEXTS]; MAX_TX_CATS], - pub txfm_partition_cdf: [[u16; 2 + 1]; TXFM_PARTITION_CONTEXTS], - pub skip_cdfs: [[u16; 3]; SKIP_CONTEXTS], - pub intra_inter_cdfs: [[u16; 3]; INTRA_INTER_CONTEXTS], - pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1 + 1]; DIRECTIONAL_MODES], - pub filter_intra_cdfs: [[u16; 3]; BlockSize::BLOCK_SIZES_ALL], + pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; PARTITION_CONTEXTS], + pub kf_y_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], + pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], + pub uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2], + pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS], + pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], + pub newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS], + pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], + pub refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS], + pub intra_tx_cdf: + [[[[u16; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], + pub inter_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], + pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; MAX_TX_CATS], + pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], + pub skip_cdfs: [[u16; 2]; SKIP_CONTEXTS], + pub intra_inter_cdfs: [[u16; 2]; INTRA_INTER_CONTEXTS], + pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES], + pub filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL], pub palette_y_mode_cdfs: - [[[u16; 3]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS], - pub palette_uv_mode_cdfs: [[u16; 3]; PALETTE_UV_MODE_CONTEXTS], - pub comp_mode_cdf: [[u16; 3]; COMP_INTER_CONTEXTS], - pub comp_ref_type_cdf: [[u16; 3]; 
COMP_REF_TYPE_CONTEXTS], - pub comp_ref_cdf: [[[u16; 3]; FWD_REFS - 1]; REF_CONTEXTS], - pub comp_bwd_ref_cdf: [[[u16; 3]; BWD_REFS - 1]; REF_CONTEXTS], - pub single_ref_cdfs: [[[u16; 2 + 1]; SINGLE_REFS - 1]; REF_CONTEXTS], - pub drl_cdfs: [[u16; 2 + 1]; DRL_MODE_CONTEXTS], - pub compound_mode_cdf: - [[u16; INTER_COMPOUND_MODES + 1]; INTER_MODE_CONTEXTS], + [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS], + pub palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS], + pub comp_mode_cdf: [[u16; 2]; COMP_INTER_CONTEXTS], + pub comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS], + pub comp_ref_cdf: [[[u16; 2]; FWD_REFS - 1]; REF_CONTEXTS], + pub comp_bwd_ref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS], + pub single_ref_cdfs: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS], + pub drl_cdfs: [[u16; 2]; DRL_MODE_CONTEXTS], + pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], pub nmv_context: NMVContext, - pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1 + 1]; FRAME_LF_COUNT], - pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1 + 1], - pub spatial_segmentation_cdfs: [[u16; 8 + 1]; 3], - pub lrf_switchable_cdf: [u16; 3 + 1], - pub lrf_sgrproj_cdf: [u16; 2 + 1], - pub lrf_wiener_cdf: [u16; 2 + 1], + pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT], + pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1], + pub spatial_segmentation_cdfs: [[u16; 8]; 3], + pub lrf_switchable_cdf: [u16; 3], + pub lrf_sgrproj_cdf: [u16; 2], + pub lrf_wiener_cdf: [u16; 2], // lv_map - pub txb_skip_cdf: [[[u16; 3]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES], - pub dc_sign_cdf: [[[u16; 3]; DC_SIGN_CONTEXTS]; PLANE_TYPES], + pub txb_skip_cdf: [[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES], + pub dc_sign_cdf: [[[u16; 2]; DC_SIGN_CONTEXTS]; PLANE_TYPES], pub eob_extra_cdf: - [[[[u16; 3]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + [[[[u16; 2]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], - pub eob_flag_cdf16: [[[u16; 5 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf32: [[[u16; 6 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf64: [[[u16; 7 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf128: [[[u16; 8 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf256: [[[u16; 9 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf512: [[[u16; 10 + 1]; 2]; PLANE_TYPES], - pub eob_flag_cdf1024: [[[u16; 11 + 1]; 2]; PLANE_TYPES], + pub eob_flag_cdf16: [[[u16; 5]; 2]; PLANE_TYPES], + pub eob_flag_cdf32: [[[u16; 6]; 2]; PLANE_TYPES], + pub eob_flag_cdf64: [[[u16; 7]; 2]; PLANE_TYPES], + pub eob_flag_cdf128: [[[u16; 8]; 2]; PLANE_TYPES], + pub eob_flag_cdf256: [[[u16; 9]; 2]; PLANE_TYPES], + pub eob_flag_cdf512: [[[u16; 10]; 2]; PLANE_TYPES], + pub eob_flag_cdf1024: [[[u16; 11]; 2]; PLANE_TYPES], pub coeff_base_eob_cdf: - [[[[u16; 3 + 1]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], + [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], pub coeff_base_cdf: - [[[[u16; 4 + 1]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], - pub coeff_br_cdf: [[[[u16; BR_CDF_SIZE + 1]; LEVEL_CONTEXTS]; PLANE_TYPES]; - TxSize::TX_SIZES], + [[[[u16; 4]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub coeff_br_cdf: + [[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], padding: [u16; CDF_LEN_MAX], } @@ -143,9 +140,10 @@ impl CDFContext { pub fn reset_counts(&mut self) { macro_rules! reset_1d { - ($field:expr) => {{ - $field[$field.len() - 2] = 0; - }}; + ($field:expr) => { + let r = $field.last_mut().unwrap(); + *r = 0; + }; } macro_rules! 
reset_2d { ($field:expr) => { @@ -603,7 +601,7 @@ impl<'a> ContextWriter<'a> { pub fn cdf_element_prob(cdf: &[u16], element: usize) -> u16 { (if element > 0 { cdf[element - 1] } else { 32768 }) - - (if element + 2 < cdf.len() { cdf[element] } else { 0 }) + - (if element + 1 < cdf.len() { cdf[element] } else { 0 }) } pub fn checkpoint(&self) -> ContextWriterCheckpoint { diff --git a/src/context/frame_header.rs b/src/context/frame_header.rs index 742f967298..36dbd88697 100644 --- a/src/context/frame_header.rs +++ b/src/context/frame_header.rs @@ -15,18 +15,15 @@ impl CDFContext { &self, w: &dyn Writer, rs: &TileRestorationState, filter: RestorationFilter, pli: usize, ) -> u32 { - let nsym = &self.lrf_switchable_cdf.len() - 1; match filter { - RestorationFilter::None => { - w.symbol_bits(0, &self.lrf_switchable_cdf[..nsym]) - } + RestorationFilter::None => w.symbol_bits(0, &self.lrf_switchable_cdf), RestorationFilter::Wiener { .. } => { unreachable!() // for now, not permanently } RestorationFilter::Sgrproj { set, xqd } => { // Does *not* use 'RESTORE_SGRPROJ' but rather just '2' let rp = &rs.planes[pli]; - let mut bits = w.symbol_bits(2, &self.lrf_switchable_cdf[..nsym]) + let mut bits = w.symbol_bits(2, &self.lrf_switchable_cdf) + ((SGRPROJ_PARAMS_BITS as u32) << OD_BITRES); for i in 0..2 { let s = SGRPROJ_PARAMS_S[set as usize][i]; diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index 3c360a447a..471a73b8f3 100644 --- a/src/context/partition_unit.rs +++ b/src/context/partition_unit.rs @@ -307,7 +307,7 @@ impl<'a> ContextWriter<'a> { let ctx = self.bc.partition_plane_context(bo, bsize); assert!(ctx < PARTITION_CONTEXTS); let partition_cdf = if bsize <= BlockSize::BLOCK_8X8 { - &mut self.fc.partition_cdf[ctx][..=PARTITION_TYPES] + &mut self.fc.partition_cdf[ctx][..PARTITION_TYPES] } else { &mut self.fc.partition_cdf[ctx] }; diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 330113fad2..3d8d4cf203 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -543,7 +543,7 @@ impl<'a> ContextWriter<'a> { w, av1_tx_ind[tx_set as usize][tx_type as usize] as u32, &mut self.fc.inter_tx_cdf[tx_set_index as usize] - [square_tx_size as usize][..=num_tx_set[tx_set as usize]] + [square_tx_size as usize][..num_tx_set[tx_set as usize]] ); } else { let intra_dir = y_mode; @@ -557,7 +557,7 @@ impl<'a> ContextWriter<'a> { av1_tx_ind[tx_set as usize][tx_type as usize] as u32, &mut self.fc.intra_tx_cdf[tx_set_index as usize] [square_tx_size as usize][intra_dir as usize] - [..=num_tx_set[tx_set as usize]] + [..num_tx_set[tx_set as usize]] ); } } @@ -651,7 +651,7 @@ impl<'a> ContextWriter<'a> { self, w, depth as u32, - &mut self.fc.tx_size_cdf[tx_size_cat][tx_size_ctx][..=max_depths + 1] + &mut self.fc.tx_size_cdf[tx_size_cat][tx_size_ctx][..=max_depths] ); } diff --git a/src/ec.rs b/src/ec.rs index 43a16df589..870be98b26 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -544,7 +544,6 @@ where &mut self, s: u32, cdf: &mut [u16], log: &mut crate::context::CDFContextLog, ) { - let nsymbs = cdf.len() - 1; #[cfg(feature = "desync_finder")] { if self.debug { @@ -552,7 +551,7 @@ where } } log.push(cdf); - self.symbol(s, &cdf[..nsymbs]); + self.symbol(s, cdf); update_cdf(cdf, s); } @@ -890,7 +889,7 @@ impl BCodeWriter for BitWriter { pub(crate) mod rust { // Function to update the CDF for Writer calls that do so. 
pub fn update_cdf(cdf: &mut [u16], val: u32) { - let nsymbs = cdf.len() - 1; + let nsymbs = cdf.len(); let rate = 3 + (nsymbs >> 1).min(2) + (cdf[nsymbs - 1] >> 4) as usize; cdf[nsymbs - 1] += 1 - (cdf[nsymbs - 1] >> 5); diff --git a/src/util/cdf.rs b/src/util/cdf.rs index 1bcca687de..d81a153d11 100644 --- a/src/util/cdf.rs +++ b/src/util/cdf.rs @@ -10,11 +10,11 @@ // TODO: Nice to have (although I wasn't able to find a way to do it yet in rust): zero-fill arrays that are // shorter than required. Need const fn (Rust Issue #24111) or const generics (Rust RFC #2000) macro_rules! cdf { - ($($x:expr),+) => {[$(32768 - $x),+, 0, 0]} + ($($x:expr),+) => {[$(32768 - $x),+, 0]} } macro_rules! cdf_size { ($x:expr) => { - $x + 1 + $x }; } From b53c53c001f08472573e37c8115cf9028ba88562 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sun, 7 Feb 2021 16:07:56 +0900 Subject: [PATCH 007/155] Remove the cdf_size macro now that it is a no-op --- src/entropymode.rs | 143 ++++++++++++++++++++------------------------- src/token_cdfs.rs | 49 ++++++++-------- src/util/cdf.rs | 6 -- 3 files changed, 85 insertions(+), 113 deletions(-) diff --git a/src/entropymode.rs b/src/entropymode.rs index ab45482278..87f6c3d0c0 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -37,8 +37,8 @@ pub const TXFM_PARTITION_CONTEXTS: usize = 21; // (TxSize::TX_SIZES - TxSize::TX // LUTS --------------------- -pub static default_kf_y_mode_cdf: [[[u16; cdf_size!(INTRA_MODES)]; - KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS] = [ +pub static default_kf_y_mode_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; + KF_MODE_CONTEXTS] = [ [ cdf!( 15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, 24189, 28165, @@ -151,8 +151,7 @@ pub static default_kf_y_mode_cdf: [[[u16; cdf_size!(INTRA_MODES)]; ], ]; -pub static default_angle_delta_cdf: [[u16; - cdf_size!(2 * MAX_ANGLE_DELTA + 1)]; +pub static default_angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES] = [ cdf!(2180, 5032, 7567, 22776, 26989, 30217), cdf!(2301, 5608, 8801, 23487, 26974, 30330), @@ -164,8 +163,7 @@ pub static default_angle_delta_cdf: [[u16; cdf!(3605, 10428, 12459, 17676, 21244, 30655), ]; -pub static default_if_y_mode_cdf: [[u16; cdf_size!(INTRA_MODES)]; - BLOCK_SIZE_GROUPS] = [ +pub static default_if_y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS] = [ cdf!( 22801, 23489, 24293, 24756, 25601, 26123, 26606, 27418, 27945, 29228, 29685, 30349 @@ -184,8 +182,7 @@ pub static default_if_y_mode_cdf: [[u16; cdf_size!(INTRA_MODES)]; ), ]; -pub static default_uv_mode_cdf: [[[u16; cdf_size!(UV_INTRA_MODES)]; - INTRA_MODES]; 2] = [ +pub static default_uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2] = [ [ cdf!( 22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, 28244, 30059, @@ -296,7 +293,7 @@ pub static default_uv_mode_cdf: [[[u16; cdf_size!(UV_INTRA_MODES)]; ], ]; -pub const default_partition_cdf: [[u16; cdf_size!(EXT_PARTITION_TYPES)]; +pub const default_partition_cdf: [[u16; EXT_PARTITION_TYPES]; PARTITION_CONTEXTS] = [ cdf!(19132, 25510, 30392, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), cdf!(13928, 19855, 28540, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), @@ -320,10 +317,9 @@ pub const default_partition_cdf: [[u16; cdf_size!(EXT_PARTITION_TYPES)]; cdf!(711, 966, 1172, 32448, 32538, 32617, 32664, CDFMAX, CDFMAX), ]; -pub static default_intra_ext_tx_cdf: [[[[u16; cdf_size!(TX_TYPES)]; - INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; - TX_SETS_INTRA] = [ - [[[0; cdf_size!(TX_TYPES)]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], +pub static 
default_intra_ext_tx_cdf: [[[[u16; TX_TYPES]; INTRA_MODES]; + TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA] = [ + [[[0; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], [ [ cdf!( @@ -762,9 +758,9 @@ pub static default_intra_ext_tx_cdf: [[[[u16; cdf_size!(TX_TYPES)]; ], ]; -pub static default_inter_ext_tx_cdf: [[[u16; cdf_size!(TX_TYPES)]; +pub static default_inter_ext_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER] = [ - [[0; cdf_size!(TX_TYPES)]; TX_SIZE_SQR_CONTEXTS], + [[0; TX_TYPES]; TX_SIZE_SQR_CONTEXTS], [ cdf!( 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, 22848, 23934, @@ -821,10 +817,10 @@ pub static default_inter_ext_tx_cdf: [[[u16; cdf_size!(TX_TYPES)]; ], ]; -pub static default_cfl_sign_cdf: [u16; cdf_size!(CFL_JOINT_SIGNS)] = +pub static default_cfl_sign_cdf: [u16; CFL_JOINT_SIGNS] = cdf!(1418, 2123, 13340, 18405, 26972, 28343, 32294); -pub static default_cfl_alpha_cdf: [[u16; cdf_size!(CFL_ALPHABET_SIZE)]; +pub static default_cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS] = [ cdf!( 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, 32704, @@ -857,8 +853,7 @@ const SWITCHABLE_FILTERS: usize = 3; const SWITCHABLE_FILTER_CONTEXTS: usize = (SWITCHABLE_FILTERS + 1) * 4; #[allow(unused)] -pub static default_switchable_interp_cdf: [[u16; - cdf_size!(SWITCHABLE_FILTERS)]; +pub static default_switchable_interp_cdf: [[u16; SWITCHABLE_FILTERS]; SWITCHABLE_FILTER_CONTEXTS] = [ cdf!(31935, 32720), cdf!(5568, 32719), @@ -878,20 +873,20 @@ pub static default_switchable_interp_cdf: [[u16; cdf!(14969, 21398), ]; -pub static default_newmv_cdf: [[u16; cdf_size!(2)]; NEWMV_MODE_CONTEXTS] = [ +pub static default_newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS] = [ cdf!(24035), cdf!(16630), cdf!(15339), cdf!(8386), cdf!(12222), cdf!(4676), - [0; cdf_size!(2)], + [0; 2], ]; -pub static default_zeromv_cdf: [[u16; cdf_size!(2)]; GLOBALMV_MODE_CONTEXTS] = +pub static default_zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS] = [cdf!(2175), cdf!(1054)]; -pub static default_refmv_cdf: [[u16; cdf_size!(2)]; REFMV_MODE_CONTEXTS] = [ +pub static default_refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS] = [ cdf!(23974), cdf!(24188), cdf!(17848), @@ -900,11 +895,10 @@ pub static default_refmv_cdf: [[u16; cdf_size!(2)]; REFMV_MODE_CONTEXTS] = [ cdf!(19923), ]; -pub static default_drl_cdf: [[u16; cdf_size!(2)]; DRL_MODE_CONTEXTS] = +pub static default_drl_cdf: [[u16; 2]; DRL_MODE_CONTEXTS] = [cdf!(13104), cdf!(24560), cdf!(18945)]; -pub static default_compound_mode_cdf: [[u16; - cdf_size!(INTER_COMPOUND_MODES)]; +pub static default_compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS] = [ cdf!(7760, 13823, 15808, 17641, 19156, 20666, 26891), cdf!(10730, 19452, 21145, 22749, 24039, 25131, 28724), @@ -917,12 +911,12 @@ pub static default_compound_mode_cdf: [[u16; ]; #[allow(unused)] -pub static default_interintra_cdf: [[u16; cdf_size!(2)]; BLOCK_SIZE_GROUPS] = +pub static default_interintra_cdf: [[u16; 2]; BLOCK_SIZE_GROUPS] = [cdf!(16384), cdf!(26887), cdf!(27597), cdf!(30237)]; #[allow(unused)] pub static default_interintra_mode_cdf: [[u16; - cdf_size!(InterIntraMode::INTERINTRA_MODES as usize)]; + InterIntraMode::INTERINTRA_MODES as usize]; BLOCK_SIZE_GROUPS as usize] = [ cdf!(8192, 16384, 24576), cdf!(1875, 11082, 27332), @@ -931,7 +925,7 @@ pub static default_interintra_mode_cdf: [[u16; ]; #[allow(unused)] -pub static default_wedge_interintra_cdf: [[u16; cdf_size!(2)]; +pub static default_wedge_interintra_cdf: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = [ 
cdf!(16384), cdf!(16384), @@ -959,7 +953,7 @@ pub static default_wedge_interintra_cdf: [[u16; cdf_size!(2)]; #[allow(unused)] pub static default_compound_type_cdf: [[u16; - cdf_size!(CompoundType::COMPOUND_TYPES as usize - 1)]; + CompoundType::COMPOUND_TYPES as usize - 1]; BlockSize::BLOCK_SIZES_ALL as usize] = [ cdf!(16384), cdf!(16384), @@ -986,8 +980,7 @@ pub static default_compound_type_cdf: [[u16; ]; #[allow(unused)] -pub static default_wedge_idx_cdf: [[u16; cdf_size!(16)]; - BlockSize::BLOCK_SIZES_ALL] = [ +pub static default_wedge_idx_cdf: [[u16; 16]; BlockSize::BLOCK_SIZES_ALL] = [ cdf!( 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720 @@ -1080,7 +1073,7 @@ pub static default_wedge_idx_cdf: [[u16; cdf_size!(16)]; #[allow(unused)] pub static default_motion_mode_cdf: [[u16; - cdf_size!(MotionMode::MOTION_MODES as usize)]; + MotionMode::MOTION_MODES as usize]; BlockSize::BLOCK_SIZES_ALL as usize] = [ cdf!(10923, 21845), cdf!(10923, 21845), @@ -1107,8 +1100,7 @@ pub static default_motion_mode_cdf: [[u16; ]; #[allow(unused)] -pub static default_obmc_cdf: [[u16; cdf_size!(2)]; - BlockSize::BLOCK_SIZES_ALL] = [ +pub static default_obmc_cdf: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = [ cdf!(16384), cdf!(16384), cdf!(16384), @@ -1133,26 +1125,24 @@ pub static default_obmc_cdf: [[u16; cdf_size!(2)]; cdf!(26879), ]; -pub static default_intra_inter_cdf: [[u16; cdf_size!(2)]; - INTRA_INTER_CONTEXTS] = [cdf!(806), cdf!(16662), cdf!(20186), cdf!(26538)]; +pub static default_intra_inter_cdf: [[u16; 2]; INTRA_INTER_CONTEXTS] = + [cdf!(806), cdf!(16662), cdf!(20186), cdf!(26538)]; -pub static default_comp_mode_cdf: [[u16; cdf_size!(2)]; COMP_INTER_CONTEXTS] = +pub static default_comp_mode_cdf: [[u16; 2]; COMP_INTER_CONTEXTS] = [cdf!(26828), cdf!(24035), cdf!(12031), cdf!(10640), cdf!(2901)]; -pub static default_comp_ref_type_cdf: [[u16; cdf_size!(2)]; - COMP_REF_TYPE_CONTEXTS] = +pub static default_comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS] = [cdf!(1198), cdf!(2070), cdf!(9166), cdf!(7499), cdf!(22475)]; #[allow(unused)] -pub static default_uni_comp_ref_cdf: [[[u16; cdf_size!(2)]; - UNIDIR_COMP_REFS - 1]; +pub static default_uni_comp_ref_cdf: [[[u16; 2]; UNIDIR_COMP_REFS - 1]; UNI_COMP_REF_CONTEXTS] = [ [cdf!(5284), cdf!(3865), cdf!(3128)], [cdf!(23152), cdf!(14173), cdf!(15270)], [cdf!(31774), cdf!(25120), cdf!(26710)], ]; -pub static default_single_ref_cdf: [[[u16; cdf_size!(2)]; SINGLE_REFS - 1]; +pub static default_single_ref_cdf: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS] = [ [cdf!(4897), cdf!(1555), cdf!(4236), cdf!(8650), cdf!(904), cdf!(1444)], [ @@ -1173,15 +1163,13 @@ pub static default_single_ref_cdf: [[[u16; cdf_size!(2)]; SINGLE_REFS - 1]; ], ]; -pub static default_comp_ref_cdf: [[[u16; cdf_size!(2)]; FWD_REFS - 1]; - REF_CONTEXTS] = [ +pub static default_comp_ref_cdf: [[[u16; 2]; FWD_REFS - 1]; REF_CONTEXTS] = [ [cdf!(4946), cdf!(9468), cdf!(1503)], [cdf!(19891), cdf!(22441), cdf!(15160)], [cdf!(30731), cdf!(31059), cdf!(27544)], ]; -pub static default_comp_bwdref_cdf: [[[u16; cdf_size!(2)]; BWD_REFS - 1]; - REF_CONTEXTS] = [ +pub static default_comp_bwdref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS] = [ [cdf!(2235), cdf!(1423)], [cdf!(17182), cdf!(15175)], [cdf!(30606), cdf!(30489)], @@ -1189,7 +1177,7 @@ pub static default_comp_bwdref_cdf: [[[u16; cdf_size!(2)]; BWD_REFS - 1]; #[allow(unused)] pub static default_palette_y_size_cdf: [[u16; - cdf_size!(PaletteSize::PALETTE_SIZES as usize)]; + PaletteSize::PALETTE_SIZES as 
usize]; PALETTE_BSIZE_CTXS] = [ cdf!(7952, 13000, 18149, 21478, 25527, 29241), cdf!(7139, 11421, 16195, 19544, 23666, 28073), @@ -1202,7 +1190,7 @@ pub static default_palette_y_size_cdf: [[u16; #[allow(unused)] pub static default_palette_uv_size_cdf: [[u16; - cdf_size!(PaletteSize::PALETTE_SIZES as usize)]; + PaletteSize::PALETTE_SIZES as usize]; PALETTE_BSIZE_CTXS] = [ cdf!(8713, 19979, 27128, 29609, 31331, 32272), cdf!(5839, 15573, 23581, 26947, 29848, 31700), @@ -1213,8 +1201,7 @@ pub static default_palette_uv_size_cdf: [[u16; cdf!(1269, 5435, 10433, 18963, 21700, 25865), ]; -pub static default_palette_y_mode_cdfs: [[[u16; cdf_size!(2)]; - PALETTE_Y_MODE_CONTEXTS]; +pub static default_palette_y_mode_cdfs: [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS] = [ [cdf!(31676), cdf!(3419), cdf!(1261)], [cdf!(31912), cdf!(2859), cdf!(980)], @@ -1225,12 +1212,12 @@ pub static default_palette_y_mode_cdfs: [[[u16; cdf_size!(2)]; [cdf!(32450), cdf!(7946), cdf!(129)], ]; -pub static default_palette_uv_mode_cdfs: [[u16; cdf_size!(2)]; - PALETTE_UV_MODE_CONTEXTS] = [cdf!(32461), cdf!(21488)]; +pub static default_palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS] = + [cdf!(32461), cdf!(21488)]; #[allow(unused)] pub static default_palette_y_color_index_cdf: [[[u16; - cdf_size!(PaletteColor::PALETTE_COLORS as usize)]; + PaletteColor::PALETTE_COLORS as usize]; PALETTE_COLOR_INDEX_CONTEXTS]; PaletteSize::PALETTE_SIZES as usize] = [ [ @@ -1286,7 +1273,7 @@ pub static default_palette_y_color_index_cdf: [[[u16; #[allow(unused)] pub static default_palette_uv_color_index_cdf: [[[u16; - cdf_size!(PaletteColor::PALETTE_COLORS as usize)]; + PaletteColor::PALETTE_COLORS as usize]; PALETTE_COLOR_INDEX_CONTEXTS]; PaletteSize::PALETTE_SIZES as usize] = [ [ @@ -1340,8 +1327,7 @@ pub static default_palette_uv_color_index_cdf: [[[u16; ], ]; -pub static default_txfm_partition_cdf: [[u16; cdf_size!(2)]; - TXFM_PARTITION_CONTEXTS] = [ +pub static default_txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS] = [ cdf!(28581), cdf!(23846), cdf!(20847), @@ -1365,21 +1351,19 @@ pub static default_txfm_partition_cdf: [[u16; cdf_size!(2)]; cdf!(16088), ]; -pub static default_skip_cdfs: [[u16; cdf_size!(2)]; SKIP_CONTEXTS] = +pub static default_skip_cdfs: [[u16; 2]; SKIP_CONTEXTS] = [cdf!(31671), cdf!(16515), cdf!(4576)]; #[allow(unused)] -pub static default_skip_mode_cdfs: [[u16; cdf_size!(2)]; SKIP_MODE_CONTEXTS] = +pub static default_skip_mode_cdfs: [[u16; 2]; SKIP_MODE_CONTEXTS] = [cdf!(32621), cdf!(20708), cdf!(8127)]; #[allow(unused)] -pub static default_compound_idx_cdfs: [[u16; cdf_size!(2)]; - COMP_INDEX_CONTEXTS] = +pub static default_compound_idx_cdfs: [[u16; 2]; COMP_INDEX_CONTEXTS] = [cdf!(18244), cdf!(12865), cdf!(7053), cdf!(13259), cdf!(9334), cdf!(4644)]; #[allow(unused)] -pub static default_comp_group_idx_cdfs: [[u16; cdf_size!(2)]; - COMP_GROUP_IDX_CONTEXTS] = [ +pub static default_comp_group_idx_cdfs: [[u16; 2]; COMP_GROUP_IDX_CONTEXTS] = [ cdf!(26607), cdf!(22891), cdf!(18840), @@ -1389,15 +1373,14 @@ pub static default_comp_group_idx_cdfs: [[u16; cdf_size!(2)]; ]; #[allow(unused)] -pub static default_intrabc_cdf: [u16; cdf_size!(2)] = cdf!(30531); +pub static default_intrabc_cdf: [u16; 2] = cdf!(30531); #[allow(unused)] pub static default_filter_intra_mode_cdf: [u16; - cdf_size!(FilterIntraMode::FILTER_INTRA_MODES as usize)] = + FilterIntraMode::FILTER_INTRA_MODES as usize] = cdf!(8949, 12776, 17211, 29558); -pub static default_filter_intra_cdfs: [[u16; cdf_size!(2)]; - 
BlockSize::BLOCK_SIZES_ALL] = [ +pub static default_filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = [ cdf!(4621), cdf!(6743), cdf!(5893), @@ -1422,18 +1405,18 @@ pub static default_filter_intra_cdfs: [[u16; cdf_size!(2)]; cdf!(16384), ]; -pub static default_switchable_restore_cdf: [u16; - cdf_size!(RESTORE_SWITCHABLE_TYPES)] = cdf!(9413, 22581); +pub static default_switchable_restore_cdf: [u16; RESTORE_SWITCHABLE_TYPES] = + cdf!(9413, 22581); -pub static default_wiener_restore_cdf: [u16; cdf_size!(2)] = cdf!(11570); +pub static default_wiener_restore_cdf: [u16; 2] = cdf!(11570); -pub static default_sgrproj_restore_cdf: [u16; cdf_size!(2)] = cdf!(16855); +pub static default_sgrproj_restore_cdf: [u16; 2] = cdf!(16855); #[allow(unused)] -pub static default_delta_q_cdf: [u16; cdf_size!(DELTA_Q_PROBS + 1)] = +pub static default_delta_q_cdf: [u16; DELTA_Q_PROBS + 1] = cdf!(28160, 32120, 32677); -pub static default_delta_lf_multi_cdf: [[u16; cdf_size!(DELTA_LF_PROBS + 1)]; +pub static default_delta_lf_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT] = [ cdf!(28160, 32120, 32677), cdf!(28160, 32120, 32677), @@ -1441,29 +1424,27 @@ pub static default_delta_lf_multi_cdf: [[u16; cdf_size!(DELTA_LF_PROBS + 1)]; cdf!(28160, 32120, 32677), ]; -pub static default_delta_lf_cdf: [u16; cdf_size!(DELTA_LF_PROBS + 1)] = +pub static default_delta_lf_cdf: [u16; DELTA_LF_PROBS + 1] = cdf!(28160, 32120, 32677); // FIXME(someone) need real defaults here #[allow(unused)] -pub static default_seg_tree_cdf: [u16; cdf_size!(MAX_SEGMENTS)] = +pub static default_seg_tree_cdf: [u16; MAX_SEGMENTS] = cdf!(4096, 8192, 12288, 16384, 20480, 24576, 28672); #[allow(unused)] -pub static default_segment_pred_cdf: [[u16; cdf_size!(2)]; - SEG_TEMPORAL_PRED_CTXS] = +pub static default_segment_pred_cdf: [[u16; 2]; SEG_TEMPORAL_PRED_CTXS] = [cdf!(128 * 128), cdf!(128 * 128), cdf!(128 * 128)]; -pub static default_spatial_pred_seg_tree_cdf: [[u16; - cdf_size!(MAX_SEGMENTS)]; +pub static default_spatial_pred_seg_tree_cdf: [[u16; MAX_SEGMENTS]; SPATIAL_PREDICTION_PROBS] = [ cdf!(5622, 7893, 16093, 18233, 27809, 28373, 32533), cdf!(14274, 18230, 22557, 24935, 29980, 30851, 32344), cdf!(27527, 28487, 28723, 28890, 32397, 32647, 32679), ]; -pub static default_tx_size_cdf: [[[u16; cdf_size!(MAX_TX_DEPTH + 1)]; - TX_SIZE_CONTEXTS]; MAX_TX_CATS] = [ +pub static default_tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; + MAX_TX_CATS] = [ [cdf!(19968, CDFMAX), cdf!(19968, CDFMAX), cdf!(24320, CDFMAX)], [cdf!(12272, 30172), cdf!(12272, 30172), cdf!(18677, 30848)], [cdf!(12986, 15180), cdf!(12986, 15180), cdf!(24302, 25602)], diff --git a/src/token_cdfs.rs b/src/token_cdfs.rs index 370f9b4a9d..980553e6e1 100644 --- a/src/token_cdfs.rs +++ b/src/token_cdfs.rs @@ -14,9 +14,8 @@ use crate::transform::*; const TOKEN_CDF_Q_CTXS: usize = 4; -pub static av1_default_dc_sign_cdfs: [[[[u16; cdf_size!(2)]; - DC_SIGN_CONTEXTS]; PLANE_TYPES]; - TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_dc_sign_cdfs: [[[[u16; 2]; DC_SIGN_CONTEXTS]; + PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ [ [cdf!(128 * 125), cdf!(128 * 102), cdf!(128 * 147)], [cdf!(128 * 119), cdf!(128 * 101), cdf!(128 * 135)], @@ -35,8 +34,7 @@ pub static av1_default_dc_sign_cdfs: [[[[u16; cdf_size!(2)]; ], ]; -pub static av1_default_txb_skip_cdfs: [[[[u16; cdf_size!(2)]; - TXB_SKIP_CONTEXTS]; +pub static av1_default_txb_skip_cdfs: [[[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = [ [ [ @@ -348,9 +346,9 @@ pub static av1_default_txb_skip_cdfs: [[[[u16; 
cdf_size!(2)]; ], ]; -pub static av1_default_eob_extra_cdfs: [[[[[u16; cdf_size!(2)]; - EOB_COEF_CONTEXTS]; PLANE_TYPES]; - TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_extra_cdfs: [[[[[u16; 2]; EOB_COEF_CONTEXTS]; + PLANE_TYPES]; TxSize::TX_SIZES]; + TOKEN_CDF_Q_CTXS] = [ [ [ [ @@ -841,8 +839,8 @@ pub static av1_default_eob_extra_cdfs: [[[[[u16; cdf_size!(2)]; ], ]; -pub static av1_default_eob_multi16_cdfs: [[[[u16; cdf_size!(5)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi16_cdfs: [[[[u16; 5]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [cdf!(840, 1039, 1980, 4895), cdf!(370, 671, 1883, 4471)], [cdf!(3247, 4950, 9688, 14563), cdf!(1904, 3354, 7763, 14647)], @@ -861,8 +859,8 @@ pub static av1_default_eob_multi16_cdfs: [[[[u16; cdf_size!(5)]; 2]; ], ]; -pub static av1_default_eob_multi32_cdfs: [[[[u16; cdf_size!(6)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi32_cdfs: [[[[u16; 6]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [cdf!(400, 520, 977, 2102, 6542), cdf!(210, 405, 1315, 3326, 7537)], [ @@ -896,8 +894,8 @@ pub static av1_default_eob_multi32_cdfs: [[[[u16; cdf_size!(6)]; 2]; ], ]; -pub static av1_default_eob_multi64_cdfs: [[[[u16; cdf_size!(7)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi64_cdfs: [[[[u16; 7]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [ cdf!(329, 498, 1101, 1784, 3265, 7758), @@ -940,8 +938,8 @@ pub static av1_default_eob_multi64_cdfs: [[[[u16; cdf_size!(7)]; 2]; ], ]; -pub static av1_default_eob_multi128_cdfs: [[[[u16; cdf_size!(8)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi128_cdfs: [[[[u16; 8]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [ cdf!(219, 482, 1140, 2091, 3680, 6028, 12586), @@ -984,8 +982,8 @@ pub static av1_default_eob_multi128_cdfs: [[[[u16; cdf_size!(8)]; 2]; ], ]; -pub static av1_default_eob_multi256_cdfs: [[[[u16; cdf_size!(9)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi256_cdfs: [[[[u16; 9]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [ cdf!(310, 584, 1887, 3589, 6168, 8611, 11352, 15652), @@ -1028,8 +1026,8 @@ pub static av1_default_eob_multi256_cdfs: [[[[u16; cdf_size!(9)]; 2]; ], ]; -pub static av1_default_eob_multi512_cdfs: [[[[u16; cdf_size!(10)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi512_cdfs: [[[[u16; 10]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [ cdf!(641, 983, 3707, 5430, 10234, 14958, 18788, 23412, 26061), @@ -1072,8 +1070,8 @@ pub static av1_default_eob_multi512_cdfs: [[[[u16; cdf_size!(10)]; 2]; ], ]; -pub static av1_default_eob_multi1024_cdfs: [[[[u16; cdf_size!(11)]; 2]; - PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = [ +pub static av1_default_eob_multi1024_cdfs: [[[[u16; 11]; 2]; PLANE_TYPES]; + TOKEN_CDF_Q_CTXS] = [ [ [ cdf!(393, 421, 751, 1623, 3160, 6352, 13345, 18047, 22571, 25830), @@ -1120,7 +1118,7 @@ pub static av1_default_eob_multi1024_cdfs: [[[[u16; cdf_size!(11)]; 2]; ], ]; -pub static av1_default_coeff_lps_multi_cdfs: [[[[[u16; cdf_size!(BR_CDF_SIZE)]; +pub static av1_default_coeff_lps_multi_cdfs: [[[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES]; @@ -2095,8 +2093,7 @@ pub static av1_default_coeff_lps_multi_cdfs: [[[[[u16; cdf_size!(BR_CDF_SIZE)]; ], ]; -pub static av1_default_coeff_base_multi_cdfs: [[[[[u16; - cdf_size!(NUM_BASE_LEVELS + 2)]; +pub static av1_default_coeff_base_multi_cdfs: [[[[[u16; NUM_BASE_LEVELS + 2]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES]; @@ -3912,7 
+3909,7 @@ pub static av1_default_coeff_base_multi_cdfs: [[[[[u16; ]; pub static av1_default_coeff_base_eob_multi_cdfs: [[[[[u16; - cdf_size!(NUM_BASE_LEVELS + 1)]; + NUM_BASE_LEVELS + 1]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES]; diff --git a/src/util/cdf.rs b/src/util/cdf.rs index d81a153d11..6a0b44ee16 100644 --- a/src/util/cdf.rs +++ b/src/util/cdf.rs @@ -12,9 +12,3 @@ macro_rules! cdf { ($($x:expr),+) => {[$(32768 - $x),+, 0]} } - -macro_rules! cdf_size { - ($x:expr) => { - $x - }; -} From 816a7c5509492e58792aec0c5063e630261f51c8 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sun, 7 Feb 2021 01:20:30 +0900 Subject: [PATCH 008/155] Reduce reads in CDFContextLog::rollback Hoist loading the vector pointer outside the loop, as we will not modify it while iterating. --- src/context/cdf_context.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 201bf65b96..daf2977321 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -554,12 +554,13 @@ impl CDFContextLog { let base = fc as *mut _ as *mut u8; let mut len = self.data.len(); unsafe { + let mut src = self.data.get_unchecked_mut(len).as_ptr(); while len > checkpoint { len -= 1; - let src = self.data.get_unchecked_mut(len); - let offset = src[CDF_LEN_MAX] as usize; + src = src.sub(CDF_LEN_MAX + 1); + let offset = *src.add(CDF_LEN_MAX) as usize; let dst = base.add(offset) as *mut u16; - dst.copy_from_nonoverlapping(src.as_ptr(), CDF_LEN_MAX); + dst.copy_from_nonoverlapping(src, CDF_LEN_MAX); } self.data.set_len(len); } From 91403722b5240c9c043d35c967ffdfac971c2c48 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 8 Feb 2021 02:16:56 +0900 Subject: [PATCH 009/155] Group fields of CDFContext by CDF size Ordering the CDFs by size ensures that however we split rollback write sizes, the writes will not overlap between regions. However, NMVContext requires padding as each ordered region does not contain any CDF of the global maximum size. --- src/context/block_unit.rs | 17 +++++-- src/context/cdf_context.rs | 93 ++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 47 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index f629ed5f7b..df7b4fe138 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -484,20 +484,28 @@ impl<'a> BlockContext<'a> { } #[derive(Clone, Copy)] +#[repr(C)] pub struct NMVComponent { - pub classes_cdf: [u16; MV_CLASSES], - pub class0_fp_cdf: [[u16; MV_FP_SIZE]; CLASS0_SIZE], - pub fp_cdf: [u16; MV_FP_SIZE], pub sign_cdf: [u16; 2], pub class0_hp_cdf: [u16; 2], pub hp_cdf: [u16; 2], pub class0_cdf: [u16; CLASS0_SIZE], pub bits_cdf: [[u16; 2]; MV_OFFSET_BITS], + + pub class0_fp_cdf: [[u16; MV_FP_SIZE]; CLASS0_SIZE], + pub fp_cdf: [u16; MV_FP_SIZE], + + pub classes_cdf: [u16; MV_CLASSES], + // MV_CLASSES + 5 == 16; pad the last CDF for rollback. + padding: [u16; 5], } #[derive(Clone, Copy)] +#[repr(C)] pub struct NMVContext { pub joints_cdf: [u16; MV_JOINTS], + // MV_JOINTS + 12 == 16; pad the last CDF for rollback. 
+ padding: [u16; 12], pub comps: [NMVComponent; 2], } @@ -505,6 +513,7 @@ pub struct NMVContext { pub static default_nmv_context: NMVContext = { NMVContext { joints_cdf: cdf!(4096, 11264, 19328), + padding: [0; 12], comps: [ NMVComponent { classes_cdf: cdf!( @@ -528,6 +537,7 @@ pub static default_nmv_context: NMVContext = { cdf!(128 * 234), cdf!(128 * 240), ], + padding: [0; 5], }, NMVComponent { classes_cdf: cdf!( @@ -551,6 +561,7 @@ pub static default_nmv_context: NMVContext = { cdf!(128 * 234), cdf!(128 * 240), ], + padding: [0; 5], }, ], } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index daf2977321..350400b788 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -14,64 +14,71 @@ const CDF_LEN_MAX: usize = 16; #[derive(Clone, Copy)] #[repr(C)] pub struct CDFContext { - pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; PARTITION_CONTEXTS], - pub kf_y_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], - pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], - pub uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2], - pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS], - pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], - pub newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS], - pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], - pub refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS], - pub intra_tx_cdf: - [[[[u16; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], - pub inter_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], - pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; MAX_TX_CATS], - pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], - pub skip_cdfs: [[u16; 2]; SKIP_CONTEXTS], - pub intra_inter_cdfs: [[u16; 2]; INTRA_INTER_CONTEXTS], - pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES], - pub filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL], - pub palette_y_mode_cdfs: - [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS], - pub palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS], + pub comp_bwd_ref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS], pub comp_mode_cdf: [[u16; 2]; COMP_INTER_CONTEXTS], - pub comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS], pub comp_ref_cdf: [[[u16; 2]; FWD_REFS - 1]; REF_CONTEXTS], - pub comp_bwd_ref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS], - pub single_ref_cdfs: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS], + pub comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS], + pub dc_sign_cdf: [[[u16; 2]; DC_SIGN_CONTEXTS]; PLANE_TYPES], pub drl_cdfs: [[u16; 2]; DRL_MODE_CONTEXTS], - pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], - pub nmv_context: NMVContext, - pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT], - pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1], - pub spatial_segmentation_cdfs: [[u16; 8]; 3], - pub lrf_switchable_cdf: [u16; 3], + pub eob_extra_cdf: + [[[[u16; 2]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL], + pub intra_inter_cdfs: [[u16; 2]; INTRA_INTER_CONTEXTS], pub lrf_sgrproj_cdf: [u16; 2], pub lrf_wiener_cdf: [u16; 2], - - // lv_map + pub newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS], + pub palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS], + pub palette_y_mode_cdfs: + [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS], + pub refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS], + pub single_ref_cdfs: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS], + pub 
skip_cdfs: [[u16; 2]; SKIP_CONTEXTS], pub txb_skip_cdf: [[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES], - pub dc_sign_cdf: [[[u16; 2]; DC_SIGN_CONTEXTS]; PLANE_TYPES], - pub eob_extra_cdf: - [[[[u16; 2]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], + pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], + + pub coeff_base_eob_cdf: + [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], + pub lrf_switchable_cdf: [u16; 3], + pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; MAX_TX_CATS], + + pub coeff_base_cdf: + [[[[u16; 4]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub coeff_br_cdf: + [[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1], + pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT], pub eob_flag_cdf16: [[[u16; 5]; 2]; PLANE_TYPES], + pub eob_flag_cdf32: [[[u16; 6]; 2]; PLANE_TYPES], + + pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES], pub eob_flag_cdf64: [[[u16; 7]; 2]; PLANE_TYPES], + + pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS], + pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], pub eob_flag_cdf128: [[[u16; 8]; 2]; PLANE_TYPES], + pub spatial_segmentation_cdfs: [[u16; 8]; 3], + pub eob_flag_cdf256: [[[u16; 9]; 2]; PLANE_TYPES], + pub eob_flag_cdf512: [[[u16; 10]; 2]; PLANE_TYPES], + pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; PARTITION_CONTEXTS], + pub eob_flag_cdf1024: [[[u16; 11]; 2]; PLANE_TYPES], - pub coeff_base_eob_cdf: - [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], - pub coeff_base_cdf: - [[[[u16; 4]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], - pub coeff_br_cdf: - [[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], + pub kf_y_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], + pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], + + pub uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2], - padding: [u16; CDF_LEN_MAX], + pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], + pub inter_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], + pub intra_tx_cdf: + [[[[u16; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], + + pub nmv_context: NMVContext, } impl CDFContext { @@ -133,8 +140,6 @@ impl CDFContext { coeff_base_eob_cdf: av1_default_coeff_base_eob_multi_cdfs[qctx], coeff_base_cdf: av1_default_coeff_base_multi_cdfs[qctx], coeff_br_cdf: av1_default_coeff_lps_multi_cdfs[qctx], - - padding: [0; CDF_LEN_MAX], } } From 165b9b5f038ce25ff99d2eb380be27b5557ca41e Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Fri, 5 Feb 2021 01:43:46 +0900 Subject: [PATCH 010/155] Partition CDF log by size The vast majority of writes are to small CDFs. --- src/context/cdf_context.rs | 129 ++++++++++++++++++++++++++++--------- 1 file changed, 99 insertions(+), 30 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 350400b788..cd5aedf76c 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -11,6 +11,12 @@ use super::*; const CDF_LEN_MAX: usize = 16; +#[derive(Clone)] +pub struct CDFContextCheckpoint { + small: usize, + large: usize, +} + #[derive(Clone, Copy)] #[repr(C)] pub struct CDFContext { @@ -520,58 +526,121 @@ macro_rules! 
symbol_with_update { #[derive(Clone)] pub struct ContextWriterCheckpoint { - pub fc: usize, + pub fc: CDFContextCheckpoint, pub bc: BlockContextCheckpoint, } -pub struct CDFContextLog { - base: usize, - data: Vec<[u16; CDF_LEN_MAX + 1]>, +struct CDFContextLogBase { + pub base: *const CDFContext, + pub data: Vec, } -impl CDFContextLog { - fn new(fc: &CDFContext) -> Self { - Self { base: fc as *const _ as usize, data: Vec::with_capacity(1 << 15) } - } - fn checkpoint(&self) -> usize { - self.data.len() +impl CDFContextLogBase { + fn new(fc: &CDFContext, capacity: usize) -> Self { + Self { base: fc as _, data: Vec::with_capacity(capacity) } } +} + +trait CDFContextLogSize { + const CDF_LEN_MAX: usize; +} +trait CDFContextLogOps: CDFContextLogSize { #[inline(always)] - pub fn push(&mut self, cdf: &[u16]) { - let offset = cdf.as_ptr() as usize - self.base; + fn push(log: &mut CDFContextLogBase, cdf: &[u16]) { + debug_assert!(cdf.len() <= Self::CDF_LEN_MAX); + let offset = cdf.as_ptr() as usize - log.base as usize; debug_assert!(offset <= u16::MAX.into()); unsafe { // Maintain an invariant of non-zero spare capacity, so that branching // may be deferred until writes are issued. Benchmarks indicate this is // faster than first testing capacity and possibly reallocating. - let len = self.data.len(); - debug_assert!(len < self.data.capacity()); - let entry = self.data.get_unchecked_mut(len); - let dst = entry.as_mut_ptr(); - dst.copy_from_nonoverlapping(cdf.as_ptr(), CDF_LEN_MAX); - entry[CDF_LEN_MAX] = offset as u16; - self.data.set_len(len + 1); - self.data.reserve(1); + let len = log.data.len(); + debug_assert!(len + Self::CDF_LEN_MAX < log.data.capacity()); + let dst = log.data.get_unchecked_mut(len) as *mut u16; + dst.copy_from_nonoverlapping(cdf.as_ptr(), Self::CDF_LEN_MAX); + *dst.add(Self::CDF_LEN_MAX) = offset as u16; + log.data.set_len(len + Self::CDF_LEN_MAX + 1); + log.data.reserve(Self::CDF_LEN_MAX + 1); } } #[inline(always)] - pub fn rollback(&mut self, fc: &mut CDFContext, checkpoint: usize) { + fn rollback( + log: &mut CDFContextLogBase, fc: &mut CDFContext, checkpoint: usize, + ) { let base = fc as *mut _ as *mut u8; - let mut len = self.data.len(); + let mut len = log.data.len(); unsafe { - let mut src = self.data.get_unchecked_mut(len).as_ptr(); + let mut src = log.data.get_unchecked_mut(len) as *mut u16; while len > checkpoint { - len -= 1; - src = src.sub(CDF_LEN_MAX + 1); - let offset = *src.add(CDF_LEN_MAX) as usize; + len -= Self::CDF_LEN_MAX + 1; + src = src.sub(Self::CDF_LEN_MAX + 1); + let offset = *src.add(Self::CDF_LEN_MAX) as usize; let dst = base.add(offset) as *mut u16; - dst.copy_from_nonoverlapping(src, CDF_LEN_MAX); + dst.copy_from_nonoverlapping(src, Self::CDF_LEN_MAX); } - self.data.set_len(len); + log.data.set_len(len); } } +} + +struct CDFContextLogSmall(CDFContextLogBase); +struct CDFContextLogLarge(CDFContextLogBase); + +impl CDFContextLogOps for CDFContextLogSmall {} +impl CDFContextLogOps for CDFContextLogLarge {} +impl CDFContextLogSize for CDFContextLogSmall { + const CDF_LEN_MAX: usize = 4; +} +impl CDFContextLogSize for CDFContextLogLarge { + const CDF_LEN_MAX: usize = CDF_LEN_MAX; +} +impl CDFContextLogSmall { + fn new(fc: &CDFContext) -> Self { + Self(CDFContextLogBase::new(fc, 1 << 18)) + } +} +impl CDFContextLogLarge { + fn new(fc: &CDFContext) -> Self { + Self(CDFContextLogBase::new(fc, 1 << 13)) + } +} + +pub struct CDFContextLog { + small: CDFContextLogSmall, + large: CDFContextLogLarge, +} + +impl CDFContextLog { + pub fn new(fc: &CDFContext) 
-> Self { + Self { + small: CDFContextLogSmall::new(fc), + large: CDFContextLogLarge::new(fc), + } + } + fn checkpoint(&self) -> CDFContextCheckpoint { + CDFContextCheckpoint { + small: self.small.0.data.len(), + large: self.large.0.data.len(), + } + } + #[inline(always)] + pub fn push(&mut self, cdf: &[u16]) { + if cdf.len() <= CDFContextLogSmall::CDF_LEN_MAX { + CDFContextLogSmall::push(&mut self.small.0, cdf); + } else { + CDFContextLogLarge::push(&mut self.large.0, cdf); + } + } + #[inline(always)] + pub fn rollback( + &mut self, fc: &mut CDFContext, checkpoint: &CDFContextCheckpoint, + ) { + CDFContextLogSmall::rollback(&mut self.small.0, fc, checkpoint.small); + CDFContextLogLarge::rollback(&mut self.large.0, fc, checkpoint.large); + } pub fn clear(&mut self) { - self.data.clear(); + self.small.0.data.clear(); + self.large.0.data.clear(); } } @@ -618,7 +687,7 @@ impl<'a> ContextWriter<'a> { } pub fn rollback(&mut self, checkpoint: &ContextWriterCheckpoint) { - self.fc_log.rollback(&mut self.fc, checkpoint.fc); + self.fc_log.rollback(&mut self.fc, &checkpoint.fc); self.bc.rollback(&checkpoint.bc); #[cfg(feature = "desync_finder")] { From afc9ac18dda44e307d8b5c32a3abab7532dbe3b8 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sun, 10 Jan 2021 14:12:10 -0500 Subject: [PATCH 011/155] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc Relative speed-ups over C code (compared with gcc-9.3.0): C AVX2 wiener_5tap_10bpc: 194892.0 14831.9 13.14x wiener_5tap_12bpc: 194295.4 14828.9 13.10x wiener_7tap_10bpc: 194391.7 19461.4 9.99x wiener_7tap_12bpc: 194136.1 19418.7 10.00x --- build.rs | 1 + src/x86/looprestoration16_avx2.asm | 466 +++++++++++++++++++++++++++++ 2 files changed, 467 insertions(+) create mode 100644 src/x86/looprestoration16_avx2.asm diff --git a/build.rs b/build.rs index 488da1010e..b537274c85 100644 --- a/build.rs +++ b/build.rs @@ -89,6 +89,7 @@ fn build_nasm_files() { "src/x86/ipred_ssse3.asm", "src/x86/itx.asm", "src/x86/itx_ssse3.asm", + "src/x86/looprestoration16_avx2.asm", "src/x86/mc_avx2.asm", "src/x86/mc16_avx2.asm", "src/x86/mc_avx512.asm", diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm new file mode 100644 index 0000000000..2012860b8b --- /dev/null +++ b/src/x86/looprestoration16_avx2.asm @@ -0,0 +1,466 @@ +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 +wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 +wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 +wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 +rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 +rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 +wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +pq_3: dq (6 - 4) + 1 +pq_5: dq (6 - 2) + 1 +pd_65540: dd (1 << (8 + (6 - 4) + 6)) + (1 << (6 - 4)) +pd_262160: dd (1 << (8 + (6 - 2) + 6)) + (1 << (6 - 2)) + +pq_11: dq 12 - (6 - 4) + 1 +pq_9: dq 12 - (6 - 2) + 1 +nd_1047552: dd (1 << (12 - (6 - 4))) - (1 << (12 + 8)) +nd_1048320: dd (1 << (12 - (6 - 2))) - (1 << (12 + 8)) + +pb_wiener5_l: times 2 db 2, 3 +pb_wiener5_r: times 2 db -6, -5 + +pb_wiener7_l: times 2 db 4, 5 +pb_wiener7_m: times 2 db -4, -3 +pb_wiener7_r: times 2 db -8, -7 + +SECTION .text + +INIT_YMM avx2 +cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + vbroadcasti128 m6, [wiener5_shufA] + vpbroadcastd m12, [fq + 2] + vbroadcasti128 m7, [wiener5_shufB] + vpbroadcastw m13, [fq + 6] + vbroadcasti128 m8, [wiener5_shufC] + popcnt bdmaxd, bdmaxm + vpbroadcastd m9, [pd_65540] + movq xm10, [pq_3] + cmp bdmaxd, 10 + je .bits10 + vpbroadcastd m9, [pd_262160] + movq xm10, [pq_5] +.bits10: + pxor m11, m11 + add wq, wq + add srcq, wq + add dstq, wq + neg wq + DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x +.v_loop: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + test leftq, leftq + jz .h_loop + movd xm4, [leftq + 4] + vpblendd m4, [srcq + xq - 4], 0xfe + add leftq, 8 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [srcq + xq] + mova m4, [srcq + xq] + palignr m4, m5, 12 + pshufb m4, [wiener5_l_shuf] + jmp .h_main +.h_loop: + movu m4, [srcq + xq - 4] +.h_main: + movu m5, [srcq + xq + 4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -18*2 + jl .h_have_right + movd xm2, xd + vpbroadcastd m0, [pb_wiener5_l] + vpbroadcastd m1, [pb_wiener5_r] + vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 +.h_have_right: + pshufb m0, m4, m6 + pshufb m2, m4, m7 + paddw m0, m2 + pmaddwd m0, m12 + pshufb m1, m5, m6 + pshufb m3, m5, m7 + paddw m1, m3 + pmaddwd m1, m12 + pshufb m4, m8 + pmaddwd m4, m13 + pshufb m5, m8 + pmaddwd m5, m13 + paddd m0, m4 + paddd m1, m5 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm10 + psrad m1, xm10 + packssdw m0, m1 + pmaxsw m0, m11 + mova [dstq + xq], m0 + add xq, 32 + jl .h_loop + add srcq, ssq + add dstq, 384*2 + dec hd + jg .v_loop + RET + +DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 + +INIT_YMM avx2 +cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + pxor m6, m6 + 
vpbroadcastd m7, [fq + 2] + vpbroadcastd m8, [fq + 6] + popcnt bdmaxd, bdmaxm + vpbroadcastd m9, [nd_1047552] + movq xm10, [pq_11] + cmp bdmaxd, 10 + je .bits10 + vpbroadcastd m9, [nd_1048320] + movq xm10, [pq_9] +.bits10: + vpbroadcastw m11, bdmaxm + add wq, wq + add midq, wq + add dstq, wq + neg wq + DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x + mov msq, 2*384 + mov t0, midq + lea t1, [t0 + msq] + lea t2, [t1 + msq] + lea t3, [t2 + msq] + lea t4, [t3 + msq] + test edgeb, 4 ; LR_HAVE_TOP + jnz .have_top + mov t0, t2 + mov t1, t2 +.have_top: + test edgeb, 8 ; LR_HAVE_BOTTOM + jnz .v_loop + cmp hd, 2 + jg .v_loop + cmp hd, 1 + jne .limit_v + mov t3, t2 +.limit_v: + mov t4, t3 +.v_loop: + mov xq, wq +.h_loop: + mova m1, [t0 + xq] + mova m2, [t1 + xq] + mova m3, [t2 + xq] + mova m4, [t3 + xq] + mova m5, [t4 + xq] + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m5, m4 + pmaddwd m2, m7 + punpckhwd m5, m4 + pmaddwd m5, m7 + paddd m0, m2 + paddd m1, m5 + punpcklwd m2, m3, m6 + pmaddwd m2, m8 + punpckhwd m3, m6 + pmaddwd m3, m8 + paddd m0, m2 + paddd m1, m3 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm10 + psrad m1, xm10 + packusdw m0, m1 + pminuw m0, m11 + mova [dstq + xq], m0 + add xq, 32 + jl .h_loop + add dstq, dsq + mov t0, t1 + mov t1, t2 + mov t2, t3 + mov t3, t4 + add t4, msq + test edgeb, 8 ; LR_HAVE_BOTTOM + jnz .have_bottom + cmp hd, 3 + jg .have_bottom + mov t4, t3 +.have_bottom: + dec hd + jg .v_loop + RET + +INIT_YMM avx2 +cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + vpbroadcastd m7, [fq] + vpbroadcastd m8, [fq + 4] + vbroadcasti128 m10, [rev_w] + vbroadcasti128 m11, [wiener5_shufA] + vbroadcasti128 m12, [wiener7_shufB] + vbroadcasti128 m13, [wiener7_shufC] + vbroadcasti128 m14, [wiener7_shufD] + vbroadcasti128 m15, [rev_d] + popcnt bdmaxd, bdmaxm + vpbroadcastd m9, [pd_65540] + mov rhq, [pq_3] + cmp bdmaxd, 10 + je .bits10 + vpbroadcastd m9, [pd_262160] + mov rhq, [pq_5] +.bits10: + add wq, wq + add srcq, wq + add dstq, wq + neg wq + DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh +.v_loop: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + test leftq, leftq + jz .h_loop + movq xm4, [leftq + 2] + vpblendw xm4, [srcq + xq - 6], 0xf8 + vinserti128 m4, [srcq + xq + 10], 1 + add leftq, 8 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [srcq + xq] + mova m4, [srcq + xq] + palignr m4, m5, 10 + pshufb m4, [wiener7_l_shuf] + jmp .h_main +.h_loop: + movu m4, [srcq + xq - 6] +.h_main: + movu m5, [srcq + xq + 2] + movu m6, [srcq + xq + 6] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -19*2 + jl .h_have_right + movd xm3, xd + vpbroadcastd m0, [pb_wiener7_l] + vpbroadcastd m1, [pb_wiener7_m] + vpbroadcastd m2, [pb_wiener7_r] + vpbroadcastb m3, xm3 + psubb m0, m3 + psubb m1, m3 + psubb m2, m3 + movu m3, [pb_0to31] + pminub m0, m3 + pminub m1, m3 + pminub m2, m3 + pshufb m4, m0 + pshufb m5, m1 + pshufb m6, m2 + cmp xd, -9*2 + jne .hack + vpbroadcastw xm3, [srcq + xq + 16] + vinserti128 m5, xm3, 1 + jmp .h_have_right +.hack: + cmp xd, -1*2 + jne .h_have_right + vpbroadcastw xm5, [srcq + xq] +.h_have_right: + pshufb m6, m10 + pshufb m0, m4, m11 + pshufb m2, m5, m12 + paddw m0, m2 + pmaddwd m0, m7 + pshufb m2, m4, m13 + pshufb m4, m14 + paddw m2, m4 + pmaddwd m2, m8 + pshufb m1, m6, m11 + pshufb m5, m11 + pmaddwd m1, m7 + pmaddwd m5, m7 + pshufb m3, m6, m13 + pshufb m6, m14 + paddw m3, m6 + pmaddwd m3, m8 + paddd 
m0, m2 + paddd m1, m3 + pshufb m1, m15 + paddd m1, m5 + movq xm4, rhq + pxor m5, m5 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm4 + psrad m1, xm4 + packssdw m0, m1 + pmaxsw m0, m5 + mova [dstq + xq], m0 + add xq, 32 + jl .h_loop + add srcq, ssq + add dstq, 384*2 + dec hd + jg .v_loop + RET + +INIT_YMM avx2 +cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + pxor m6, m6 + vpbroadcastd m7, [fq] + vpbroadcastw m8, [fq + 4] + vpbroadcastd m9, [fq + 6] + popcnt bdmaxd, bdmaxm + vpbroadcastd m10, [nd_1047552] + movq xm11, [pq_11] + cmp bdmaxd, 10 + je .bits10 + vpbroadcastd m10, [nd_1048320] + movq xm11, [pq_9] +.bits10: + vpbroadcastw m12, bdmaxm + add wq, wq + add midq, wq + add dstq, wq + neg wq + DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x + mov msq, 2*384 + mov t0, midq + mov t1, t0 + lea t2, [t1 + msq] + lea t3, [t2 + msq] + lea t4, [t3 + msq] + lea t5, [t4 + msq] + lea t6, [t5 + msq] + test edgeb, 4 ; LR_HAVE_TOP + jnz .have_top + mov t0, t3 + mov t1, t3 + mov t2, t3 +.have_top: + cmp hd, 3 + jg .v_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .no_bottom0 + cmp hd, 1 + jg .v_loop + jmp .h3 +.no_bottom0: + cmp hd, 2 + je .h2 + jns .h3 +.h1: + mov t4, t3 +.h2: + mov t5, t4 +.h3: + mov t6, t5 +.v_loop: + mov xq, wq +.h_loop: + mova m1, [t0 + xq] + mova m2, [t1 + xq] + mova m3, [t5 + xq] + mova m4, [t6 + xq] + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m4, m3 + pmaddwd m2, m7 + punpckhwd m4, m3 + pmaddwd m4, m7 + paddd m0, m2 + paddd m1, m4 + mova m3, [t2 + xq] + mova m4, [t4 + xq] + punpcklwd m2, m3, m4 + pmaddwd m2, m8 + punpckhwd m3, m4 + pmaddwd m3, m8 + paddd m0, m2 + paddd m1, m3 + mova m3, [t3 + xq] + punpcklwd m2, m3, m6 + pmaddwd m2, m9 + punpckhwd m3, m6 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + paddd m0, m10 + paddd m1, m10 + psrad m0, xm11 + psrad m1, xm11 + packusdw m0, m1 + pminuw m0, m12 + mova [dstq + xq], m0 + add xq, 32 + jl .h_loop + add dstq, dsq + mov t0, t1 + mov t1, t2 + mov t2, t3 + mov t3, t4 + mov t4, t5 + mov t5, t6 + add t6, msq + cmp hd, 4 + jg .next_row + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .no_bottom + cmp hd, 2 + jg .next_row +.no_bottom: + mov t6, t5 +.next_row: + dec hd + jg .v_loop + RET + +%endif ; ARCH_X86_64 From a183c8ef417ed3a92f420bcfa12526cf75ea15a4 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sat, 16 Jan 2021 11:13:41 -0500 Subject: [PATCH 012/155] x86: cdef: Add SIMD implementation of cdef_dir for 16bpc Relative speed-ups over C code (compared with gcc-9.3.0): C ASM cdef_dir_16bpc_avx2: 534.2 72.5 7.36x cdef_dir_16bpc_ssse3: 534.2 104.8 5.10x cdef_dir_16bpc_ssse3 (x86-32): 854.1 116.2 7.35x --- build.rs | 3 ++ src/x86/cdef16_avx2.asm | 51 ++++++++++++++++++++++++++ src/x86/cdef16_sse.asm | 79 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+) create mode 100644 src/x86/cdef16_avx2.asm create mode 100644 src/x86/cdef16_sse.asm diff --git a/build.rs b/build.rs index b537274c85..c9b81f64a2 100644 --- a/build.rs +++ b/build.rs @@ -100,6 +100,9 @@ fn build_nasm_files() { "src/x86/satd.asm", "src/x86/sse.asm", "src/x86/cdef.asm", + "src/x86/cdef_sse.asm", + "src/x86/cdef16_avx2.asm", + "src/x86/cdef16_sse.asm", "src/x86/tables.asm", ]; diff --git a/src/x86/cdef16_avx2.asm b/src/x86/cdef16_avx2.asm new file mode 100644 index 0000000000..9491d81431 --- /dev/null +++ b/src/x86/cdef16_avx2.asm @@ -0,0 +1,51 @@ +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION .text + +cextern cdef_dir_8bpc_avx2 + +INIT_YMM avx2 +cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax + popcnt bdmaxd, bdmaxd + movzx bdmaxq, bdmaxw + sub bdmaxq, 8 + movq xm2, bdmaxq + DEFINE_ARGS src, ss, var, ss3 + lea ss3q, [ssq*3] + mova xm0, [srcq + ssq*0] + mova xm1, [srcq + ssq*1] + vinserti128 m0, [srcq + ssq*2], 1 + vinserti128 m1, [srcq + ss3q], 1 + psraw m0, xm2 + psraw m1, xm2 + vpackuswb m0, m1 + mova [rsp + 32 + 0*8], m0 + lea srcq, [srcq + ssq*4] + mova xm0, [srcq + ssq*0] + mova xm1, [srcq + ssq*1] + vinserti128 m0, [srcq + ssq*2], 1 + vinserti128 m1, [srcq + ss3q], 1 + psraw m0, xm2 + psraw m1, xm2 + vpackuswb m0, m1 + mova [rsp + 32 + 4*8], m0 + lea srcq, [rsp + 32] ; WIN64 shadow space + mov ssq, 8 + call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) + RET + +%endif ; ARCH_X86_64 diff --git a/src/x86/cdef16_sse.asm b/src/x86/cdef16_sse.asm new file mode 100644 index 0000000000..ced7621498 --- /dev/null +++ b/src/x86/cdef16_sse.asm @@ -0,0 +1,79 @@ +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%ifn ARCH_X86_64 +SECTION_RODATA 16 + +pq_dir_shr: dq 2, 4 +%endif + +SECTION .text + +cextern cdef_dir_8bpc_ssse3 + +INIT_XMM ssse3 +cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax + bsr bdmaxd, bdmaxm +%if ARCH_X86_64 + movzx bdmaxq, bdmaxw + sub bdmaxq, 7 + movq m4, bdmaxq +%else + push r4 + sub bdmaxd, 9 + LEA r4, pq_dir_shr + movq m4, [r4 + bdmaxd*4] + pop r4 +%endif + DEFINE_ARGS src, ss, var, ss3 + lea ss3q, [ssq*3] + mova m0, [srcq + ssq*0] + mova m1, [srcq + ssq*1] + mova m2, [srcq + ssq*2] + mova m3, [srcq + ss3q] + psraw m0, m4 + psraw m1, m4 + psraw m2, m4 + psraw m3, m4 + packuswb m0, m1 + packuswb m2, m3 + mova [rsp + 32 + 0*8], m0 + mova [rsp + 32 + 2*8], m2 + lea srcq, [srcq + ssq*4] + mova m0, [srcq + ssq*0] + mova m1, [srcq + ssq*1] + mova m2, [srcq + ssq*2] + mova m3, [srcq + ss3q] + psraw m0, m4 + psraw m1, m4 + psraw m2, m4 + psraw m3, m4 + packuswb m0, m1 + packuswb m2, m3 + mova [rsp + 32 + 4*8], m0 + mova [rsp + 32 + 6*8], m2 + lea srcq, [rsp + 32] ; WIN64 shadow space + mov ssq, 8 +%if ARCH_X86_64 + call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) +%else + movifnidn vard, varm + push eax ; align stack + push vard + push ssd + push srcd + call mangle(private_prefix %+ _cdef_dir_8bpc) + add esp, 0x10 +%endif + RET From bc17f4852ce71b429cd14d2a03fd9936fa3d4a38 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Tue, 19 Jan 2021 23:01:20 -0500 Subject: [PATCH 013/155] Enable AVX2 cdef_dir HBD function. --- src/asm/x86/cdef.rs | 21 ++++++++++++++++----- src/x86/cdef.asm | 2 +- src/x86/cdef_sse.asm | 4 ++-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/asm/x86/cdef.rs b/src/asm/x86/cdef.rs index 1bf05041bd..7439396287 100644 --- a/src/asm/x86/cdef.rs +++ b/src/asm/x86/cdef.rs @@ -183,8 +183,12 @@ cpu_function_lookup_table!( type CdefDirLBDFn = unsafe extern fn(tmp: *const u8, tmp_stride: isize, var: *mut u32) -> i32; -type CdefDirHBDFn = - unsafe extern fn(tmp: *const u16, tmp_stride: isize, var: *mut u32) -> i32; +type CdefDirHBDFn = unsafe extern fn( + tmp: *const u16, + tmp_stride: isize, + var: *mut u32, + bitdepth_max: i32, +) -> i32; #[inline(always)] #[allow(clippy::let_and_return)] @@ -223,6 +227,7 @@ pub(crate) fn cdef_find_dir( img.as_ptr() as *const _, T::to_asm_stride(img.plane.cfg.stride), var as *mut u32, + (1 << (coeff_shift + 8)) - 1, ) } } else { @@ -241,21 +246,27 @@ pub(crate) fn cdef_find_dir( } extern { - fn rav1e_cdef_dir_8_avx2( + fn rav1e_cdef_dir_8bpc_avx2( tmp: *const u8, tmp_stride: isize, var: *mut u32, ) -> i32; } +extern { + fn rav1e_cdef_dir_16bpc_avx2( + tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, + ) -> i32; +} + cpu_function_lookup_table!( CDEF_DIR_LBD_FNS: [Option], default: None, - [(AVX2, Some(rav1e_cdef_dir_8_avx2))] + [(AVX2, Some(rav1e_cdef_dir_8bpc_avx2))] ); cpu_function_lookup_table!( CDEF_DIR_HBD_FNS: [Option], default: None, - [] + [(AVX2, Some(rav1e_cdef_dir_16bpc_avx2))] ); #[cfg(test)] diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 0ea98df67f..abdf125c21 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -264,7 +264,7 @@ CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 -cglobal cdef_dir_8, 3, 4, 15, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] diff --git a/src/x86/cdef_sse.asm b/src/x86/cdef_sse.asm index 2dcaf22a68..9335e727bc 100644 --- a/src/x86/cdef_sse.asm +++ 
b/src/x86/cdef_sse.asm @@ -758,7 +758,7 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ %macro CDEF_DIR 0 %if ARCH_X86_64 -cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 3, 5, 16, 32, src, stride, var, stride3 lea stride3q, [strideq*3] movq m1, [srcq+strideq*0] movhps m1, [srcq+strideq*1] @@ -1030,7 +1030,7 @@ cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3 shr r1d, 10 mov [varq], r1d %else -cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 %define base r2-shufw_6543210x LEA r2, shufw_6543210x pxor m0, m0 From 9faf476f496a6be224847b0c00fe7f517cdb44d1 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 9 Feb 2021 11:01:51 +0100 Subject: [PATCH 014/155] Mark the window for 0.5.0 API changes --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c04c3c687d..8c9ee8615d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "rav1e" -version = "0.4.0" +version = "0.5.0-alpha" authors = ["Thomas Daede "] edition = "2018" build = "build.rs" From d43c653b50ef8f5c5995a81bf2f590e20b6f5540 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 14 Jan 2021 15:06:33 +0100 Subject: [PATCH 015/155] Add Opaque as shorthand for Box --- src/api/internal.rs | 4 ++-- src/api/util.rs | 6 +++++- src/capi.rs | 7 ++----- src/frame/mod.rs | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index 6d116db646..465f5e317b 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -10,7 +10,7 @@ use crate::activity::ActivityMask; use crate::api::lookahead::*; -use crate::api::{EncoderConfig, EncoderStatus, FrameType, Packet}; +use crate::api::{EncoderConfig, EncoderStatus, FrameType, Opaque, Packet}; use crate::color::ChromaSampling::Cs400; use crate::cpu_features::CpuFeatureLevel; use crate::dist::get_satd; @@ -256,7 +256,7 @@ pub(crate) struct ContextInner { /// The next `output_frameno` to be computed by lookahead. next_lookahead_output_frameno: u64, /// Optional opaque to be sent back to the user - opaque_q: BTreeMap>, + opaque_q: BTreeMap, } impl ContextInner { diff --git a/src/api/util.rs b/src/api/util.rs index 800d4d4a19..4ae0056a68 100644 --- a/src/api/util.rs +++ b/src/api/util.rs @@ -13,11 +13,15 @@ use crate::serialize::{Deserialize, Serialize}; use crate::stats::EncoderStats; use crate::util::Pixel; +use std::any::Any; use std::fmt; use std::sync::Arc; use thiserror::*; +/// Opaque type to be passed from Frame to Packet +pub type Opaque = Box; + // TODO: use the num crate? /// A rational number. 
#[derive(Clone, Copy, Debug)] @@ -181,7 +185,7 @@ pub struct Packet { pub enc_stats: EncoderStats, /// Optional user-provided opaque data #[cfg_attr(feature = "serialize", serde(skip))] - pub opaque: Option>, + pub opaque: Option, } impl PartialEq for Packet { diff --git a/src/capi.rs b/src/capi.rs index f4b4273c9d..fb2ed3c4f7 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -180,7 +180,7 @@ impl EncContext { } fn send_frame( &mut self, frame: Option, frame_type: FrameTypeOverride, - opaque: Option>, + opaque: Option, ) -> Result<(), rav1e::EncoderStatus> { let info = rav1e::FrameParameters { frame_type_override: frame_type, opaque }; @@ -1062,10 +1062,7 @@ pub unsafe extern fn rav1e_send_frame( let maybe_opaque = if frame.is_null() { None } else { - (*frame) - .opaque - .take() - .map(|o| Box::new(o) as Box) + (*frame).opaque.take().map(|o| Box::new(o) as rav1e::Opaque) }; let ret = (*ctx) diff --git a/src/frame/mod.rs b/src/frame/mod.rs index c245d6029d..af0f11d273 100644 --- a/src/frame/mod.rs +++ b/src/frame/mod.rs @@ -9,6 +9,7 @@ use num_derive::FromPrimitive; +use crate::api::Opaque; use crate::context::SB_SIZE; use crate::mc::SUBPEL_FILTER_SIZE; use crate::util::*; @@ -39,7 +40,7 @@ pub struct FrameParameters { /// Force emitted frame to be of the type selected pub frame_type_override: FrameTypeOverride, /// Output the provided data in the matching encoded Packet - pub opaque: Option>, + pub opaque: Option, } pub use v_frame::frame::Frame; From 83b41f8d5e2d03184e36640aa3dfa87b5f8daef6 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Fri, 15 Jan 2021 12:18:03 +0100 Subject: [PATCH 016/155] Make Opaque a newtype And add Sync to its bounds Fixes: #2641 --- src/api/test.rs | 2 +- src/api/util.rs | 22 +++++++++++++++++++++- src/capi.rs | 3 ++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/api/test.rs b/src/api/test.rs index 21d8fa3459..5cbeaa6ea1 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -1176,7 +1176,7 @@ fn send_frame_kf(ctx: &mut Context, keyframe: bool) { let frame_type_override = if keyframe { FrameTypeOverride::Key } else { FrameTypeOverride::No }; - let opaque = Some(Box::new(keyframe) as Box); + let opaque = Some(Opaque::new(keyframe)); let fp = FrameParameters { frame_type_override, opaque }; diff --git a/src/api/util.rs b/src/api/util.rs index 4ae0056a68..71b8611a9e 100644 --- a/src/api/util.rs +++ b/src/api/util.rs @@ -20,7 +20,27 @@ use std::sync::Arc; use thiserror::*; /// Opaque type to be passed from Frame to Packet -pub type Opaque = Box; +#[derive(Debug)] +pub struct Opaque(Box); + +impl Opaque { + /// Wrap a type in the opaque struct + pub fn new(t: T) -> Self { + Opaque(Box::new(t) as Box) + } + + /// Attempt to downcast the opaque to a concrete type. + pub fn downcast(self) -> Result, Opaque> { + if self.0.is::() { + unsafe { + let raw: *mut (dyn Any + Send + Sync) = Box::into_raw(self.0); + Ok(Box::from_raw(raw as *mut T)) + } + } else { + Err(self) + } + } +} // TODO: use the num crate? /// A rational number. 
diff --git a/src/capi.rs b/src/capi.rs index fb2ed3c4f7..d0aeda1f82 100644 --- a/src/capi.rs +++ b/src/capi.rs @@ -67,6 +67,7 @@ struct FrameOpaque { } unsafe impl Send for FrameOpaque {} +unsafe impl Sync for FrameOpaque {} impl Default for FrameOpaque { fn default() -> Self { @@ -1062,7 +1063,7 @@ pub unsafe extern fn rav1e_send_frame( let maybe_opaque = if frame.is_null() { None } else { - (*frame).opaque.take().map(|o| Box::new(o) as rav1e::Opaque) + (*frame).opaque.take().map(|o| rav1e::Opaque::new(o)) }; let ret = (*ctx) From 1be607d4535d8d468790e69a4a418372eed711e3 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 9 Feb 2021 22:26:58 +0900 Subject: [PATCH 017/155] Reduce size of BlockContextCheckpoint --- src/context/block_unit.rs | 63 +++++++++++++++++++++++++++++++------- src/context/cdf_context.rs | 6 ++-- src/encoder.rs | 2 +- src/rdo.rs | 9 +++--- 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index df7b4fe138..109ec24dcb 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -211,13 +211,15 @@ impl Default for Block { #[derive(Clone)] pub struct BlockContextCheckpoint { + x: usize, + chroma_sampling: ChromaSampling, cdef_coded: bool, - above_partition_context: [u8; PARTITION_CONTEXT_MAX_WIDTH], + above_partition_context: [u8; MIB_SIZE >> 1], // left context is also at 8x8 granularity left_partition_context: [u8; MIB_SIZE >> 1], - above_tx_context: [u8; COEFF_CONTEXT_MAX_WIDTH], + above_tx_context: [u8; MIB_SIZE], left_tx_context: [u8; MIB_SIZE], - above_coeff_context: [[u8; COEFF_CONTEXT_MAX_WIDTH]; MAX_PLANES], + above_coeff_context: [[u8; MIB_SIZE]; MAX_PLANES], left_coeff_context: [[u8; MIB_SIZE]; MAX_PLANES], } @@ -256,25 +258,64 @@ impl<'a> BlockContext<'a> { } } - pub const fn checkpoint(&self) -> BlockContextCheckpoint { - BlockContextCheckpoint { + pub fn checkpoint( + &self, tile_bo: &TileBlockOffset, chroma_sampling: ChromaSampling, + ) -> BlockContextCheckpoint { + let x = tile_bo.0.x & (COEFF_CONTEXT_MAX_WIDTH - MIB_SIZE); + let mut checkpoint = BlockContextCheckpoint { + x, + chroma_sampling, cdef_coded: self.cdef_coded, - above_partition_context: self.above_partition_context, + above_partition_context: [0; MIB_SIZE >> 1], left_partition_context: self.left_partition_context, - above_tx_context: self.above_tx_context, + above_tx_context: [0; MIB_SIZE], left_tx_context: self.left_tx_context, - above_coeff_context: self.above_coeff_context, + above_coeff_context: [[0; MIB_SIZE]; MAX_PLANES], left_coeff_context: self.left_coeff_context, + }; + checkpoint.above_partition_context.copy_from_slice( + &self.above_partition_context[(x >> 1)..][..(MIB_SIZE >> 1)], + ); + checkpoint + .above_tx_context + .copy_from_slice(&self.above_tx_context[x..][..MIB_SIZE]); + let num_planes = + if chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; + for (p, (dst, src)) in checkpoint + .above_coeff_context + .iter_mut() + .zip(self.above_coeff_context.iter()) + .enumerate() + .take(num_planes) + { + let xdec = (p > 0 && chroma_sampling != ChromaSampling::Cs444) as usize; + dst.copy_from_slice(&src[(x >> xdec)..][..MIB_SIZE]); } + checkpoint } pub fn rollback(&mut self, checkpoint: &BlockContextCheckpoint) { + let x = checkpoint.x & (COEFF_CONTEXT_MAX_WIDTH - MIB_SIZE); self.cdef_coded = checkpoint.cdef_coded; - self.above_partition_context = checkpoint.above_partition_context; + self.above_partition_context[(x >> 1)..][..(MIB_SIZE >> 1)] + 
.copy_from_slice(&checkpoint.above_partition_context); self.left_partition_context = checkpoint.left_partition_context; - self.above_tx_context = checkpoint.above_tx_context; + self.above_tx_context[x..][..MIB_SIZE] + .copy_from_slice(&checkpoint.above_tx_context); self.left_tx_context = checkpoint.left_tx_context; - self.above_coeff_context = checkpoint.above_coeff_context; + let num_planes = + if checkpoint.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; + for (p, (dst, src)) in self + .above_coeff_context + .iter_mut() + .zip(checkpoint.above_coeff_context.iter()) + .enumerate() + .take(num_planes) + { + let xdec = (p > 0 && checkpoint.chroma_sampling != ChromaSampling::Cs444) + as usize; + dst[(x >> xdec)..][..MIB_SIZE].copy_from_slice(src); + } self.left_coeff_context = checkpoint.left_coeff_context; } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index cd5aedf76c..a2f6eb6cc0 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -679,10 +679,12 @@ impl<'a> ContextWriter<'a> { - (if element + 1 < cdf.len() { cdf[element] } else { 0 }) } - pub fn checkpoint(&self) -> ContextWriterCheckpoint { + pub fn checkpoint( + &self, tile_bo: &TileBlockOffset, chroma_sampling: ChromaSampling, + ) -> ContextWriterCheckpoint { ContextWriterCheckpoint { fc: self.fc_log.checkpoint(), - bc: self.bc.checkpoint(), + bc: self.bc.checkpoint(tile_bo, chroma_sampling), } } diff --git a/src/encoder.rs b/src/encoder.rs index 756f82bed0..8098285197 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -2388,7 +2388,7 @@ fn encode_partition_bottomup( let mut best_partition = PartitionType::PARTITION_INVALID; - let cw_checkpoint = cw.checkpoint(); + let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let w_pre_checkpoint = w_pre_cdef.checkpoint(); let w_post_checkpoint = w_post_cdef.checkpoint(); diff --git a/src/rdo.rs b/src/rdo.rs index 88773c4e05..ba753dc9bf 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -928,7 +928,7 @@ pub fn rdo_mode_decision( inter_cfg: &InterConfig, ) -> PartitionParameters { let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; - let cw_checkpoint = cw.checkpoint(); + let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let rdo_type = if fi.use_tx_domain_rate { RDOType::TxDistEstRate @@ -976,7 +976,7 @@ pub fn rdo_mode_decision( cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx); let chroma_mode = PredictionMode::UV_CFL_PRED; - let cw_checkpoint = cw.checkpoint(); + let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let wr: &mut dyn Writer = &mut WriterCounter::new(); let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 }; @@ -1650,7 +1650,8 @@ pub fn rdo_tx_type_decision( if cw_checkpoint.is_none() { // Only run the first call // Prevents creating multiple checkpoints for own version of cw - *cw_checkpoint = Some(cw.checkpoint()); + *cw_checkpoint = + Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling)); } let rdo_type = if fi.use_tx_domain_distortion { @@ -1879,7 +1880,7 @@ pub fn rdo_partition_decision( let mut best_rd = cached_block.rd_cost; let mut best_pred_modes = cached_block.part_modes.clone(); - let cw_checkpoint = cw.checkpoint(); + let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let w_pre_checkpoint = w_pre_cdef.checkpoint(); let w_post_checkpoint = w_post_cdef.checkpoint(); From b5ce71a6f4e330d47814483ea6812e95925e9048 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Wed, 10 Feb 2021 23:05:11 +0900 Subject: [PATCH 018/155] Correct a misuse of pattern in ac_q and dc_q The original intent was a simple lookup but the expression required the compiler to make a copy of the array referenced. --- src/quantize.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/quantize.rs b/src/quantize.rs index f02d96eeab..52df8bbfd1 100644 --- a/src/quantize.rs +++ b/src/quantize.rs @@ -32,7 +32,7 @@ pub fn get_log_tx_scale(tx_size: TxSize) -> usize { } pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> i16 { - let &table = match bit_depth { + let table = match bit_depth { 8 => &dc_qlookup_Q3, 10 => &dc_qlookup_10_Q3, 12 => &dc_qlookup_12_Q3, @@ -43,7 +43,7 @@ pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> i16 { } pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> i16 { - let &table = match bit_depth { + let table = match bit_depth { 8 => &ac_qlookup_Q3, 10 => &ac_qlookup_10_Q3, 12 => &ac_qlookup_12_Q3, From 784aeab92cff18684b3f0bb50a670e430a51e3fc Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Wed, 10 Feb 2021 15:08:58 +0900 Subject: [PATCH 019/155] Use explicit table lookup for ac_q and dc_q Also compute the table indices such that bounds checks are elided. 
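As an aside, a minimal sketch of the index-clamping pattern this relies on
(hypothetical table and function names, not the rav1e lookup itself): widening
to isize, clamping to 0..=255, and only then indexing a [i16; 256] lets the
compiler prove the access is in bounds and drop the panic branch.

    static TABLE: [i16; 256] = [0; 256];

    fn lookup(qindex: u8, delta_q: i8) -> i16 {
        // qindex + delta_q may fall outside 0..=255, so widen, clamp, then index.
        let idx = ((qindex as isize + delta_q as isize).max(0) as usize).min(255);
        // idx is provably < 256 here, so no bounds check is emitted.
        TABLE[idx]
    }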
--- src/quantize.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/src/quantize.rs b/src/quantize.rs index 52df8bbfd1..dcea1fab74 100644 --- a/src/quantize.rs +++ b/src/quantize.rs @@ -32,25 +32,17 @@ pub fn get_log_tx_scale(tx_size: TxSize) -> usize { } pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> i16 { - let table = match bit_depth { - 8 => &dc_qlookup_Q3, - 10 => &dc_qlookup_10_Q3, - 12 => &dc_qlookup_12_Q3, - _ => unimplemented!(), - }; - - table[(qindex as isize + delta_q as isize).max(0).min(255) as usize] + static DC_Q: [&[i16; 256]; 3] = + [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3]; + let bd = ((bit_depth ^ 8) >> 1).min(2); + DC_Q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> i16 { - let table = match bit_depth { - 8 => &ac_qlookup_Q3, - 10 => &ac_qlookup_10_Q3, - 12 => &ac_qlookup_12_Q3, - _ => unimplemented!(), - }; - - table[(qindex as isize + delta_q as isize).max(0).min(255) as usize] + static AC_Q: [&[i16; 256]; 3] = + [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3]; + let bd = ((bit_depth ^ 8) >> 1).min(2); + AC_Q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } // TODO: Handle lossless properly. From a89f7c0432dfaf9b03ecb07fee5935095bd1ef65 Mon Sep 17 00:00:00 2001 From: Luni-4 Date: Fri, 12 Feb 2021 14:21:28 +0100 Subject: [PATCH 020/155] Fix clippy warnings --- src/rdo.rs | 24 +++++++++++++----------- v_frame/src/pixel.rs | 1 + 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/rdo.rs b/src/rdo.rs index ba753dc9bf..f065ded1cd 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -1771,7 +1771,7 @@ fn rdo_partition_none( cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec<[PartitionParameters; 4]>, -) -> Option { +) -> f64 { debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); @@ -1779,7 +1779,7 @@ fn rdo_partition_none( child_modes.push(mode); - Some(cost) + cost } // VERTICAL, HORIZONTAL or simple SPLIT @@ -1893,15 +1893,17 @@ pub fn rdo_partition_decision( let mut child_modes = ArrayVec::<[_; 4]>::new(); let cost = match partition { - PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => rdo_partition_none( - fi, - ts, - cw, - bsize, - tile_bo, - inter_cfg, - &mut child_modes, - ), + PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => { + Some(rdo_partition_none( + fi, + ts, + cw, + bsize, + tile_bo, + inter_cfg, + &mut child_modes, + )) + } PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { rdo_partition_simple( fi, diff --git a/v_frame/src/pixel.rs b/v_frame/src/pixel.rs index 865706d1e8..69972b3609 100644 --- a/v_frame/src/pixel.rs +++ b/v_frame/src/pixel.rs @@ -114,6 +114,7 @@ pub trait Pixel: /// Converts stride in pixels to stride in bytes. 
#[inline] + #[allow(clippy::wrong_self_convention)] fn to_asm_stride(in_stride: usize) -> isize { (in_stride * size_of::()) as isize } From a62c40c2220e07bfca68597be15bb0ae39af6d7c Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sun, 14 Feb 2021 09:41:46 +0100 Subject: [PATCH 021/155] Fix the channel-api based cli --- src/bin/rav1e-ch.rs | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index f2bf5a23b4..780c86d596 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -67,29 +67,19 @@ impl Source { cfg_if::cfg_if! { if #[cfg(all(unix, feature = "signal-hook"))] { fn new(limit: usize, input: D) -> Self { - let exit_requested = { - use std::sync::atomic::*; - let e = Arc::new(AtomicBool::from(false)); - - fn setup_signal(sig: i32, e: Arc) { - unsafe { - signal_hook::register(sig, move || { - if e.load(Ordering::SeqCst) { - std::process::exit(128 + sig); - } - e.store(true, Ordering::SeqCst); - info!("Exit requested, flushing."); - }) - .expect("Cannot register the signal hooks"); - } - } - - setup_signal(signal_hook::SIGTERM, e.clone()); - setup_signal(signal_hook::SIGQUIT, e.clone()); - setup_signal(signal_hook::SIGINT, e.clone()); - - e - }; + use signal_hook::{flag, consts}; + + // Make sure double CTRL+C and similar kills + let exit_requested = Arc::new(std::sync::atomic::AtomicBool::new(false)); + for sig in consts::TERM_SIGNALS { + // When terminated by a second term signal, exit with exit code 1. + // This will do nothing the first time (because term_now is false). + flag::register_conditional_shutdown(*sig, 1, Arc::clone(&exit_requested)).unwrap(); + // But this will "arm" the above for the second time, by setting it to true. + // The order of registering these is important, if you put this one first, it will + // first arm and then terminate ‒ all in the first round. 
+ flag::register(*sig, Arc::clone(&exit_requested)).unwrap(); + } Self { limit, input, count: 0, exit_requested, } } } else { From 1869a8b2d78b2f4db6e00e88cac3f144e637ab37 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Mon, 15 Feb 2021 15:54:37 +0900 Subject: [PATCH 022/155] CI: Update libaom to 2.0.2-dmo0~bpo10+1 --- .github/workflows/rav1e.yml | 6 +++--- .travis/install-aom.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index d1a9578ba3..b482f264f3 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -128,11 +128,11 @@ jobs: matrix.conf == 'grcov-coveralls' env: LINK: https://www.deb-multimedia.org/pool/main/a/aom-dmo - AOM_VERSION: 2.0.1-dmo0~bpo10+1 + AOM_VERSION: 2.0.2-dmo0~bpo10+1 AOM_DEV_SHA256: >- - 57adde89e7e82da7839298739fe7c481f38daa15b0ca709ecd0881ab60787410 + d31eee6524ea64c080312eeafc65355e378e043b1d738ff9b9bde3734a85779c AOM_LIB_SHA256: >- - cc418d1be85ae01eac96a5501b284554405bf8d73c59395a988de6812c57a84a + db8a04ca0984604f410c6bd8810ee31666a3bfd3964f3109cdb8f1ae33fec664 run: | echo "$LINK/libaom-dev_${AOM_VERSION}_amd64.deb" >> DEBS echo "$LINK/libaom2_${AOM_VERSION}_amd64.deb" >> DEBS diff --git a/.travis/install-aom.sh b/.travis/install-aom.sh index 937fd94f48..1316d56f94 100755 --- a/.travis/install-aom.sh +++ b/.travis/install-aom.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -AOM_VERSION="2.0.1-dmo0~bpo10+1" +AOM_VERSION="2.0.2-dmo0~bpo10+1" PKG_URL="https://www.deb-multimedia.org/pool/main/a/aom-dmo" ARCH="arm64" @@ -13,8 +13,8 @@ curl -O "${PKG_URL}/libaom-dev_${AOM_VERSION}_${ARCH}.deb" \ -O "${PKG_URL}/libaom2_${AOM_VERSION}_${ARCH}.deb" sha256sum --check --ignore-missing < Date: Sun, 14 Feb 2021 02:04:20 +0900 Subject: [PATCH 023/155] Tune flow of inline methods in symbol_with_update * Mostly avoid calling Vec::reserve() in CDFContextLogOps::push(). * Manually elide bounds checks in WriterBase::symbol() as they are not easily inferred by the compiler although statically known. * Rewrite ec::rust::update_cdf() to be panic-free and hint to the compiler not to unroll beyond the maximum CDF length. --- src/context/cdf_context.rs | 12 ++++++++---- src/ec.rs | 25 ++++++++++++++++++------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index a2f6eb6cc0..4733797682 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -9,7 +9,7 @@ use super::*; -const CDF_LEN_MAX: usize = 16; +pub const CDF_LEN_MAX: usize = 16; #[derive(Clone)] pub struct CDFContextCheckpoint { @@ -555,12 +555,16 @@ trait CDFContextLogOps: CDFContextLogSize { // may be deferred until writes are issued. Benchmarks indicate this is // faster than first testing capacity and possibly reallocating. 
let len = log.data.len(); - debug_assert!(len + Self::CDF_LEN_MAX < log.data.capacity()); + let new_len = len + Self::CDF_LEN_MAX + 1; + let capacity = log.data.capacity(); + debug_assert!(new_len <= capacity); let dst = log.data.get_unchecked_mut(len) as *mut u16; dst.copy_from_nonoverlapping(cdf.as_ptr(), Self::CDF_LEN_MAX); *dst.add(Self::CDF_LEN_MAX) = offset as u16; - log.data.set_len(len + Self::CDF_LEN_MAX + 1); - log.data.reserve(Self::CDF_LEN_MAX + 1); + log.data.set_len(new_len); + if Self::CDF_LEN_MAX + 1 > capacity.wrapping_sub(new_len) { + log.data.reserve(Self::CDF_LEN_MAX + 1); + } } } #[inline(always)] diff --git a/src/ec.rs b/src/ec.rs index 870be98b26..c94235094f 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -524,9 +524,12 @@ where #[inline(always)] fn symbol(&mut self, s: u32, cdf: &[u16]) { debug_assert!(cdf[cdf.len() - 1] < (1 << EC_PROB_SHIFT)); - let nms = cdf.len() - s as usize; - let fl = if s > 0 { cdf[s as usize - 1] } else { 32768 }; - let fh = cdf[s as usize]; + let s = s as usize; + debug_assert!(s < cdf.len()); + // The above is stricter than the following overflow check: s <= cdf.len() + let nms = cdf.len() - s; + let fl = if s > 0 { unsafe { *cdf.get_unchecked(s - 1) } } else { 32768 }; + let fh = unsafe { *cdf.get_unchecked(s) }; debug_assert!((fh >> EC_PROB_SHIFT) <= (fl >> EC_PROB_SHIFT)); debug_assert!(fl <= 32768); self.store(fl, fh, nms as u16); @@ -888,13 +891,21 @@ impl BCodeWriter for BitWriter { pub(crate) mod rust { // Function to update the CDF for Writer calls that do so. + #[inline] pub fn update_cdf(cdf: &mut [u16], val: u32) { + use crate::context::CDF_LEN_MAX; let nsymbs = cdf.len(); - let rate = 3 + (nsymbs >> 1).min(2) + (cdf[nsymbs - 1] >> 4) as usize; - cdf[nsymbs - 1] += 1 - (cdf[nsymbs - 1] >> 5); - + let mut rate = 3 + (nsymbs >> 1).min(2); + if let Some(count) = cdf.last_mut() { + rate += (*count >> 4) as usize; + *count += 1 - (*count >> 5); + } else { + return; + } // Single loop (faster) - for (i, v) in cdf[..nsymbs - 1].iter_mut().enumerate() { + for (i, v) in + cdf[..nsymbs - 1].iter_mut().enumerate().take(CDF_LEN_MAX - 1) + { if i as u32 >= val { *v -= *v >> rate; } else { From fb0247afb04e9a7ea80253b15fef820ece88bf95 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 20:25:37 +0900 Subject: [PATCH 024/155] Split uv_mode_cdf to remove explicit slice --- src/context/block_unit.rs | 6 +- src/context/cdf_context.rs | 15 ++- src/entropymode.rs | 217 ++++++++++++++++++------------------- 3 files changed, 121 insertions(+), 117 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index 109ec24dcb..dac0e9f9b5 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -734,12 +734,12 @@ impl<'a> ContextWriter<'a> { &mut self, w: &mut dyn Writer, uv_mode: PredictionMode, y_mode: PredictionMode, bs: BlockSize, ) { - let cdf = - &mut self.fc.uv_mode_cdf[bs.cfl_allowed() as usize][y_mode as usize]; if bs.cfl_allowed() { + let cdf = &mut self.fc.uv_mode_cfl_cdf[y_mode as usize]; symbol_with_update!(self, w, uv_mode as u32, cdf); } else { - symbol_with_update!(self, w, uv_mode as u32, &mut cdf[..INTRA_MODES]); + let cdf = &mut self.fc.uv_mode_cdf[y_mode as usize]; + symbol_with_update!(self, w, uv_mode as u32, cdf); } } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 4733797682..087c19c958 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -76,8 +76,9 @@ pub struct CDFContext { pub kf_y_cdf: [[[u16; INTRA_MODES]; 
KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], + pub uv_mode_cdf: [[u16; INTRA_MODES]; INTRA_MODES], - pub uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2], + pub uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES], pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], pub inter_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], @@ -100,6 +101,7 @@ impl CDFContext { kf_y_cdf: default_kf_y_mode_cdf, y_mode_cdf: default_if_y_mode_cdf, uv_mode_cdf: default_uv_mode_cdf, + uv_mode_cfl_cdf: default_uv_mode_cfl_cdf, cfl_sign_cdf: default_cfl_sign_cdf, cfl_alpha_cdf: default_cfl_alpha_cdf, newmv_cdf: default_newmv_cdf, @@ -191,10 +193,8 @@ impl CDFContext { reset_3d!(self.kf_y_cdf); reset_2d!(self.y_mode_cdf); - for i in 0..INTRA_MODES { - self.uv_mode_cdf[0][i][UV_INTRA_MODES - 2] = 0; - self.uv_mode_cdf[1][i][UV_INTRA_MODES - 1] = 0; - } + reset_2d!(self.uv_mode_cdf); + reset_2d!(self.uv_mode_cfl_cdf); reset_1d!(self.cfl_sign_cdf); reset_2d!(self.cfl_alpha_cdf); reset_2d!(self.newmv_cdf); @@ -286,6 +286,10 @@ impl CDFContext { let uv_mode_cdf_start = self.uv_mode_cdf.first().unwrap().as_ptr() as usize; let uv_mode_cdf_end = uv_mode_cdf_start + size_of_val(&self.uv_mode_cdf); + let uv_mode_cfl_cdf_start = + self.uv_mode_cfl_cdf.first().unwrap().as_ptr() as usize; + let uv_mode_cfl_cdf_end = + uv_mode_cfl_cdf_start + size_of_val(&self.uv_mode_cfl_cdf); let cfl_sign_cdf_start = self.cfl_sign_cdf.as_ptr() as usize; let cfl_sign_cdf_end = cfl_sign_cdf_start + size_of_val(&self.cfl_sign_cdf); @@ -442,6 +446,7 @@ impl CDFContext { ("kf_y_cdf", kf_y_cdf_start, kf_y_cdf_end), ("y_mode_cdf", y_mode_cdf_start, y_mode_cdf_end), ("uv_mode_cdf", uv_mode_cdf_start, uv_mode_cdf_end), + ("uv_mode_cfl_cdf", uv_mode_cfl_cdf_start, uv_mode_cfl_cdf_end), ("cfl_sign_cdf", cfl_sign_cdf_start, cfl_sign_cdf_end), ("cfl_alpha_cdf", cfl_alpha_cdf_start, cfl_alpha_cdf_end), ("newmv_cdf", newmv_cdf_start, newmv_cdf_end), diff --git a/src/entropymode.rs b/src/entropymode.rs index 87f6c3d0c0..5197f42e22 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -182,115 +182,114 @@ pub static default_if_y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS] = [ ), ]; -pub static default_uv_mode_cdf: [[[u16; UV_INTRA_MODES]; INTRA_MODES]; 2] = [ - [ - cdf!( - 22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, 28244, 30059, - 30941, 31961, CDFMAX - ), - cdf!( - 9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, 28359, 29505, - 29800, 31796, CDFMAX - ), - cdf!( - 9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, 30764, - 31777, 32029, CDFMAX - ), - cdf!( - 13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, 28577, 30612, - 31355, 32493, CDFMAX - ), - cdf!( - 9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, 31101, 31744, - 32363, CDFMAX - ), - cdf!( - 11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, 29711, 31161, - 31441, 32550, CDFMAX - ), - cdf!( - 14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, 30245, 31837, - 32342, 32667, CDFMAX - ), - cdf!( - 12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, 29267, 30643, - 31961, 32461, CDFMAX - ), - cdf!( - 12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, 28443, 30388, - 30767, 32416, CDFMAX - ), - cdf!( - 19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, 23174, 28861, - 30379, 32175, CDFMAX - ), - cdf!( - 18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, 23527, 27053, - 31397, 32148, CDFMAX - ), - cdf!( - 17026, 19004, 19997, 20339, 
20586, 21103, 21349, 21907, 22482, 25896, - 26541, 31819, CDFMAX - ), - cdf!( - 12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, 15255, 15753, - 16039, 16606, CDFMAX - ), - ], - [ - cdf!( - 10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, 15986, 20086, - 20995, 22455, 24212 - ), - cdf!( - 4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, 22099, 24228, - 24693, 27032, 29472 - ), - cdf!( - 5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, 23138, - 24256, 24703, 26679 - ), - cdf!( - 6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, 21520, - 22206, 23389, 24182 - ), - cdf!( - 4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, 24911, 25380, - 26027, 26376 - ), - cdf!( - 5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, 24780, 25386, - 26517, 27176 - ), - cdf!( - 4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, 23188, 23763, - 24455, 24940 - ), - cdf!( - 6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, 22336, 23204, - 23964, 24793 - ), - cdf!( - 5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, 22494, 23139, - 24764, 25989 - ), - cdf!( - 10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, 15534, 20714, - 21789, 23443, 24861 - ), - cdf!( - 10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, 15902, 20102, - 22696, 23774, 25838 - ), - cdf!( - 10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, 15636, 19676, - 20474, 23519, 25208 - ), - cdf!( - 3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, 9875, 10521, - 29048 - ), - ], +pub static default_uv_mode_cdf: [[u16; INTRA_MODES]; INTRA_MODES] = [ + cdf!( + 22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, 28244, 30059, + 30941, 31961 + ), + cdf!( + 9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, 28359, 29505, + 29800, 31796 + ), + cdf!( + 9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, 30764, 31777, + 32029 + ), + cdf!( + 13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, 28577, 30612, + 31355, 32493 + ), + cdf!( + 9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, 31101, 31744, + 32363 + ), + cdf!( + 11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, 29711, 31161, + 31441, 32550 + ), + cdf!( + 14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, 30245, 31837, + 32342, 32667 + ), + cdf!( + 12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, 29267, 30643, + 31961, 32461 + ), + cdf!( + 12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, 28443, 30388, + 30767, 32416 + ), + cdf!( + 19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, 23174, 28861, + 30379, 32175 + ), + cdf!( + 18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, 23527, 27053, + 31397, 32148 + ), + cdf!( + 17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, 22482, 25896, + 26541, 31819 + ), + cdf!( + 12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, 15255, 15753, + 16039, 16606 + ), +]; + +pub static default_uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES] = [ + cdf!( + 10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, 15986, 20086, + 20995, 22455, 24212 + ), + cdf!( + 4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, 22099, 24228, + 24693, 27032, 29472 + ), + cdf!( + 5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, 23138, 24256, + 24703, 26679 + ), + cdf!( + 6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, 21520, 22206, + 23389, 24182 + ), + cdf!( + 4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, 24911, 25380, + 26027, 26376 + ), + cdf!( + 5370, 6889, 7247, 7393, 9498, 
21114, 21402, 21753, 21981, 24780, 25386, + 26517, 27176 + ), + cdf!( + 4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, 23188, 23763, + 24455, 24940 + ), + cdf!( + 6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, 22336, 23204, + 23964, 24793 + ), + cdf!( + 5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, 22494, 23139, + 24764, 25989 + ), + cdf!( + 10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, 15534, 20714, + 21789, 23443, 24861 + ), + cdf!( + 10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, 15902, 20102, + 22696, 23774, 25838 + ), + cdf!( + 10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, 15636, 19676, + 20474, 23519, 25208 + ), + cdf!( + 3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, 9875, 10521, + 29048 + ), ]; pub const default_partition_cdf: [[u16; EXT_PARTITION_TYPES]; From 35a3467b842ec6ca9c96337935ab83550d3ba545 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 20:56:42 +0900 Subject: [PATCH 025/155] Split tx_size_cdf to remove explicit slice --- src/context/cdf_context.rs | 17 ++++++++++------- src/context/transform_unit.rs | 13 +++++++------ src/entropymode.rs | 7 +++++-- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 087c19c958..fc58caa0f4 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -42,11 +42,12 @@ pub struct CDFContext { pub txb_skip_cdf: [[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES], pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], + pub tx_size_8x8_cdf: [[u16; MAX_TX_DEPTH]; TX_SIZE_CONTEXTS], pub coeff_base_eob_cdf: [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], pub lrf_switchable_cdf: [u16; 3], - pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; MAX_TX_CATS], + pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; BIG_TX_CATS], pub coeff_base_cdf: [[[[u16; 4]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], @@ -109,6 +110,7 @@ impl CDFContext { refmv_cdf: default_refmv_cdf, intra_tx_cdf: default_intra_ext_tx_cdf, inter_tx_cdf: default_inter_ext_tx_cdf, + tx_size_8x8_cdf: default_tx_size_8x8_cdf, tx_size_cdf: default_tx_size_cdf, txfm_partition_cdf: default_txfm_partition_cdf, skip_cdfs: default_skip_cdfs, @@ -211,12 +213,8 @@ impl CDFContext { self.inter_tx_cdf[3][i][1] = 0; } - for i in 0..TX_SIZE_CONTEXTS { - self.tx_size_cdf[0][i][MAX_TX_DEPTH - 1] = 0; - } - reset_2d!(self.tx_size_cdf[1]); - reset_2d!(self.tx_size_cdf[2]); - reset_2d!(self.tx_size_cdf[3]); + reset_2d!(self.tx_size_8x8_cdf); + reset_3d!(self.tx_size_cdf); for i in 0..TXFM_PARTITION_CONTEXTS { self.txfm_partition_cdf[i][1] = 0; @@ -311,6 +309,10 @@ impl CDFContext { self.inter_tx_cdf.first().unwrap().as_ptr() as usize; let inter_tx_cdf_end = inter_tx_cdf_start + size_of_val(&self.inter_tx_cdf); + let tx_size_8x8_cdf_start = + self.tx_size_8x8_cdf.first().unwrap().as_ptr() as usize; + let tx_size_8x8_cdf_end = + tx_size_8x8_cdf_start + size_of_val(&self.tx_size_8x8_cdf); let tx_size_cdf_start = self.tx_size_cdf.first().unwrap().as_ptr() as usize; let tx_size_cdf_end = tx_size_cdf_start + size_of_val(&self.tx_size_cdf); @@ -454,6 +456,7 @@ impl CDFContext { ("refmv_cdf", refmv_cdf_start, refmv_cdf_end), ("intra_tx_cdf", intra_tx_cdf_start, intra_tx_cdf_end), ("inter_tx_cdf", inter_tx_cdf_start, inter_tx_cdf_end), + ("tx_size_8x8_cdf", tx_size_8x8_cdf_start, tx_size_8x8_cdf_end), ("tx_size_cdf", 
tx_size_cdf_start, tx_size_cdf_end), ("txfm_partition_cdf", txfm_partition_cdf_start, txfm_partition_cdf_end), ("skip_cdfs", skip_cdfs_start, skip_cdfs_end), diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 3d8d4cf203..9190feb533 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -647,12 +647,13 @@ impl<'a> ContextWriter<'a> { debug_assert!(depth <= max_depths); debug_assert!(!tx_size.is_rect() || bsize.is_rect_tx_allowed()); - symbol_with_update!( - self, - w, - depth as u32, - &mut self.fc.tx_size_cdf[tx_size_cat][tx_size_ctx][..=max_depths] - ); + if tx_size_cat > 0 { + let cdf = &mut self.fc.tx_size_cdf[tx_size_cat - 1][tx_size_ctx]; + symbol_with_update!(self, w, depth as u32, cdf); + } else { + let cdf = &mut self.fc.tx_size_8x8_cdf[tx_size_ctx]; + symbol_with_update!(self, w, depth as u32, cdf); + } } // Based on https://aomediacodec.github.io/av1-spec/#cdf-selection-process diff --git a/src/entropymode.rs b/src/entropymode.rs index 5197f42e22..759fe8e03b 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -32,6 +32,7 @@ const SEG_TEMPORAL_PRED_CTXS: usize = 3; const TX_SIZE_LUMA_MIN: usize = TxSize::TX_4X4 as usize; const TX_SIZE_CTX_MIN: usize = TX_SIZE_LUMA_MIN + 1; pub const MAX_TX_CATS: usize = TxSize::TX_SIZES - TX_SIZE_CTX_MIN; +pub const BIG_TX_CATS: usize = MAX_TX_CATS - 1; // All except 8x8, which has lower max depth. pub const MAX_TX_DEPTH: usize = 2; pub const TXFM_PARTITION_CONTEXTS: usize = 21; // (TxSize::TX_SIZES - TxSize::TX_8X8) * 6 - 3; @@ -1442,9 +1443,11 @@ pub static default_spatial_pred_seg_tree_cdf: [[u16; MAX_SEGMENTS]; cdf!(27527, 28487, 28723, 28890, 32397, 32647, 32679), ]; +pub static default_tx_size_8x8_cdf: [[u16; MAX_TX_DEPTH]; TX_SIZE_CONTEXTS] = + [cdf!(19968), cdf!(19968), cdf!(24320)]; + pub static default_tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; - MAX_TX_CATS] = [ - [cdf!(19968, CDFMAX), cdf!(19968, CDFMAX), cdf!(24320, CDFMAX)], + BIG_TX_CATS] = [ [cdf!(12272, 30172), cdf!(12272, 30172), cdf!(18677, 30848)], [cdf!(12986, 15180), cdf!(12986, 15180), cdf!(24302, 25602)], [cdf!(5782, 11475), cdf!(5782, 11475), cdf!(16803, 22759)], From aa83ef60004441e39d8f43921618bb3e72effe12 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 21:42:00 +0900 Subject: [PATCH 026/155] Split partition_cdf to remove explicit slice --- src/context/cdf_context.rs | 28 +++++++++----- src/context/partition_unit.rs | 72 +++++++++++++++++++++++++++-------- src/entropymode.rs | 24 +++++++----- 3 files changed, 89 insertions(+), 35 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index fc58caa0f4..d51449c468 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -55,6 +55,7 @@ pub struct CDFContext { [[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1], pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT], + pub partition_w8_cdf: [[u16; 4]; PARTITION_TYPES], pub eob_flag_cdf16: [[[u16; 5]; 2]; PLANE_TYPES], @@ -67,11 +68,12 @@ pub struct CDFContext { pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], pub eob_flag_cdf128: [[[u16; 8]; 2]; PLANE_TYPES], pub spatial_segmentation_cdfs: [[u16; 8]; 3], + pub partition_w128_cdf: [[u16; 8]; PARTITION_TYPES], pub eob_flag_cdf256: [[[u16; 9]; 2]; PLANE_TYPES], pub eob_flag_cdf512: [[[u16; 10]; 2]; PLANE_TYPES], - pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; 
PARTITION_CONTEXTS], + pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; 3 * PARTITION_TYPES], pub eob_flag_cdf1024: [[[u16; 11]; 2]; PLANE_TYPES], @@ -98,6 +100,8 @@ impl CDFContext { _ => 3, }; CDFContext { + partition_w8_cdf: default_partition_w8_cdf, + partition_w128_cdf: default_partition_w128_cdf, partition_cdf: default_partition_cdf, kf_y_cdf: default_kf_y_mode_cdf, y_mode_cdf: default_if_y_mode_cdf, @@ -182,15 +186,9 @@ impl CDFContext { }; } - for i in 0..4 { - self.partition_cdf[i][3] = 0; - } - for i in 4..16 { - self.partition_cdf[i][9] = 0; - } - for i in 16..20 { - self.partition_cdf[i][7] = 0; - } + reset_2d!(self.partition_w8_cdf); + reset_2d!(self.partition_w128_cdf); + reset_2d!(self.partition_cdf); reset_3d!(self.kf_y_cdf); reset_2d!(self.y_mode_cdf); @@ -273,6 +271,14 @@ impl CDFContext { pub fn build_map(&self) -> Vec<(&'static str, usize, usize)> { use std::mem::size_of_val; + let partition_w8_cdf_start = + self.partition_w8_cdf.first().unwrap().as_ptr() as usize; + let partition_w8_cdf_end = + partition_w8_cdf_start + size_of_val(&self.partition_w8_cdf); + let partition_w128_cdf_start = + self.partition_w128_cdf.first().unwrap().as_ptr() as usize; + let partition_w128_cdf_end = + partition_w128_cdf_start + size_of_val(&self.partition_w128_cdf); let partition_cdf_start = self.partition_cdf.first().unwrap().as_ptr() as usize; let partition_cdf_end = @@ -444,6 +450,8 @@ impl CDFContext { coeff_br_cdf_start + size_of_val(&self.coeff_br_cdf); vec![ + ("partition_w8_cdf", partition_w8_cdf_start, partition_w8_cdf_end), + ("partition_w128_cdf", partition_w128_cdf_start, partition_w128_cdf_end), ("partition_cdf", partition_cdf_start, partition_cdf_end), ("kf_y_cdf", kf_y_cdf_start, kf_y_cdf_end), ("y_mode_cdf", y_mode_cdf_start, y_mode_cdf_end), diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index 471a73b8f3..2ea629986c 100644 --- a/src/context/partition_unit.rs +++ b/src/context/partition_unit.rs @@ -306,18 +306,22 @@ impl<'a> ContextWriter<'a> { let has_rows = (bo.0.y + hbs) < self.bc.blocks.rows(); let ctx = self.bc.partition_plane_context(bo, bsize); assert!(ctx < PARTITION_CONTEXTS); - let partition_cdf = if bsize <= BlockSize::BLOCK_8X8 { - &mut self.fc.partition_cdf[ctx][..PARTITION_TYPES] - } else { - &mut self.fc.partition_cdf[ctx] - }; if !has_rows && !has_cols { return; } if has_rows && has_cols { - symbol_with_update!(self, w, p as u32, partition_cdf); + if ctx < PARTITION_TYPES { + let cdf = &mut self.fc.partition_w8_cdf[ctx]; + symbol_with_update!(self, w, p as u32, cdf); + } else if ctx < 4 * PARTITION_TYPES { + let cdf = &mut self.fc.partition_cdf[ctx - PARTITION_TYPES]; + symbol_with_update!(self, w, p as u32, cdf); + } else { + let cdf = &mut self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; + symbol_with_update!(self, w, p as u32, cdf); + } } else if !has_rows && has_cols { assert!( p == PartitionType::PARTITION_SPLIT @@ -325,11 +329,29 @@ impl<'a> ContextWriter<'a> { ); assert!(bsize > BlockSize::BLOCK_8X8); let mut cdf = [0u16; 2]; - ContextWriter::partition_gather_vert_alike( - &mut cdf, - partition_cdf, - bsize, - ); + if ctx < PARTITION_TYPES { + let partition_cdf = &mut self.fc.partition_w8_cdf[ctx]; + ContextWriter::partition_gather_vert_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } else if ctx < 4 * PARTITION_TYPES { + let partition_cdf = &mut self.fc.partition_cdf[ctx - PARTITION_TYPES]; + ContextWriter::partition_gather_vert_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } else { + let partition_cdf = + 
&mut self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; + ContextWriter::partition_gather_vert_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } w.symbol((p == PartitionType::PARTITION_SPLIT) as u32, &cdf); } else { assert!( @@ -338,11 +360,29 @@ impl<'a> ContextWriter<'a> { ); assert!(bsize > BlockSize::BLOCK_8X8); let mut cdf = [0u16; 2]; - ContextWriter::partition_gather_horz_alike( - &mut cdf, - partition_cdf, - bsize, - ); + if ctx < PARTITION_TYPES { + let partition_cdf = &mut self.fc.partition_w8_cdf[ctx]; + ContextWriter::partition_gather_horz_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } else if ctx < 4 * PARTITION_TYPES { + let partition_cdf = &mut self.fc.partition_cdf[ctx - PARTITION_TYPES]; + ContextWriter::partition_gather_horz_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } else { + let partition_cdf = + &mut self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; + ContextWriter::partition_gather_horz_alike( + &mut cdf, + partition_cdf, + bsize, + ); + } w.symbol((p == PartitionType::PARTITION_SPLIT) as u32, &cdf); } } diff --git a/src/entropymode.rs b/src/entropymode.rs index 759fe8e03b..42538ee4cc 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -293,12 +293,15 @@ pub static default_uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES] = [ ), ]; +pub const default_partition_w8_cdf: [[u16; 4]; PARTITION_TYPES] = [ + cdf!(19132, 25510, 30392), + cdf!(13928, 19855, 28540), + cdf!(12522, 23679, 28629), + cdf!(9896, 18783, 25853), +]; + pub const default_partition_cdf: [[u16; EXT_PARTITION_TYPES]; - PARTITION_CONTEXTS] = [ - cdf!(19132, 25510, 30392, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), - cdf!(13928, 19855, 28540, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), - cdf!(12522, 23679, 28629, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), - cdf!(9896, 18783, 25853, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX), + 3 * PARTITION_TYPES] = [ cdf!(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902), cdf!(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834), cdf!(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117), @@ -311,10 +314,13 @@ pub const default_partition_cdf: [[u16; EXT_PARTITION_TYPES]; cdf!(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104), cdf!(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238), cdf!(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332), - cdf!(27899, 28219, 28529, 32484, 32539, 32619, 32639, CDFMAX, CDFMAX), - cdf!(6607, 6990, 8268, 32060, 32219, 32338, 32371, CDFMAX, CDFMAX), - cdf!(5429, 6676, 7122, 32027, 32227, 32531, 32582, CDFMAX, CDFMAX), - cdf!(711, 966, 1172, 32448, 32538, 32617, 32664, CDFMAX, CDFMAX), +]; + +pub const default_partition_w128_cdf: [[u16; 8]; PARTITION_TYPES] = [ + cdf!(27899, 28219, 28529, 32484, 32539, 32619, 32639), + cdf!(6607, 6990, 8268, 32060, 32219, 32338, 32371), + cdf!(5429, 6676, 7122, 32027, 32227, 32531, 32582), + cdf!(711, 966, 1172, 32448, 32538, 32617, 32664), ]; pub static default_intra_ext_tx_cdf: [[[[u16; TX_TYPES]; INTRA_MODES]; From 9e836318c5e27232b2c12e90984954abaa2913e2 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 22:06:39 +0900 Subject: [PATCH 027/155] Split inter_tx_cdf to remove explicit slice --- src/context/cdf_context.rs | 36 ++++++++++---- src/context/transform_unit.rs | 18 ++++--- src/entropymode.rs | 92 +++++++++++++---------------------- 3 files changed, 72 insertions(+), 74 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 
d51449c468..6a42ac2a79 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -43,6 +43,7 @@ pub struct CDFContext { pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], pub tx_size_8x8_cdf: [[u16; MAX_TX_DEPTH]; TX_SIZE_CONTEXTS], + pub inter_tx_3_cdf: [[u16; 2]; TX_SIZE_SQR_CONTEXTS], pub coeff_base_eob_cdf: [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], @@ -77,6 +78,8 @@ pub struct CDFContext { pub eob_flag_cdf1024: [[[u16; 11]; 2]; PLANE_TYPES], + pub inter_tx_2_cdf: [[u16; 12]; TX_SIZE_SQR_CONTEXTS], + pub kf_y_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], pub uv_mode_cdf: [[u16; INTRA_MODES]; INTRA_MODES], @@ -84,7 +87,7 @@ pub struct CDFContext { pub uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES], pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], - pub inter_tx_cdf: [[[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER], + pub inter_tx_1_cdf: [[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS], pub intra_tx_cdf: [[[[u16; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], @@ -113,7 +116,9 @@ impl CDFContext { zeromv_cdf: default_zeromv_cdf, refmv_cdf: default_refmv_cdf, intra_tx_cdf: default_intra_ext_tx_cdf, - inter_tx_cdf: default_inter_ext_tx_cdf, + inter_tx_3_cdf: default_inter_tx_3_cdf, + inter_tx_2_cdf: default_inter_tx_2_cdf, + inter_tx_1_cdf: default_inter_tx_1_cdf, tx_size_8x8_cdf: default_tx_size_8x8_cdf, tx_size_cdf: default_tx_size_cdf, txfm_partition_cdf: default_txfm_partition_cdf, @@ -206,11 +211,12 @@ impl CDFContext { self.intra_tx_cdf[1][i][j][6] = 0; self.intra_tx_cdf[2][i][j][4] = 0; } - self.inter_tx_cdf[1][i][15] = 0; - self.inter_tx_cdf[2][i][11] = 0; - self.inter_tx_cdf[3][i][1] = 0; } + reset_2d!(self.inter_tx_3_cdf); + reset_2d!(self.inter_tx_2_cdf); + reset_2d!(self.inter_tx_1_cdf); + reset_2d!(self.tx_size_8x8_cdf); reset_3d!(self.tx_size_cdf); @@ -311,10 +317,18 @@ impl CDFContext { self.intra_tx_cdf.first().unwrap().as_ptr() as usize; let intra_tx_cdf_end = intra_tx_cdf_start + size_of_val(&self.intra_tx_cdf); - let inter_tx_cdf_start = - self.inter_tx_cdf.first().unwrap().as_ptr() as usize; - let inter_tx_cdf_end = - inter_tx_cdf_start + size_of_val(&self.inter_tx_cdf); + let inter_tx_3_cdf_start = + self.inter_tx_3_cdf.first().unwrap().as_ptr() as usize; + let inter_tx_3_cdf_end = + inter_tx_3_cdf_start + size_of_val(&self.inter_tx_3_cdf); + let inter_tx_2_cdf_start = + self.inter_tx_2_cdf.first().unwrap().as_ptr() as usize; + let inter_tx_2_cdf_end = + inter_tx_2_cdf_start + size_of_val(&self.inter_tx_2_cdf); + let inter_tx_1_cdf_start = + self.inter_tx_1_cdf.first().unwrap().as_ptr() as usize; + let inter_tx_1_cdf_end = + inter_tx_1_cdf_start + size_of_val(&self.inter_tx_1_cdf); let tx_size_8x8_cdf_start = self.tx_size_8x8_cdf.first().unwrap().as_ptr() as usize; let tx_size_8x8_cdf_end = @@ -463,7 +477,9 @@ impl CDFContext { ("zeromv_cdf", zeromv_cdf_start, zeromv_cdf_end), ("refmv_cdf", refmv_cdf_start, refmv_cdf_end), ("intra_tx_cdf", intra_tx_cdf_start, intra_tx_cdf_end), - ("inter_tx_cdf", inter_tx_cdf_start, inter_tx_cdf_end), + ("inter_tx_3_cdf", inter_tx_3_cdf_start, inter_tx_3_cdf_end), + ("inter_tx_2_cdf", inter_tx_2_cdf_start, inter_tx_2_cdf_end), + ("inter_tx_1_cdf", inter_tx_1_cdf_start, inter_tx_1_cdf_end), ("tx_size_8x8_cdf", tx_size_8x8_cdf_start, tx_size_8x8_cdf_end), ("tx_size_cdf", tx_size_cdf_start, tx_size_cdf_end), 
("txfm_partition_cdf", txfm_partition_cdf_start, txfm_partition_cdf_end), diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 9190feb533..f85406afc0 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -538,13 +538,17 @@ impl<'a> ContextWriter<'a> { assert!(av1_tx_used[tx_set as usize][tx_type as usize] != 0); if is_inter { - symbol_with_update!( - self, - w, - av1_tx_ind[tx_set as usize][tx_type as usize] as u32, - &mut self.fc.inter_tx_cdf[tx_set_index as usize] - [square_tx_size as usize][..num_tx_set[tx_set as usize]] - ); + let s = av1_tx_ind[tx_set as usize][tx_type as usize] as u32; + if tx_set_index == 1 { + let cdf = &mut self.fc.inter_tx_1_cdf[square_tx_size as usize]; + symbol_with_update!(self, w, s, cdf); + } else if tx_set_index == 2 { + let cdf = &mut self.fc.inter_tx_2_cdf[square_tx_size as usize]; + symbol_with_update!(self, w, s, cdf); + } else { + let cdf = &mut self.fc.inter_tx_3_cdf[square_tx_size as usize]; + symbol_with_update!(self, w, s, cdf); + } } else { let intra_dir = y_mode; // TODO: Once use_filter_intra is enabled, diff --git a/src/entropymode.rs b/src/entropymode.rs index 42538ee4cc..5525fb6291 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -764,65 +764,43 @@ pub static default_intra_ext_tx_cdf: [[[[u16; TX_TYPES]; INTRA_MODES]; ], ]; -pub static default_inter_ext_tx_cdf: [[[u16; TX_TYPES]; - TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTER] = [ - [[0; TX_TYPES]; TX_SIZE_SQR_CONTEXTS], - [ - cdf!( - 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, 22848, 23934, - 25474, 27727, 28915, 30631 - ), - cdf!( - 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, 20408, 22517, - 25010, 27116, 28856, 30749 - ), - cdf!( - 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, - 24576, 26624, 28672, 30720 - ), - cdf!( - 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, - 24576, 26624, 28672, 30720 - ), - ], - [ - cdf!( - 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, - 30037, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, - 30037, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, 28526, 30529, - CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, - 30037, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 16384, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4167, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1998, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 748, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], +pub static default_inter_tx_1_cdf: [[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS] = [ + cdf!( + 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, 22848, 23934, + 25474, 27727, 28915, 30631 + ), + cdf!( + 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, 20408, 22517, + 25010, 27116, 28856, 30749 + ), + cdf!( + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, + 24576, 26624, 28672, 30720 + ), + cdf!( + 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, + 24576, 26624, 28672, 
30720 + ), ]; +pub static default_inter_tx_2_cdf: [[u16; 12]; TX_SIZE_SQR_CONTEXTS] = [ + cdf!( + 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037 + ), + cdf!( + 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037 + ), + cdf!( + 770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, 28526, 30529 + ), + cdf!( + 2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037 + ), +]; + +pub static default_inter_tx_3_cdf: [[u16; 2]; TX_SIZE_SQR_CONTEXTS] = + [cdf!(16384), cdf!(4167), cdf!(1998), cdf!(748)]; + pub static default_cfl_sign_cdf: [u16; CFL_JOINT_SIGNS] = cdf!(1418, 2123, 13340, 18405, 26972, 28343, 32294); From 36e115eaefdd04ba4ade87420b9b7e7280183e0f Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 22:57:39 +0900 Subject: [PATCH 028/155] Split intra_tx_cdf to remove explicit slice --- src/context/cdf_context.rs | 30 +- src/context/transform_unit.rs | 18 +- src/entropymode.rs | 557 ++++++++-------------------------- 3 files changed, 148 insertions(+), 457 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 6a42ac2a79..9c58fa70a5 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -59,11 +59,13 @@ pub struct CDFContext { pub partition_w8_cdf: [[u16; 4]; PARTITION_TYPES], pub eob_flag_cdf16: [[[u16; 5]; 2]; PLANE_TYPES], + pub intra_tx_2_cdf: [[[u16; 5]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], pub eob_flag_cdf32: [[[u16; 6]; 2]; PLANE_TYPES], pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES], pub eob_flag_cdf64: [[[u16; 7]; 2]; PLANE_TYPES], + pub intra_tx_1_cdf: [[[u16; 7]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS], pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], @@ -88,8 +90,6 @@ pub struct CDFContext { pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], pub inter_tx_1_cdf: [[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS], - pub intra_tx_cdf: - [[[[u16; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA], pub nmv_context: NMVContext, } @@ -115,7 +115,8 @@ impl CDFContext { newmv_cdf: default_newmv_cdf, zeromv_cdf: default_zeromv_cdf, refmv_cdf: default_refmv_cdf, - intra_tx_cdf: default_intra_ext_tx_cdf, + intra_tx_2_cdf: default_intra_tx_2_cdf, + intra_tx_1_cdf: default_intra_tx_1_cdf, inter_tx_3_cdf: default_inter_tx_3_cdf, inter_tx_2_cdf: default_inter_tx_2_cdf, inter_tx_1_cdf: default_inter_tx_1_cdf, @@ -206,12 +207,8 @@ impl CDFContext { reset_2d!(self.zeromv_cdf); reset_2d!(self.refmv_cdf); - for i in 0..TX_SIZE_SQR_CONTEXTS { - for j in 0..INTRA_MODES { - self.intra_tx_cdf[1][i][j][6] = 0; - self.intra_tx_cdf[2][i][j][4] = 0; - } - } + reset_3d!(self.intra_tx_2_cdf); + reset_3d!(self.intra_tx_1_cdf); reset_2d!(self.inter_tx_3_cdf); reset_2d!(self.inter_tx_2_cdf); @@ -313,10 +310,14 @@ impl CDFContext { let zeromv_cdf_end = zeromv_cdf_start + size_of_val(&self.zeromv_cdf); let refmv_cdf_start = self.refmv_cdf.first().unwrap().as_ptr() as usize; let refmv_cdf_end = refmv_cdf_start + size_of_val(&self.refmv_cdf); - let intra_tx_cdf_start = - self.intra_tx_cdf.first().unwrap().as_ptr() as usize; - let intra_tx_cdf_end = - intra_tx_cdf_start + size_of_val(&self.intra_tx_cdf); + let intra_tx_2_cdf_start = + self.intra_tx_2_cdf.first().unwrap().as_ptr() as usize; + let intra_tx_2_cdf_end = + intra_tx_2_cdf_start + size_of_val(&self.intra_tx_2_cdf); + let intra_tx_1_cdf_start = + self.intra_tx_1_cdf.first().unwrap().as_ptr() as usize; + let 
intra_tx_1_cdf_end = + intra_tx_1_cdf_start + size_of_val(&self.intra_tx_1_cdf); let inter_tx_3_cdf_start = self.inter_tx_3_cdf.first().unwrap().as_ptr() as usize; let inter_tx_3_cdf_end = @@ -476,7 +477,8 @@ impl CDFContext { ("newmv_cdf", newmv_cdf_start, newmv_cdf_end), ("zeromv_cdf", zeromv_cdf_start, zeromv_cdf_end), ("refmv_cdf", refmv_cdf_start, refmv_cdf_end), - ("intra_tx_cdf", intra_tx_cdf_start, intra_tx_cdf_end), + ("intra_tx_2_cdf", intra_tx_2_cdf_start, intra_tx_2_cdf_end), + ("intra_tx_1_cdf", intra_tx_1_cdf_start, intra_tx_1_cdf_end), ("inter_tx_3_cdf", inter_tx_3_cdf_start, inter_tx_3_cdf_end), ("inter_tx_2_cdf", inter_tx_2_cdf_start, inter_tx_2_cdf_end), ("inter_tx_1_cdf", inter_tx_1_cdf_start, inter_tx_1_cdf_end), diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index f85406afc0..3f76a1d4f6 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -555,14 +555,16 @@ impl<'a> ContextWriter<'a> { // intra_dir = // fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; - symbol_with_update!( - self, - w, - av1_tx_ind[tx_set as usize][tx_type as usize] as u32, - &mut self.fc.intra_tx_cdf[tx_set_index as usize] - [square_tx_size as usize][intra_dir as usize] - [..num_tx_set[tx_set as usize]] - ); + let s = av1_tx_ind[tx_set as usize][tx_type as usize] as u32; + if tx_set_index == 1 { + let cdf = &mut self.fc.intra_tx_1_cdf[square_tx_size as usize] + [intra_dir as usize]; + symbol_with_update!(self, w, s, cdf); + } else { + let cdf = &mut self.fc.intra_tx_2_cdf[square_tx_size as usize] + [intra_dir as usize]; + symbol_with_update!(self, w, s, cdf); + } } } } diff --git a/src/entropymode.rs b/src/entropymode.rs index 5525fb6291..1efbfe9caa 100644 --- a/src/entropymode.rs +++ b/src/entropymode.rs @@ -323,444 +323,131 @@ pub const default_partition_w128_cdf: [[u16; 8]; PARTITION_TYPES] = [ cdf!(711, 966, 1172, 32448, 32538, 32617, 32664), ]; -pub static default_intra_ext_tx_cdf: [[[[u16; TX_TYPES]; INTRA_MODES]; - TX_SIZE_SQR_CONTEXTS]; TX_SETS_INTRA] = [ - [[[0; TX_TYPES]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], +pub static default_intra_tx_1_cdf: [[[u16; 7]; INTRA_MODES]; + TX_SIZE_SQR_CONTEXTS] = [ [ - [ - cdf!( - 1535, 8035, 9461, 12751, 23467, 27825, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 564, 3335, 9709, 10870, 18143, 28094, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 672, 3247, 3676, 11982, 19415, 23127, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 5279, 13885, 15487, 18044, 23527, 30252, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4423, 6074, 7985, 10416, 25693, 29298, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1486, 4241, 9460, 10662, 16456, 27694, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 439, 2838, 3522, 6737, 18058, 23754, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1190, 4233, 4855, 11670, 20281, 24377, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1045, 4312, 8647, 10159, 18644, 29335, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 202, 3734, 4747, 7298, 17127, 24016, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 447, 4312, 6819, 8884, 16010, 23858, CDFMAX, CDFMAX, CDFMAX, 
CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 277, 4369, 5255, 8905, 16465, 22271, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 3409, 5436, 10599, 15599, 19687, 24040, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 1870, 13742, 14530, 16498, 23770, 27698, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 326, 8796, 14632, 15079, 19272, 27486, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 484, 7576, 7712, 14443, 19159, 22591, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1126, 15340, 15895, 17023, 20896, 30279, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 655, 4854, 5249, 5913, 22099, 27138, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1299, 6458, 8885, 9290, 14851, 25497, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 311, 5295, 5552, 6885, 16107, 22672, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 883, 8059, 8270, 11258, 17289, 21549, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 741, 7580, 9318, 10345, 16688, 29046, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 110, 7406, 7915, 9195, 16041, 23329, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 363, 7974, 9357, 10673, 15629, 24474, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 153, 7647, 8112, 9936, 15307, 19996, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 3511, 6332, 11165, 15335, 19323, 23594, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, 
CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 4681, 9362, 14043, 18725, 23406, 28087, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], + cdf!(1535, 8035, 9461, 12751, 23467, 27825), + cdf!(564, 3335, 9709, 10870, 18143, 28094), + cdf!(672, 3247, 3676, 11982, 19415, 23127), + cdf!(5279, 13885, 15487, 18044, 23527, 30252), + cdf!(4423, 6074, 7985, 10416, 25693, 29298), + cdf!(1486, 4241, 9460, 10662, 16456, 27694), + cdf!(439, 2838, 3522, 6737, 18058, 23754), + cdf!(1190, 4233, 4855, 11670, 20281, 24377), + cdf!(1045, 4312, 8647, 10159, 18644, 29335), + cdf!(202, 3734, 4747, 7298, 17127, 24016), + cdf!(447, 4312, 6819, 8884, 16010, 23858), + cdf!(277, 4369, 5255, 8905, 16465, 22271), + cdf!(3409, 5436, 10599, 15599, 19687, 24040), ], [ - [ - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - 
cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 1127, 12814, 22772, 27483, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 145, 6761, 11980, 26667, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 362, 5887, 11678, 16725, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 385, 15213, 18587, 30693, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 25, 2914, 23134, 27903, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 60, 4470, 11749, 23991, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 37, 3332, 14511, 21448, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 157, 6320, 13036, 17439, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 119, 6719, 12906, 29396, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 47, 5537, 12576, 21499, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 269, 6076, 
11258, 23115, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 83, 5615, 12001, 17228, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 1968, 5556, 12023, 18547, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], - [ - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - cdf!( - 6554, 13107, 19661, 26214, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, - CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX, CDFMAX - ), - ], + cdf!(1870, 13742, 14530, 16498, 23770, 27698), + cdf!(326, 8796, 14632, 15079, 19272, 27486), + cdf!(484, 7576, 7712, 14443, 19159, 22591), + cdf!(1126, 15340, 15895, 17023, 20896, 30279), + cdf!(655, 4854, 5249, 5913, 22099, 27138), + cdf!(1299, 6458, 8885, 9290, 14851, 25497), + cdf!(311, 5295, 5552, 6885, 16107, 22672), + cdf!(883, 8059, 8270, 11258, 17289, 21549), + cdf!(741, 7580, 9318, 10345, 16688, 29046), + cdf!(110, 7406, 7915, 9195, 16041, 23329), + cdf!(363, 7974, 9357, 10673, 15629, 24474), + cdf!(153, 7647, 8112, 9936, 15307, 19996), + cdf!(3511, 6332, 11165, 15335, 19323, 23594), + ], + [ + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + ], + [ + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 
23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + cdf!(4681, 9362, 14043, 18725, 23406, 28087), + ], +]; + +pub static default_intra_tx_2_cdf: [[[u16; 5]; INTRA_MODES]; + TX_SIZE_SQR_CONTEXTS] = [ + [ + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + ], + [ + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + ], + [ + cdf!(1127, 12814, 22772, 27483), + cdf!(145, 6761, 11980, 26667), + cdf!(362, 5887, 11678, 16725), + cdf!(385, 15213, 18587, 30693), + cdf!(25, 2914, 23134, 27903), + cdf!(60, 4470, 11749, 23991), + cdf!(37, 3332, 14511, 21448), + cdf!(157, 6320, 13036, 17439), + cdf!(119, 6719, 12906, 29396), + cdf!(47, 5537, 12576, 21499), + cdf!(269, 6076, 11258, 23115), + cdf!(83, 5615, 12001, 17228), + cdf!(1968, 5556, 12023, 18547), + ], + [ + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), + cdf!(6554, 13107, 19661, 26214), ], ]; From d8f26bcaeca7ca888141704aa652d89b2490f256 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 16 Feb 2021 23:09:03 +0900 Subject: [PATCH 029/155] Use reset_2d macro to reset txfm_partition_cdf --- src/context/cdf_context.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 9c58fa70a5..c9eeeebdda 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -217,9 +217,7 @@ impl CDFContext { reset_2d!(self.tx_size_8x8_cdf); reset_3d!(self.tx_size_cdf); - for i in 0..TXFM_PARTITION_CONTEXTS { - self.txfm_partition_cdf[i][1] = 0; - } + reset_2d!(self.txfm_partition_cdf); reset_2d!(self.skip_cdfs); reset_2d!(self.intra_inter_cdfs); From 94de371ee22c5ec1fd0cd3f496af5aa918540a7f Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 18 Feb 2021 00:23:40 +0900 Subject: [PATCH 030/155] Remove early coercion to slice of eob_flag_cdfNNNN --- src/context/block_unit.rs | 42 +++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index dac0e9f9b5..7e791ab992 100644 --- a/src/context/block_unit.rs +++ 
b/src/context/block_unit.rs @@ -1875,20 +1875,36 @@ impl<'a> ContextWriter<'a> { let eob_multi_size: usize = tx_size.area_log2() - 4; let eob_multi_ctx: usize = if tx_class == TX_CLASS_2D { 0 } else { 1 }; - symbol_with_update!( - self, - w, - eob_pt - 1, - match eob_multi_size { - 0 => &mut self.fc.eob_flag_cdf16[plane_type][eob_multi_ctx], - 1 => &mut self.fc.eob_flag_cdf32[plane_type][eob_multi_ctx], - 2 => &mut self.fc.eob_flag_cdf64[plane_type][eob_multi_ctx], - 3 => &mut self.fc.eob_flag_cdf128[plane_type][eob_multi_ctx], - 4 => &mut self.fc.eob_flag_cdf256[plane_type][eob_multi_ctx], - 5 => &mut self.fc.eob_flag_cdf512[plane_type][eob_multi_ctx], - _ => &mut self.fc.eob_flag_cdf1024[plane_type][eob_multi_ctx], + match eob_multi_size { + 0 => { + let cdf = &mut self.fc.eob_flag_cdf16[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); } - ); + 1 => { + let cdf = &mut self.fc.eob_flag_cdf32[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + 2 => { + let cdf = &mut self.fc.eob_flag_cdf64[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + 3 => { + let cdf = &mut self.fc.eob_flag_cdf128[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + 4 => { + let cdf = &mut self.fc.eob_flag_cdf256[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + 5 => { + let cdf = &mut self.fc.eob_flag_cdf512[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + _ => { + let cdf = &mut self.fc.eob_flag_cdf1024[plane_type][eob_multi_ctx]; + symbol_with_update!(self, w, eob_pt - 1, cdf); + } + } let eob_offset_bits = k_eob_offset_bits[eob_pt as usize]; From 16362843c0fd80d3e9eb00d245d0243343ce1b84 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 18 Feb 2021 02:44:35 +0900 Subject: [PATCH 031/155] Add variant of symbol_with_update for 4-symbol CDFs --- src/context/block_unit.rs | 24 ++++++++---------------- src/context/cdf_context.rs | 7 +++++++ src/context/mod.rs | 2 +- src/context/partition_unit.rs | 2 +- src/ec.rs | 24 ++++++++++++++++++++---- 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index 7e791ab992..a9630f0cec 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -1744,12 +1744,8 @@ impl<'a> ContextWriter<'a> { MotionVector { row: mv.row - ref_mv.row, col: mv.col - ref_mv.col }; let j: MvJointType = av1_get_mv_joint(diff); - symbol_with_update!( - self, - w, - j as u32, - &mut self.fc.nmv_context.joints_cdf - ); + let cdf = &mut self.fc.nmv_context.joints_cdf; + symbol_with_update!(self, w, j as u32, cdf, 4); if mv_joint_vertical(j) { self.encode_mv_component(w, diff.row as i32, 0, mv_precision); @@ -1776,7 +1772,7 @@ impl<'a> ContextWriter<'a> { for (&delta, cdf) in deltas.iter().zip(cdfs.iter_mut()) { let abs = delta.abs() as u32; - symbol_with_update!(self, w, cmp::min(abs, DELTA_LF_SMALL), cdf); + symbol_with_update!(self, w, cmp::min(abs, DELTA_LF_SMALL), cdf, 4); if abs >= DELTA_LF_SMALL { let bits = msb(abs as i32 - 1) as u32; @@ -1957,7 +1953,8 @@ impl<'a> ContextWriter<'a> { self, w, (cmp::min(u32::cast_from(level), 3)) as u32, - &mut self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx as usize] + &mut self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx as usize], + 4 ); } @@ -1971,14 +1968,9 @@ impl<'a> ContextWriter<'a> { break; } let k = cmp::min(base_range - idx, T::cast_from(BR_CDF_SIZE - 1)); - 
symbol_with_update!( - self, - w, - u32::cast_from(k), - &mut self.fc.coeff_br_cdf - [cmp::min(txs_ctx, TxSize::TX_32X32 as usize)][plane_type] - [br_ctx] - ); + let cdf = &mut self.fc.coeff_br_cdf + [txs_ctx.min(TxSize::TX_32X32 as usize)][plane_type][br_ctx]; + symbol_with_update!(self, w, u32::cast_from(k), cdf, 4); if k < T::cast_from(BR_CDF_SIZE - 1) { break; } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index c9eeeebdda..5aeef45df6 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -546,6 +546,9 @@ impl fmt::Debug for CDFContext { macro_rules! symbol_with_update { ($self:ident, $w:ident, $s:expr, $cdf:expr) => { $w.symbol_with_update($s, $cdf, &mut $self.fc_log); + symbol_with_update!($self, $cdf); + }; + ($self:ident, $cdf:expr) => { #[cfg(feature = "desync_finder")] { let cdf: &[_] = $cdf; @@ -554,6 +557,10 @@ macro_rules! symbol_with_update { } } }; + ($self:ident, $w:ident, $s:expr, $cdf:expr, 4) => { + $w.symbol_with_update_4($s, $cdf, &mut $self.fc_log); + symbol_with_update!($self, $cdf); + }; } #[derive(Clone)] diff --git a/src/context/mod.rs b/src/context/mod.rs index bb183a969e..f34c3fafe2 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -221,7 +221,7 @@ impl<'a> ContextWriter<'a> { } else { &mut mvcomp.fp_cdf }; - symbol_with_update!(self, w, fr, cdf); + symbol_with_update!(self, w, fr, cdf, 4); } // High precision bit diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index 2ea629986c..d6d1291a46 100644 --- a/src/context/partition_unit.rs +++ b/src/context/partition_unit.rs @@ -314,7 +314,7 @@ impl<'a> ContextWriter<'a> { if has_rows && has_cols { if ctx < PARTITION_TYPES { let cdf = &mut self.fc.partition_w8_cdf[ctx]; - symbol_with_update!(self, w, p as u32, cdf); + symbol_with_update!(self, w, p as u32, cdf, 4); } else if ctx < 4 * PARTITION_TYPES { let cdf = &mut self.fc.partition_cdf[ctx - PARTITION_TYPES]; symbol_with_update!(self, w, p as u32, cdf); diff --git a/src/ec.rs b/src/ec.rs index c94235094f..7936ba105f 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -18,6 +18,7 @@ cfg_if::cfg_if! { } } +use crate::context::CDFContextLog; use crate::util::{msb, ILog}; use bitstream_io::{BigEndian, BitWrite, BitWriter}; use std::io; @@ -27,6 +28,12 @@ const EC_PROB_SHIFT: u32 = 6; const EC_MIN_PROB: u32 = 4; type ec_window = u32; +macro_rules! symbol_with_update_decl {($($n:expr),*) => {$(paste::item!{ + fn []( + &mut self, s: u32, cdf: &mut [u16; $n], log: &mut CDFContextLog, + ); +})*}} + /// Public trait interface to a bitstream Writer: a Counter can be /// used to count bits for cost analysis without actually storing /// anything (using a new::WriterCounter() as a Writer), to record @@ -43,9 +50,9 @@ pub trait Writer { fn symbol_bits(&self, s: u32, cdf: &[u16]) -> u32; /// Write a symbol s, using the passed in cdf reference; updates the referenced cdf. fn symbol_with_update( - &mut self, s: u32, cdf: &mut [u16], - log: &mut crate::context::CDFContextLog, + &mut self, s: u32, cdf: &mut [u16], log: &mut CDFContextLog, ); + symbol_with_update_decl!(4); /// Write a bool using passed in probability fn bool(&mut self, val: bool, f: u16); /// Write a single bit with flat proability @@ -482,6 +489,14 @@ impl WriterBase { } } +macro_rules! 
symbol_with_update_impl {($($n:expr),*) => {$(paste::item!{ + fn []( + &mut self, s: u32, cdf: &mut [u16; $n], log: &mut CDFContextLog, + ) { + self.symbol_with_update(s, cdf, log); + } +})*}} + /// Generic/shared implementation for Writers with StorageBackends (ie, Encoders and Recorders) impl Writer for WriterBase where @@ -543,9 +558,9 @@ where /// The values must be monotonically non-decreasing, and the last value /// must be greater 32704. There should be at most 16 values. /// The lower 6 bits of the last value hold the count. + #[inline(always)] fn symbol_with_update( - &mut self, s: u32, cdf: &mut [u16], - log: &mut crate::context::CDFContextLog, + &mut self, s: u32, cdf: &mut [u16], log: &mut CDFContextLog, ) { #[cfg(feature = "desync_finder")] { @@ -558,6 +573,7 @@ where update_cdf(cdf, s); } + symbol_with_update_impl!(4); /// Returns approximate cost for a symbol given a cumulative /// distribution function (CDF) table and current write state. /// `s`: The index of the symbol to encode. From 8f64c297e7cad313524040325d155b849fc81581 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 18 Feb 2021 09:32:37 +0900 Subject: [PATCH 032/155] Add variant of symbol_with_update for 2-symbol CDFs --- src/context/block_unit.rs | 83 ++++++++---------------- src/context/cdf_context.rs | 4 ++ src/context/frame_header.rs | 116 ++++++++++------------------------ src/context/mod.rs | 11 ++-- src/context/partition_unit.rs | 3 +- src/context/transform_unit.rs | 13 ++-- src/ec.rs | 4 +- 7 files changed, 78 insertions(+), 156 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index a9630f0cec..8913d6c5aa 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -759,12 +759,8 @@ impl<'a> ContextWriter<'a> { pub fn write_use_filter_intra( &mut self, w: &mut dyn Writer, enable: bool, block_size: BlockSize, ) { - symbol_with_update!( - self, - w, - enable as u32, - &mut self.fc.filter_intra_cdfs[block_size as usize] - ); + let cdf = &mut self.fc.filter_intra_cdfs[block_size as usize]; + symbol_with_update!(self, w, enable as u32, cdf, 2); } pub fn write_use_palette_mode( @@ -780,23 +776,15 @@ impl<'a> ContextWriter<'a> { if luma_mode == PredictionMode::DC_PRED { let bsize_ctx = bsize.width_mi_log2() + bsize.height_mi_log2() - 2; - symbol_with_update!( - self, - w, - enable as u32, - &mut self.fc.palette_y_mode_cdfs[bsize_ctx][ctx_luma] - ); + let cdf = &mut self.fc.palette_y_mode_cdfs[bsize_ctx][ctx_luma]; + symbol_with_update!(self, w, enable as u32, cdf, 2); } if has_chroma(bo, bsize, xdec, ydec, cs) && chroma_mode == PredictionMode::DC_PRED { - symbol_with_update!( - self, - w, - enable as u32, - &mut self.fc.palette_uv_mode_cdfs[ctx_chroma] - ); + let cdf = &mut self.fc.palette_uv_mode_cdfs[ctx_chroma]; + symbol_with_update!(self, w, enable as u32, cdf, 2); } } @@ -1699,29 +1687,18 @@ impl<'a> ContextWriter<'a> { pub fn write_inter_mode( &mut self, w: &mut dyn Writer, mode: PredictionMode, ctx: usize, ) { + use PredictionMode::{GLOBALMV, NEARESTMV, NEWMV}; let newmv_ctx = ctx & NEWMV_CTX_MASK; - symbol_with_update!( - self, - w, - (mode != PredictionMode::NEWMV) as u32, - &mut self.fc.newmv_cdf[newmv_ctx] - ); - if mode != PredictionMode::NEWMV { + let cdf = &mut self.fc.newmv_cdf[newmv_ctx]; + symbol_with_update!(self, w, (mode != NEWMV) as u32, cdf, 2); + if mode != NEWMV { let zeromv_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; - symbol_with_update!( - self, - w, - (mode != PredictionMode::GLOBALMV) as u32, - &mut 
self.fc.zeromv_cdf[zeromv_ctx] - ); - if mode != PredictionMode::GLOBALMV { + let cdf = &mut self.fc.zeromv_cdf[zeromv_ctx]; + symbol_with_update!(self, w, (mode != GLOBALMV) as u32, cdf, 2); + if mode != GLOBALMV { let refmv_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; - symbol_with_update!( - self, - w, - (mode != PredictionMode::NEARESTMV) as u32, - &mut self.fc.refmv_cdf[refmv_ctx] - ); + let cdf = &mut self.fc.refmv_cdf[refmv_ctx]; + symbol_with_update!(self, w, (mode != NEARESTMV) as u32, cdf, 2); } } } @@ -1730,7 +1707,8 @@ impl<'a> ContextWriter<'a> { pub fn write_drl_mode( &mut self, w: &mut dyn Writer, drl_mode: bool, ctx: usize, ) { - symbol_with_update!(self, w, drl_mode as u32, &mut self.fc.drl_cdfs[ctx]); + let cdf = &mut self.fc.drl_cdfs[ctx]; + symbol_with_update!(self, w, drl_mode as u32, cdf, 2); } pub fn write_mv( @@ -1789,12 +1767,8 @@ impl<'a> ContextWriter<'a> { &mut self, w: &mut dyn Writer, bo: TileBlockOffset, is_inter: bool, ) { let ctx = self.bc.intra_inter_context(bo); - symbol_with_update!( - self, - w, - is_inter as u32, - &mut self.fc.intra_inter_cdfs[ctx] - ); + let cdf = &mut self.fc.intra_inter_cdfs[ctx]; + symbol_with_update!(self, w, is_inter as u32, cdf, 2); } pub fn write_coeffs_lv_map( @@ -1836,7 +1810,7 @@ impl<'a> ContextWriter<'a> { { let cdf = &mut self.fc.txb_skip_cdf[txs_ctx][txb_ctx.txb_skip_ctx]; - symbol_with_update!(self, w, (eob == 0) as u32, cdf); + symbol_with_update!(self, w, (eob == 0) as u32, cdf, 2); } if eob == 0 { @@ -1908,12 +1882,9 @@ impl<'a> ContextWriter<'a> { let mut eob_shift = eob_offset_bits - 1; let mut bit: u32 = if (eob_extra & (1 << eob_shift)) != 0 { 1 } else { 0 }; - symbol_with_update!( - self, - w, - bit, - &mut self.fc.eob_extra_cdf[txs_ctx][plane_type][(eob_pt - 3) as usize] - ); + let cdf = + &mut self.fc.eob_extra_cdf[txs_ctx][plane_type][(eob_pt - 3) as usize]; + symbol_with_update!(self, w, bit, cdf, 2); for i in 1..eob_offset_bits { eob_shift = eob_offset_bits as u16 - 1 - i as u16; bit = if (eob_extra & (1 << eob_shift)) != 0 { 1 } else { 0 }; @@ -1989,12 +1960,8 @@ impl<'a> ContextWriter<'a> { let level = v.abs(); let sign = if v < T::cast_from(0) { 1 } else { 0 }; if c == 0 { - symbol_with_update!( - self, - w, - sign, - &mut self.fc.dc_sign_cdf[plane_type][txb_ctx.dc_sign_ctx] - ); + let cdf = &mut self.fc.dc_sign_cdf[plane_type][txb_ctx.dc_sign_ctx]; + symbol_with_update!(self, w, sign, cdf, 2); } else { w.bit(sign as u16); } diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 5aeef45df6..45e28d2c88 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -557,6 +557,10 @@ macro_rules! 
symbol_with_update { } } }; + ($self:ident, $w:ident, $s:expr, $cdf:expr, 2) => { + $w.symbol_with_update_2($s, $cdf, &mut $self.fc_log); + symbol_with_update!($self, $cdf); + }; ($self:ident, $w:ident, $s:expr, $cdf:expr, 4) => { $w.symbol_with_update_4($s, $cdf, &mut $self.fc_log); symbol_with_update!($self, $cdf); diff --git a/src/context/frame_header.rs b/src/context/frame_header.rs index 36dbd88697..0bcb2052eb 100644 --- a/src/context/frame_header.rs +++ b/src/context/frame_header.rs @@ -73,12 +73,8 @@ impl<'a> ContextWriter<'a> { if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 { let ctx = self.get_comp_mode_ctx(bo); - symbol_with_update!( - self, - w, - comp_mode as u32, - &mut self.fc.comp_mode_cdf[ctx] - ); + let cdf = &mut self.fc.comp_mode_cdf[ctx]; + symbol_with_update!(self, w, comp_mode as u32, cdf, 2); } else { assert!(!comp_mode); } @@ -86,123 +82,75 @@ impl<'a> ContextWriter<'a> { if comp_mode { let comp_ref_type: u32 = 1; // bidir let ctx = self.get_comp_ref_type_ctx(bo); - symbol_with_update!( - self, - w, - comp_ref_type, - &mut self.fc.comp_ref_type_cdf[ctx] - ); + let cdf = &mut self.fc.comp_ref_type_cdf[ctx]; + symbol_with_update!(self, w, comp_ref_type, cdf, 2); if comp_ref_type == 0 { unimplemented!(); } else { let compref = rf[0] == GOLDEN_FRAME || rf[0] == LAST3_FRAME; let ctx = self.get_pred_ctx_ll2_or_l3gld(bo); - symbol_with_update!( - self, - w, - compref as u32, - &mut self.fc.comp_ref_cdf[ctx][0] - ); + let cdf = &mut self.fc.comp_ref_cdf[ctx][0]; + symbol_with_update!(self, w, compref as u32, cdf, 2); if !compref { let compref_p1 = rf[0] == LAST2_FRAME; let ctx = self.get_pred_ctx_last_or_last2(bo); - symbol_with_update!( - self, - w, - compref_p1 as u32, - &mut self.fc.comp_ref_cdf[ctx][1] - ); + let cdf = &mut self.fc.comp_ref_cdf[ctx][1]; + symbol_with_update!(self, w, compref_p1 as u32, cdf, 2); } else { let compref_p2 = rf[0] == GOLDEN_FRAME; let ctx = self.get_pred_ctx_last3_or_gold(bo); - symbol_with_update!( - self, - w, - compref_p2 as u32, - &mut self.fc.comp_ref_cdf[ctx][2] - ); + let cdf = &mut self.fc.comp_ref_cdf[ctx][2]; + symbol_with_update!(self, w, compref_p2 as u32, cdf, 2); } let comp_bwdref = rf[1] == ALTREF_FRAME; let ctx = self.get_pred_ctx_brfarf2_or_arf(bo); - symbol_with_update!( - self, - w, - comp_bwdref as u32, - &mut self.fc.comp_bwd_ref_cdf[ctx][0] - ); + let cdf = &mut self.fc.comp_bwd_ref_cdf[ctx][0]; + symbol_with_update!(self, w, comp_bwdref as u32, cdf, 2); if !comp_bwdref { let comp_bwdref_p1 = rf[1] == ALTREF2_FRAME; let ctx = self.get_pred_ctx_brf_or_arf2(bo); - symbol_with_update!( - self, - w, - comp_bwdref_p1 as u32, - &mut self.fc.comp_bwd_ref_cdf[ctx][1] - ); + let cdf = &mut self.fc.comp_bwd_ref_cdf[ctx][1]; + symbol_with_update!(self, w, comp_bwdref_p1 as u32, cdf, 2); } } } else { let b0_ctx = self.get_ref_frame_ctx_b0(bo); let b0 = rf[0] != NONE_FRAME && rf[0].is_bwd_ref(); - symbol_with_update!( - self, - w, - b0 as u32, - &mut self.fc.single_ref_cdfs[b0_ctx][0] - ); + let cdf = &mut self.fc.single_ref_cdfs[b0_ctx][0]; + symbol_with_update!(self, w, b0 as u32, cdf, 2); if b0 { let b1_ctx = self.get_pred_ctx_brfarf2_or_arf(bo); let b1 = rf[0] == ALTREF_FRAME; - symbol_with_update!( - self, - w, - b1 as u32, - &mut self.fc.single_ref_cdfs[b1_ctx][1] - ); + let cdf = &mut self.fc.single_ref_cdfs[b1_ctx][1]; + symbol_with_update!(self, w, b1 as u32, cdf, 2); if !b1 { let b5_ctx = self.get_pred_ctx_brf_or_arf2(bo); let b5 = rf[0] == ALTREF2_FRAME; - symbol_with_update!( - self, - w, - b5 as u32, - &mut 
self.fc.single_ref_cdfs[b5_ctx][5] - ); + let cdf = &mut self.fc.single_ref_cdfs[b5_ctx][5]; + symbol_with_update!(self, w, b5 as u32, cdf, 2); } } else { let b2_ctx = self.get_pred_ctx_ll2_or_l3gld(bo); let b2 = rf[0] == LAST3_FRAME || rf[0] == GOLDEN_FRAME; - symbol_with_update!( - self, - w, - b2 as u32, - &mut self.fc.single_ref_cdfs[b2_ctx][2] - ); + let cdf = &mut self.fc.single_ref_cdfs[b2_ctx][2]; + symbol_with_update!(self, w, b2 as u32, cdf, 2); if !b2 { let b3_ctx = self.get_pred_ctx_last_or_last2(bo); let b3 = rf[0] != LAST_FRAME; - symbol_with_update!( - self, - w, - b3 as u32, - &mut self.fc.single_ref_cdfs[b3_ctx][3] - ); + let cdf = &mut self.fc.single_ref_cdfs[b3_ctx][3]; + symbol_with_update!(self, w, b3 as u32, cdf, 2); } else { let b4_ctx = self.get_pred_ctx_last3_or_gold(bo); let b4 = rf[0] != LAST3_FRAME; - symbol_with_update!( - self, - w, - b4 as u32, - &mut self.fc.single_ref_cdfs[b4_ctx][4] - ); + let cdf = &mut self.fc.single_ref_cdfs[b4_ctx][4]; + symbol_with_update!(self, w, b4 as u32, cdf, 2); } } } @@ -227,10 +175,12 @@ impl<'a> ContextWriter<'a> { match filter { RestorationFilter::None => match rp.rp_cfg.lrf_type { RESTORE_WIENER => { - symbol_with_update!(self, w, 0, &mut self.fc.lrf_wiener_cdf); + let cdf = &mut self.fc.lrf_wiener_cdf; + symbol_with_update!(self, w, 0, cdf, 2); } RESTORE_SGRPROJ => { - symbol_with_update!(self, w, 0, &mut self.fc.lrf_sgrproj_cdf); + let cdf = &mut self.fc.lrf_sgrproj_cdf; + symbol_with_update!(self, w, 0, cdf, 2); } RESTORE_SWITCHABLE => { symbol_with_update!(self, w, 0, &mut self.fc.lrf_switchable_cdf); @@ -241,7 +191,8 @@ impl<'a> ContextWriter<'a> { RestorationFilter::Sgrproj { set, xqd } => { match rp.rp_cfg.lrf_type { RESTORE_SGRPROJ => { - symbol_with_update!(self, w, 1, &mut self.fc.lrf_sgrproj_cdf); + let cdf = &mut self.fc.lrf_sgrproj_cdf; + symbol_with_update!(self, w, 1, cdf, 2); } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_SGRPROJ' @@ -282,7 +233,8 @@ impl<'a> ContextWriter<'a> { RestorationFilter::Wiener { coeffs } => { match rp.rp_cfg.lrf_type { RESTORE_WIENER => { - symbol_with_update!(self, w, 1, &mut self.fc.lrf_wiener_cdf); + let cdf = &mut self.fc.lrf_wiener_cdf; + symbol_with_update!(self, w, 1, cdf, 2); } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_WIENER' diff --git a/src/context/mod.rs b/src/context/mod.rs index f34c3fafe2..4692ce4b4f 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -200,18 +200,21 @@ impl<'a> ContextWriter<'a> { let hp = offset & 1; // high precision mv data // Sign - symbol_with_update!(self, w, sign, &mut mvcomp.sign_cdf); + let cdf = &mut mvcomp.sign_cdf; + symbol_with_update!(self, w, sign, cdf, 2); // Class symbol_with_update!(self, w, mv_class as u32, &mut mvcomp.classes_cdf); // Integer bits if mv_class == MV_CLASS_0 { - symbol_with_update!(self, w, d, &mut mvcomp.class0_cdf); + let cdf = &mut mvcomp.class0_cdf; + symbol_with_update!(self, w, d, cdf, 2); } else { let n = mv_class + CLASS0_BITS - 1; // number of bits for i in 0..n { - symbol_with_update!(self, w, (d >> i) & 1, &mut mvcomp.bits_cdf[i]); + let cdf = &mut mvcomp.bits_cdf[i]; + symbol_with_update!(self, w, (d >> i) & 1, cdf, 2); } } // Fractional bits @@ -231,7 +234,7 @@ impl<'a> ContextWriter<'a> { } else { &mut mvcomp.hp_cdf }; - symbol_with_update!(self, w, hp, cdf); + symbol_with_update!(self, w, hp, cdf, 2); } } } diff --git a/src/context/partition_unit.rs b/src/context/partition_unit.rs index d6d1291a46..80888dbb40 100644 --- a/src/context/partition_unit.rs +++ 
b/src/context/partition_unit.rs @@ -236,7 +236,8 @@ impl<'a> ContextWriter<'a> { &mut self, w: &mut dyn Writer, bo: TileBlockOffset, skip: bool, ) { let ctx = self.bc.skip_context(bo); - symbol_with_update!(self, w, skip as u32, &mut self.fc.skip_cdfs[ctx]); + let cdf = &mut self.fc.skip_cdfs[ctx]; + symbol_with_update!(self, w, skip as u32, cdf, 2); } pub fn get_segment_pred(&self, bo: TileBlockOffset) -> (u8, u8) { diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 3f76a1d4f6..f46e1e5a75 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -547,7 +547,7 @@ impl<'a> ContextWriter<'a> { symbol_with_update!(self, w, s, cdf); } else { let cdf = &mut self.fc.inter_tx_3_cdf[square_tx_size as usize]; - symbol_with_update!(self, w, s, cdf); + symbol_with_update!(self, w, s, cdf, 2); } } else { let intra_dir = y_mode; @@ -658,7 +658,7 @@ impl<'a> ContextWriter<'a> { symbol_with_update!(self, w, depth as u32, cdf); } else { let cdf = &mut self.fc.tx_size_8x8_cdf[tx_size_ctx]; - symbol_with_update!(self, w, depth as u32, cdf); + symbol_with_update!(self, w, depth as u32, cdf, 2); } } @@ -733,13 +733,8 @@ impl<'a> ContextWriter<'a> { if tx_size != TX_4X4 && depth < MAX_VARTX_DEPTH { let ctx = self.txfm_partition_context(bo, bsize, tx_size, tbx, tby); - - symbol_with_update!( - self, - w, - txfm_split as u32, - &mut self.fc.txfm_partition_cdf[ctx] - ); + let cdf = &mut self.fc.txfm_partition_cdf[ctx]; + symbol_with_update!(self, w, txfm_split as u32, cdf, 2); } else { debug_assert!(!txfm_split); } diff --git a/src/ec.rs b/src/ec.rs index 7936ba105f..b88cf85569 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -52,7 +52,7 @@ pub trait Writer { fn symbol_with_update( &mut self, s: u32, cdf: &mut [u16], log: &mut CDFContextLog, ); - symbol_with_update_decl!(4); + symbol_with_update_decl!(2, 4); /// Write a bool using passed in probability fn bool(&mut self, val: bool, f: u16); /// Write a single bit with flat proability @@ -573,7 +573,7 @@ where update_cdf(cdf, s); } - symbol_with_update_impl!(4); + symbol_with_update_impl!(2, 4); /// Returns approximate cost for a symbol given a cumulative /// distribution function (CDF) table and current write state. /// `s`: The index of the symbol to encode. From 33f31e78340b1bbb467e3c2cc4389d95ba59583a Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 18 Feb 2021 09:35:43 +0900 Subject: [PATCH 033/155] Add variant of symbol_with_update for 3-symbol CDFs --- src/context/block_unit.rs | 3 ++- src/context/cdf_context.rs | 4 ++++ src/context/frame_header.rs | 19 ++++++------------- src/context/transform_unit.rs | 2 +- src/ec.rs | 4 ++-- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index 8913d6c5aa..6d332d04e8 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -1917,7 +1917,8 @@ impl<'a> ContextWriter<'a> { w, (cmp::min(u32::cast_from(level), 3) - 1) as u32, &mut self.fc.coeff_base_eob_cdf[txs_ctx][plane_type] - [coeff_ctx as usize] + [coeff_ctx as usize], + 3 ); } else { symbol_with_update!( diff --git a/src/context/cdf_context.rs b/src/context/cdf_context.rs index 45e28d2c88..6f87921a38 100644 --- a/src/context/cdf_context.rs +++ b/src/context/cdf_context.rs @@ -561,6 +561,10 @@ macro_rules! 
symbol_with_update { $w.symbol_with_update_2($s, $cdf, &mut $self.fc_log); symbol_with_update!($self, $cdf); }; + ($self:ident, $w:ident, $s:expr, $cdf:expr, 3) => { + $w.symbol_with_update_3($s, $cdf, &mut $self.fc_log); + symbol_with_update!($self, $cdf); + }; ($self:ident, $w:ident, $s:expr, $cdf:expr, 4) => { $w.symbol_with_update_4($s, $cdf, &mut $self.fc_log); symbol_with_update!($self, $cdf); diff --git a/src/context/frame_header.rs b/src/context/frame_header.rs index 0bcb2052eb..3b94e85bb0 100644 --- a/src/context/frame_header.rs +++ b/src/context/frame_header.rs @@ -183,7 +183,8 @@ impl<'a> ContextWriter<'a> { symbol_with_update!(self, w, 0, cdf, 2); } RESTORE_SWITCHABLE => { - symbol_with_update!(self, w, 0, &mut self.fc.lrf_switchable_cdf); + let cdf = &mut self.fc.lrf_switchable_cdf; + symbol_with_update!(self, w, 0, cdf, 3); } RESTORE_NONE => {} _ => unreachable!(), @@ -196,12 +197,8 @@ impl<'a> ContextWriter<'a> { } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_SGRPROJ' - symbol_with_update!( - self, - w, - 2, - &mut self.fc.lrf_switchable_cdf - ); + let cdf = &mut self.fc.lrf_switchable_cdf; + symbol_with_update!(self, w, 2, cdf, 3); } _ => unreachable!(), } @@ -238,12 +235,8 @@ impl<'a> ContextWriter<'a> { } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_WIENER' - symbol_with_update!( - self, - w, - 1, - &mut self.fc.lrf_switchable_cdf - ); + let cdf = &mut self.fc.lrf_switchable_cdf; + symbol_with_update!(self, w, 1, cdf, 3); } _ => unreachable!(), } diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index f46e1e5a75..ab62b683d0 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -655,7 +655,7 @@ impl<'a> ContextWriter<'a> { if tx_size_cat > 0 { let cdf = &mut self.fc.tx_size_cdf[tx_size_cat - 1][tx_size_ctx]; - symbol_with_update!(self, w, depth as u32, cdf); + symbol_with_update!(self, w, depth as u32, cdf, 3); } else { let cdf = &mut self.fc.tx_size_8x8_cdf[tx_size_ctx]; symbol_with_update!(self, w, depth as u32, cdf, 2); diff --git a/src/ec.rs b/src/ec.rs index b88cf85569..6a6fe570f0 100644 --- a/src/ec.rs +++ b/src/ec.rs @@ -52,7 +52,7 @@ pub trait Writer { fn symbol_with_update( &mut self, s: u32, cdf: &mut [u16], log: &mut CDFContextLog, ); - symbol_with_update_decl!(2, 4); + symbol_with_update_decl!(2, 3, 4); /// Write a bool using passed in probability fn bool(&mut self, val: bool, f: u16); /// Write a single bit with flat proability @@ -573,7 +573,7 @@ where update_cdf(cdf, s); } - symbol_with_update_impl!(2, 4); + symbol_with_update_impl!(2, 3, 4); /// Returns approximate cost for a symbol given a cumulative /// distribution function (CDF) table and current write state. /// `s`: The index of the symbol to encode. 
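Note on the preceding patches: the 2- and 3-symbol variants above, together with the 4-symbol variant added just before them, all rely on the same mechanism. symbol_with_update! gains rules that accept a trailing size literal (2, 3 or 4) and forward to a symbol_with_update_N method whose CDF argument is a fixed-size array, while symbol_with_update_decl! and symbol_with_update_impl! use paste::item! to stamp out the per-size declarations and forwarding implementations. The sketch below is a minimal, self-contained illustration of that dispatch pattern only; the names SymbolWriter, CountingWriter and write_symbol! are simplified stand-ins invented for this note, not rav1e's actual Writer/CDFContextLog API, and only the 2- and 4-entry cases are shown.

    // Sketch of the fixed-size dispatch pattern; all names are illustrative.
    trait SymbolWriter {
        // Dynamically sized fallback, analogous to symbol_with_update.
        fn symbol_dyn(&mut self, s: u32, cdf: &mut [u16]);
        // Fixed-size variants: the CDF length is a compile-time constant.
        fn symbol_2(&mut self, s: u32, cdf: &mut [u16; 2]);
        fn symbol_4(&mut self, s: u32, cdf: &mut [u16; 4]);
    }

    // Stand-in for the entropy coder: it only counts written symbols.
    struct CountingWriter {
        symbols: usize,
    }

    impl SymbolWriter for CountingWriter {
        fn symbol_dyn(&mut self, _s: u32, _cdf: &mut [u16]) {
            self.symbols += 1;
        }
        fn symbol_2(&mut self, s: u32, cdf: &mut [u16; 2]) {
            // A real encoder could specialize on the known length;
            // this sketch simply forwards to the dynamic version.
            self.symbol_dyn(s, cdf);
        }
        fn symbol_4(&mut self, s: u32, cdf: &mut [u16; 4]) {
            self.symbol_dyn(s, cdf);
        }
    }

    // A trailing size literal selects the fixed-size method, mirroring how
    // symbol_with_update!(self, w, s, cdf, 2) routes to symbol_with_update_2.
    macro_rules! write_symbol {
        ($w:ident, $s:expr, $cdf:expr) => {
            $w.symbol_dyn($s, $cdf)
        };
        ($w:ident, $s:expr, $cdf:expr, 2) => {
            $w.symbol_2($s, $cdf)
        };
        ($w:ident, $s:expr, $cdf:expr, 4) => {
            $w.symbol_4($s, $cdf)
        };
    }

    fn main() {
        let mut w = CountingWriter { symbols: 0 };
        // Dummy CDF contents; the values carry no meaning in this sketch.
        let mut cdf2 = [16384u16, 0];
        let mut cdf4 = [24576u16, 16384, 8192, 0];
        write_symbol!(w, 1, &mut cdf2, 2); // fixed 2-entry path
        write_symbol!(w, 3, &mut cdf4, 4); // fixed 4-entry path
        write_symbol!(w, 0, &mut cdf4[..]); // dynamically sized fallback
        assert_eq!(w.symbols, 3);
    }

The trailing literal makes the CDF length a compile-time constant at each call site without changing the call syntax much, so the fixed-size methods can later be given specialized bodies while the dynamically sized symbol_with_update remains available for the remaining callers.
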
From 08c2f566ce1ee1da19a28cb706f4fa15df794c03 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Fri, 12 Feb 2021 23:58:20 +0100 Subject: [PATCH 034/155] Move out update_block_importances --- src/api/internal.rs | 320 +++++++++++++++++++++----------------------- 1 file changed, 156 insertions(+), 164 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index 465f5e317b..ba1a9af2af 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -799,6 +799,161 @@ impl ContextInner { } } + #[hawktracer(update_block_importances)] + fn update_block_importances( + fi: &FrameInvariants, me_stats: &crate::me::FrameMEStats, + frame: &Frame, reference_frame: &Frame, bit_depth: usize, + bsize: BlockSize, len: usize, + reference_frame_block_importances: &mut [f32], + ) { + let plane_org = &frame.planes[0]; + let plane_ref = &reference_frame.planes[0]; + + (0..fi.h_in_imp_b) + .zip(fi.lookahead_intra_costs.chunks_exact(fi.w_in_imp_b)) + .zip(fi.block_importances.chunks_exact(fi.w_in_imp_b)) + .for_each(|((y, lookahead_intra_costs), block_importances)| { + (0..fi.w_in_imp_b).for_each(|x| { + let mv = me_stats[y * 2][x * 2].mv; + + // Coordinates of the top-left corner of the reference block, in MV + // units. + let reference_x = + x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; + let reference_y = + y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; + + let region_org = plane_org.region(Area::Rect { + x: (x * IMPORTANCE_BLOCK_SIZE) as isize, + y: (y * IMPORTANCE_BLOCK_SIZE) as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let region_ref = plane_ref.region(Area::Rect { + x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, + y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let inter_cost = get_satd( + ®ion_org, + ®ion_ref, + bsize, + bit_depth, + fi.cpu_feature_level, + ) as f32; + + let intra_cost = lookahead_intra_costs[x] as f32; + let future_importance = block_importances[x]; + + let propagate_fraction = if intra_cost <= inter_cost { + 0. + } else { + 1. - inter_cost / intra_cost + }; + + let propagate_amount = + (intra_cost + future_importance) * propagate_fraction / len as f32; + + let mut propagate = + |block_x_in_mv_units, block_y_in_mv_units, fraction| { + let x = block_x_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; + let y = block_y_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; + + // TODO: propagate partially if the block is partially off-frame + // (possible on right and bottom edges)? + if x >= 0 + && y >= 0 + && (x as usize) < fi.w_in_imp_b + && (y as usize) < fi.h_in_imp_b + { + reference_frame_block_importances + [y as usize * fi.w_in_imp_b + x as usize] += + propagate_amount * fraction; + } + }; + + // Coordinates of the top-left corner of the block intersecting the + // reference block from the top-left. 
+ let top_left_block_x = (reference_x + - if reference_x < 0 { + IMP_BLOCK_SIZE_IN_MV_UNITS - 1 + } else { + 0 + }) + / IMP_BLOCK_SIZE_IN_MV_UNITS + * IMP_BLOCK_SIZE_IN_MV_UNITS; + let top_left_block_y = (reference_y + - if reference_y < 0 { + IMP_BLOCK_SIZE_IN_MV_UNITS - 1 + } else { + 0 + }) + / IMP_BLOCK_SIZE_IN_MV_UNITS + * IMP_BLOCK_SIZE_IN_MV_UNITS; + + debug_assert!(reference_x >= top_left_block_x); + debug_assert!(reference_y >= top_left_block_y); + + let top_right_block_x = + top_left_block_x + IMP_BLOCK_SIZE_IN_MV_UNITS; + let top_right_block_y = top_left_block_y; + let bottom_left_block_x = top_left_block_x; + let bottom_left_block_y = + top_left_block_y + IMP_BLOCK_SIZE_IN_MV_UNITS; + let bottom_right_block_x = top_right_block_x; + let bottom_right_block_y = bottom_left_block_y; + + let top_left_block_fraction = ((top_right_block_x - reference_x) + * (bottom_left_block_y - reference_y)) + as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + + propagate( + top_left_block_x, + top_left_block_y, + top_left_block_fraction, + ); + + let top_right_block_fraction = + ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) + * (bottom_left_block_y - reference_y)) as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + + propagate( + top_right_block_x, + top_right_block_y, + top_right_block_fraction, + ); + + let bottom_left_block_fraction = ((top_right_block_x - reference_x) + * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - bottom_left_block_y)) + as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + + propagate( + bottom_left_block_x, + bottom_left_block_y, + bottom_left_block_fraction, + ); + + let bottom_right_block_fraction = + ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) + * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS + - bottom_left_block_y)) as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + + propagate( + bottom_right_block_x, + bottom_right_block_y, + bottom_right_block_fraction, + ); + }); + }); + } + /// Computes the block importances for the current output frame. #[hawktracer(compute_block_importances)] fn compute_block_importances(&mut self) { @@ -887,7 +1042,7 @@ impl ContextInner { .get_mut(&reference_output_frameno) .map(|data| &mut data.fi.block_importances) { - update_block_importances( + Self::update_block_importances( fi, me_stats, frame, @@ -897,169 +1052,6 @@ impl ContextInner { len, reference_frame_block_importances, ); - - #[hawktracer(update_block_importances)] - fn update_block_importances( - fi: &FrameInvariants, me_stats: &crate::me::FrameMEStats, - frame: &Frame, reference_frame: &Frame, bit_depth: usize, - bsize: BlockSize, len: usize, - reference_frame_block_importances: &mut [f32], - ) { - let plane_org = &frame.planes[0]; - let plane_ref = &reference_frame.planes[0]; - - (0..fi.h_in_imp_b) - .zip(fi.lookahead_intra_costs.chunks_exact(fi.w_in_imp_b)) - .zip(fi.block_importances.chunks_exact(fi.w_in_imp_b)) - .for_each(|((y, lookahead_intra_costs), block_importances)| { - (0..fi.w_in_imp_b).for_each(|x| { - let mv = me_stats[y * 2][x * 2].mv; - - // Coordinates of the top-left corner of the reference block, in MV - // units. 
- let reference_x = - x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; - let reference_y = - y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; - - let region_org = plane_org.region(Area::Rect { - x: (x * IMPORTANCE_BLOCK_SIZE) as isize, - y: (y * IMPORTANCE_BLOCK_SIZE) as isize, - width: IMPORTANCE_BLOCK_SIZE, - height: IMPORTANCE_BLOCK_SIZE, - }); - - let region_ref = plane_ref.region(Area::Rect { - x: reference_x as isize - / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, - y: reference_y as isize - / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, - width: IMPORTANCE_BLOCK_SIZE, - height: IMPORTANCE_BLOCK_SIZE, - }); - - let inter_cost = get_satd( - ®ion_org, - ®ion_ref, - bsize, - bit_depth, - fi.cpu_feature_level, - ) as f32; - - let intra_cost = lookahead_intra_costs[x] as f32; - let future_importance = block_importances[x]; - - let propagate_fraction = if intra_cost <= inter_cost { - 0. - } else { - 1. - inter_cost / intra_cost - }; - - let propagate_amount = (intra_cost + future_importance) - * propagate_fraction - / len as f32; - - let mut propagate = - |block_x_in_mv_units, block_y_in_mv_units, fraction| { - let x = block_x_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; - let y = block_y_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; - - // TODO: propagate partially if the block is partially off-frame - // (possible on right and bottom edges)? - if x >= 0 - && y >= 0 - && (x as usize) < fi.w_in_imp_b - && (y as usize) < fi.h_in_imp_b - { - reference_frame_block_importances - [y as usize * fi.w_in_imp_b + x as usize] += - propagate_amount * fraction; - } - }; - - // Coordinates of the top-left corner of the block intersecting the - // reference block from the top-left. - let top_left_block_x = (reference_x - - if reference_x < 0 { - IMP_BLOCK_SIZE_IN_MV_UNITS - 1 - } else { - 0 - }) - / IMP_BLOCK_SIZE_IN_MV_UNITS - * IMP_BLOCK_SIZE_IN_MV_UNITS; - let top_left_block_y = (reference_y - - if reference_y < 0 { - IMP_BLOCK_SIZE_IN_MV_UNITS - 1 - } else { - 0 - }) - / IMP_BLOCK_SIZE_IN_MV_UNITS - * IMP_BLOCK_SIZE_IN_MV_UNITS; - - debug_assert!(reference_x >= top_left_block_x); - debug_assert!(reference_y >= top_left_block_y); - - let top_right_block_x = - top_left_block_x + IMP_BLOCK_SIZE_IN_MV_UNITS; - let top_right_block_y = top_left_block_y; - let bottom_left_block_x = top_left_block_x; - let bottom_left_block_y = - top_left_block_y + IMP_BLOCK_SIZE_IN_MV_UNITS; - let bottom_right_block_x = top_right_block_x; - let bottom_right_block_y = bottom_left_block_y; - - let top_left_block_fraction = ((top_right_block_x - - reference_x) - * (bottom_left_block_y - reference_y)) - as f32 - / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - - propagate( - top_left_block_x, - top_left_block_y, - top_left_block_fraction, - ); - - let top_right_block_fraction = ((reference_x - + IMP_BLOCK_SIZE_IN_MV_UNITS - - top_right_block_x) - * (bottom_left_block_y - reference_y)) - as f32 - / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - - propagate( - top_right_block_x, - top_right_block_y, - top_right_block_fraction, - ); - - let bottom_left_block_fraction = - ((top_right_block_x - reference_x) - * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - - bottom_left_block_y)) as f32 - / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - - propagate( - bottom_left_block_x, - bottom_left_block_y, - bottom_left_block_fraction, - ); - - let bottom_right_block_fraction = - ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - - top_right_block_x) - * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - - bottom_left_block_y)) as f32 - / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - - propagate( - 
bottom_right_block_x, - bottom_right_block_y, - bottom_right_block_fraction, - ); - }); - }); - } } }); From ac65e933c56e4fc0dacf349e60bd545aec59f941 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sat, 13 Feb 2021 00:02:22 +0100 Subject: [PATCH 035/155] Move the chunck_exact iterators in separate variables --- src/api/internal.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index ba1a9af2af..4495ddc101 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -808,10 +808,14 @@ impl ContextInner { ) { let plane_org = &frame.planes[0]; let plane_ref = &reference_frame.planes[0]; + let lookahead_intra_costs_lines = + fi.lookahead_intra_costs.chunks_exact(fi.w_in_imp_b); + let block_importances_lines = + fi.block_importances.chunks_exact(fi.w_in_imp_b); (0..fi.h_in_imp_b) - .zip(fi.lookahead_intra_costs.chunks_exact(fi.w_in_imp_b)) - .zip(fi.block_importances.chunks_exact(fi.w_in_imp_b)) + .zip(lookahead_intra_costs_lines) + .zip(block_importances_lines) .for_each(|((y, lookahead_intra_costs), block_importances)| { (0..fi.w_in_imp_b).for_each(|x| { let mv = me_stats[y * 2][x * 2].mv; From 705fb22f191c8468313a9b32ac9796887c7f93b6 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 18 Feb 2021 15:01:18 +0100 Subject: [PATCH 036/155] Split the temporal rdo update Calcolate all the values first and then propagate them. --- src/api/internal.rs | 258 ++++++++++++++++++++++---------------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index 4495ddc101..395a66e48c 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -813,149 +813,149 @@ impl ContextInner { let block_importances_lines = fi.block_importances.chunks_exact(fi.w_in_imp_b); - (0..fi.h_in_imp_b) - .zip(lookahead_intra_costs_lines) + let costs: Vec<_> = lookahead_intra_costs_lines .zip(block_importances_lines) - .for_each(|((y, lookahead_intra_costs), block_importances)| { - (0..fi.w_in_imp_b).for_each(|x| { - let mv = me_stats[y * 2][x * 2].mv; - - // Coordinates of the top-left corner of the reference block, in MV - // units. - let reference_x = - x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; - let reference_y = - y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; - - let region_org = plane_org.region(Area::Rect { - x: (x * IMPORTANCE_BLOCK_SIZE) as isize, - y: (y * IMPORTANCE_BLOCK_SIZE) as isize, - width: IMPORTANCE_BLOCK_SIZE, - height: IMPORTANCE_BLOCK_SIZE, - }); - - let region_ref = plane_ref.region(Area::Rect { - x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, - y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, - width: IMPORTANCE_BLOCK_SIZE, - height: IMPORTANCE_BLOCK_SIZE, - }); - - let inter_cost = get_satd( - ®ion_org, - ®ion_ref, - bsize, - bit_depth, - fi.cpu_feature_level, - ) as f32; + .enumerate() + .flat_map(|(y, (lookahead_intra_costs, block_importances))| { + lookahead_intra_costs + .iter() + .zip(block_importances.iter()) + .enumerate() + .map(move |(x, (&intra_cost, &future_importance))| { + let mv = me_stats[y * 2][x * 2].mv; + + // Coordinates of the top-left corner of the reference block, in MV + // units. 
+ let reference_x = + x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; + let reference_y = + y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; + + let region_org = plane_org.region(Area::Rect { + x: (x * IMPORTANCE_BLOCK_SIZE) as isize, + y: (y * IMPORTANCE_BLOCK_SIZE) as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let region_ref = plane_ref.region(Area::Rect { + x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, + y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, + width: IMPORTANCE_BLOCK_SIZE, + height: IMPORTANCE_BLOCK_SIZE, + }); + + let inter_cost = get_satd( + ®ion_org, + ®ion_ref, + bsize, + bit_depth, + fi.cpu_feature_level, + ) as f32; + + let intra_cost = intra_cost as f32; + // let intra_cost = lookahead_intra_costs[x] as f32; + // let future_importance = block_importances[x]; + + let propagate_fraction = if intra_cost <= inter_cost { + 0. + } else { + 1. - inter_cost / intra_cost + }; - let intra_cost = lookahead_intra_costs[x] as f32; - let future_importance = block_importances[x]; + let propagate_amount = (intra_cost + future_importance) + * propagate_fraction + / len as f32; - let propagate_fraction = if intra_cost <= inter_cost { - 0. - } else { - 1. - inter_cost / intra_cost + (propagate_amount, reference_x, reference_y) + }) + }) + .collect(); + + costs.into_iter().for_each( + |(propagate_amount, reference_x, reference_y)| { + let mut propagate = + |block_x_in_mv_units, block_y_in_mv_units, fraction| { + let x = block_x_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; + let y = block_y_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; + + // TODO: propagate partially if the block is partially off-frame + // (possible on right and bottom edges)? + if x >= 0 + && y >= 0 + && (x as usize) < fi.w_in_imp_b + && (y as usize) < fi.h_in_imp_b + { + reference_frame_block_importances + [y as usize * fi.w_in_imp_b + x as usize] += + propagate_amount * fraction; + } }; - let propagate_amount = - (intra_cost + future_importance) * propagate_fraction / len as f32; - - let mut propagate = - |block_x_in_mv_units, block_y_in_mv_units, fraction| { - let x = block_x_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; - let y = block_y_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; - - // TODO: propagate partially if the block is partially off-frame - // (possible on right and bottom edges)? - if x >= 0 - && y >= 0 - && (x as usize) < fi.w_in_imp_b - && (y as usize) < fi.h_in_imp_b - { - reference_frame_block_importances - [y as usize * fi.w_in_imp_b + x as usize] += - propagate_amount * fraction; - } - }; - - // Coordinates of the top-left corner of the block intersecting the - // reference block from the top-left. 
- let top_left_block_x = (reference_x - - if reference_x < 0 { - IMP_BLOCK_SIZE_IN_MV_UNITS - 1 - } else { - 0 - }) - / IMP_BLOCK_SIZE_IN_MV_UNITS - * IMP_BLOCK_SIZE_IN_MV_UNITS; - let top_left_block_y = (reference_y - - if reference_y < 0 { - IMP_BLOCK_SIZE_IN_MV_UNITS - 1 - } else { - 0 - }) - / IMP_BLOCK_SIZE_IN_MV_UNITS - * IMP_BLOCK_SIZE_IN_MV_UNITS; - - debug_assert!(reference_x >= top_left_block_x); - debug_assert!(reference_y >= top_left_block_y); - - let top_right_block_x = - top_left_block_x + IMP_BLOCK_SIZE_IN_MV_UNITS; - let top_right_block_y = top_left_block_y; - let bottom_left_block_x = top_left_block_x; - let bottom_left_block_y = - top_left_block_y + IMP_BLOCK_SIZE_IN_MV_UNITS; - let bottom_right_block_x = top_right_block_x; - let bottom_right_block_y = bottom_left_block_y; - - let top_left_block_fraction = ((top_right_block_x - reference_x) - * (bottom_left_block_y - reference_y)) - as f32 + // Coordinates of the top-left corner of the block intersecting the + // reference block from the top-left. + let top_left_block_x = (reference_x + - if reference_x < 0 { IMP_BLOCK_SIZE_IN_MV_UNITS - 1 } else { 0 }) + / IMP_BLOCK_SIZE_IN_MV_UNITS + * IMP_BLOCK_SIZE_IN_MV_UNITS; + let top_left_block_y = (reference_y + - if reference_y < 0 { IMP_BLOCK_SIZE_IN_MV_UNITS - 1 } else { 0 }) + / IMP_BLOCK_SIZE_IN_MV_UNITS + * IMP_BLOCK_SIZE_IN_MV_UNITS; + + debug_assert!(reference_x >= top_left_block_x); + debug_assert!(reference_y >= top_left_block_y); + + let top_right_block_x = top_left_block_x + IMP_BLOCK_SIZE_IN_MV_UNITS; + let top_right_block_y = top_left_block_y; + let bottom_left_block_x = top_left_block_x; + let bottom_left_block_y = + top_left_block_y + IMP_BLOCK_SIZE_IN_MV_UNITS; + let bottom_right_block_x = top_right_block_x; + let bottom_right_block_y = bottom_left_block_y; + + let top_left_block_fraction = ((top_right_block_x - reference_x) + * (bottom_left_block_y - reference_y)) + as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + + propagate(top_left_block_x, top_left_block_y, top_left_block_fraction); + + let top_right_block_fraction = + ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) + * (bottom_left_block_y - reference_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - propagate( - top_left_block_x, - top_left_block_y, - top_left_block_fraction, - ); + propagate( + top_right_block_x, + top_right_block_y, + top_right_block_fraction, + ); - let top_right_block_fraction = - ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) - * (bottom_left_block_y - reference_y)) as f32 - / IMP_BLOCK_AREA_IN_MV_UNITS as f32; + let bottom_left_block_fraction = ((top_right_block_x - reference_x) + * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - bottom_left_block_y)) + as f32 + / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - propagate( - top_right_block_x, - top_right_block_y, - top_right_block_fraction, - ); + propagate( + bottom_left_block_x, + bottom_left_block_y, + bottom_left_block_fraction, + ); - let bottom_left_block_fraction = ((top_right_block_x - reference_x) + let bottom_right_block_fraction = + ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - bottom_left_block_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; - propagate( - bottom_left_block_x, - bottom_left_block_y, - bottom_left_block_fraction, - ); - - let bottom_right_block_fraction = - ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) - * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - - bottom_left_block_y)) as f32 - / 
IMP_BLOCK_AREA_IN_MV_UNITS as f32; - - propagate( - bottom_right_block_x, - bottom_right_block_y, - bottom_right_block_fraction, - ); - }); - }); + propagate( + bottom_right_block_x, + bottom_right_block_y, + bottom_right_block_fraction, + ); + }, + ); } /// Computes the block importances for the current output frame. From 975595def0ab072455ab64c07defdadd3a9c4c42 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Fri, 19 Feb 2021 14:48:05 +0100 Subject: [PATCH 037/155] Extend the rayon API mocked for wasi It will be used in the next commit --- src/lib.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 63afa2d574..57d9184401 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -146,6 +146,41 @@ mod rayon { self.into_par_iter() } } + + pub trait ParallelIterator: Iterator { + fn flat_map_iter(self, f: F) -> std::iter::FlatMap + where + Self: Sized, + U: IntoIterator, + F: FnMut(::Item) -> U, + { + self.flat_map(f) + } + } + + impl ParallelIterator for I {} + } + + pub mod slice { + pub trait ParallelSlice { + fn par_chunks_exact( + &self, chunk_size: usize, + ) -> std::slice::ChunksExact<'_, T>; + } + + impl ParallelSlice for [T] { + #[inline] + fn par_chunks_exact( + &self, chunk_size: usize, + ) -> std::slice::ChunksExact<'_, T> { + self.chunks_exact(chunk_size) + } + } + } + + pub mod prelude { + pub use super::iter::*; + pub use super::slice::*; } pub fn join(oper_a: A, oper_b: B) -> (RA, RB) From 522f134c7efb3c6d70ea52917376ffe1d4928332 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Thu, 18 Feb 2021 18:57:53 +0100 Subject: [PATCH 038/155] Update the temporal-rdo by-line in parallel About 9% faster for 1080p --- src/api/internal.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index 395a66e48c..b2ab16cd7c 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -21,6 +21,7 @@ use crate::rate::{ RCState, FRAME_NSUBTYPES, FRAME_SUBTYPE_I, FRAME_SUBTYPE_P, FRAME_SUBTYPE_SEF, }; +use crate::rayon::prelude::*; use crate::scenechange::SceneChangeDetector; use crate::stats::EncoderStats; use crate::tiling::Area; @@ -809,14 +810,14 @@ impl ContextInner { let plane_org = &frame.planes[0]; let plane_ref = &reference_frame.planes[0]; let lookahead_intra_costs_lines = - fi.lookahead_intra_costs.chunks_exact(fi.w_in_imp_b); + fi.lookahead_intra_costs.par_chunks_exact(fi.w_in_imp_b); let block_importances_lines = - fi.block_importances.chunks_exact(fi.w_in_imp_b); + fi.block_importances.par_chunks_exact(fi.w_in_imp_b); let costs: Vec<_> = lookahead_intra_costs_lines .zip(block_importances_lines) .enumerate() - .flat_map(|(y, (lookahead_intra_costs, block_importances))| { + .flat_map_iter(|(y, (lookahead_intra_costs, block_importances))| { lookahead_intra_costs .iter() .zip(block_importances.iter()) @@ -866,7 +867,6 @@ impl ContextInner { let propagate_amount = (intra_cost + future_importance) * propagate_fraction / len as f32; - (propagate_amount, reference_x, reference_y) }) }) From 6a554f24045a031847a5e8a2d5ea7b4f3c9885e5 Mon Sep 17 00:00:00 2001 From: Zen <46526140+master-of-zen@users.noreply.github.com> Date: Fri, 19 Feb 2021 18:09:47 +0200 Subject: [PATCH 039/155] Per speedsetting rdo-lookahead default values Make speed settings faster with a reasonable quality trade-off. 
Co-authored-by: Vibhoothi --- src/api/config/speedsettings.rs | 11 +++++++++++ src/bin/common.rs | 15 ++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index e416cb531e..127afea9e6 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -209,6 +209,17 @@ impl SpeedSettings { speed <= 1 } + /// Set default rdo-lookahead-frames for different speed settings + pub fn rdo_lookahead_frames(speed: usize) -> usize { + match speed { + 9..=10 => 10, + 6..=8 => 20, + 3..=5 => 30, + 0..=2 => 40, + _ => 40, + } + } + const fn rdo_tx_decision_preset(speed: usize) -> bool { speed <= 5 } diff --git a/src/bin/common.rs b/src/bin/common.rs index cfc3d8fb3d..6706eae272 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -214,7 +214,8 @@ pub fn parse_cli() -> Result { ) .arg( Arg::with_name("RDO_LOOKAHEAD_FRAMES") - .help("Number of frames encoder should lookahead for RDO purposes [default: 40]\n") + .help("Number of frames encoder should lookahead for RDO purposes\n\ + [default value for speed levels: 10,9 - 10; 8,7,6 - 20; 5,4,3 - 30; 2,1,0 - 40]\n") .long("rdo-lookahead-frames") .takes_value(true) ) @@ -669,8 +670,16 @@ fn parse_config(matches: &ArgMatches<'_>) -> Result { cfg.reservoir_frame_delay = matches .value_of("RESERVOIR_FRAME_DELAY") .map(|reservior_frame_delay| reservior_frame_delay.parse().unwrap()); - cfg.rdo_lookahead_frames = - matches.value_of("RDO_LOOKAHEAD_FRAMES").unwrap_or("40").parse().unwrap(); + + // rdo-lookahead-frames + let maybe_rdo = matches.value_of("RDO_LOOKAHEAD_FRAMES"); + if maybe_rdo.is_some() { + cfg.rdo_lookahead_frames = + matches.value_of("RDO_LOOKAHEAD_FRAMES").unwrap().parse().unwrap(); + } else { + cfg.rdo_lookahead_frames = SpeedSettings::rdo_lookahead_frames(speed) + } + cfg.tune = matches.value_of("TUNE").unwrap().parse().unwrap(); if cfg.tune == Tune::Psychovisual { From 3e91d52d086782b5032e6676b58f7a7b5051c92e Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Sat, 23 Jan 2021 17:39:55 -0800 Subject: [PATCH 040/155] Document minimum Rust version in README.md We spent several paragraphs talking about nasm, but never mentioned Rust. --- README.md | 3 +++ build.rs | 1 + 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 5a2a1d6ccb..0cf8a0fbe3 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ For the foreseeable future, a weekly pre-release of rav1e will be [published](ht ## Building +### Toolchain: Rust +rav1e currently requires Rust 1.44.1 or later to build. + ### Dependency: NASM Some `x86_64`-specific optimizations require [NASM](https://nasm.us/) `2.14.02` or newer and are enabled by default. diff --git a/build.rs b/build.rs index c9b81f64a2..3b31f7a855 100644 --- a/build.rs +++ b/build.rs @@ -179,6 +179,7 @@ fn build_asm_files() { fn rustc_version_check() { // This should match the version in the CI + // Make sure to updated README.md when this changes. const REQUIRED_VERSION: &str = "1.44.1"; if version().unwrap() < Version::parse(REQUIRED_VERSION).unwrap() { eprintln!("rav1e requires rustc >= {}.", REQUIRED_VERSION); From 98e1872b1c955d598537fc25c5266601b9342252 Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Sat, 23 Jan 2021 17:46:58 -0800 Subject: [PATCH 041/155] Fix URL for theoretical_results.pdf. RIP people.xiph.org. 
--- src/quantize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quantize.rs b/src/quantize.rs index dcea1fab74..597c185699 100644 --- a/src/quantize.rs +++ b/src/quantize.rs @@ -230,7 +230,7 @@ impl QuantizationContext { // to be quite close (+/- 1/256), for both inter and intra, // post-deadzoning. // - // [1] https://people.xiph.org/~jm/notes/theoretical_results.pdf + // [1] https://jmvalin.ca/notes/theoretical_results.pdf self.dc_offset = self.dc_quant * (if is_intra { 109 } else { 108 }) / 256; self.ac_offset0 = self.ac_quant * (if is_intra { 98 } else { 97 }) / 256; self.ac_offset1 = self.ac_quant * (if is_intra { 109 } else { 108 }) / 256; From 6b068d1d553695bd48f325ba7361e950a85fb546 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Mon, 18 May 2020 16:34:40 -0400 Subject: [PATCH 042/155] Fix intermittent dequantize test failure The test was generating values one less than -i16::MAX. Fixes #2276 --- src/asm/x86/quantize.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 81268deb89..27eaeb33a6 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -154,6 +154,7 @@ unsafe fn dequantize_avx2( #[cfg(test)] mod test { use super::*; + use rand::distributions::{Distribution, Uniform}; use rand::{thread_rng, Rng}; #[test] @@ -191,9 +192,10 @@ mod test { let mut rcoeffs = Aligned::new([0i16; 32 * 32]); // Generate quantized coefficients upto the eob + let between = Uniform::from(-std::i16::MAX..=std::i16::MAX); for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob) { - *qcoeff = - rng.gen::() / if i == 0 { dc_quant } else { ac_quant }; + *qcoeff = between.sample(&mut rng) + / if i == 0 { dc_quant } else { ac_quant }; } // Rely on quantize's internal tests From 2e43720073e7fa1acd1d0ec1c591d2ee0ac382c1 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Fri, 26 Feb 2021 15:41:44 +0900 Subject: [PATCH 043/155] CI: Update libdav1d to 0.8.2-dmo1 --- .github/workflows/rav1e.yml | 6 +++--- .travis/install-dav1d.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index b482f264f3..b0d911762c 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -144,11 +144,11 @@ jobs: matrix.conf == 'grcov-coveralls' || matrix.conf == 'fuzz' || matrix.conf == 'no-asm-tests' env: LINK: https://www.deb-multimedia.org/pool/main/d/dav1d-dmo - DAV1D_VERSION: 0.8.1-dmo1 + DAV1D_VERSION: 0.8.2-dmo1 DAV1D_DEV_SHA256: >- - dcf911325699d93a90818e16736e2c93b29d8e7538c1545accd3b25c610876c0 + 04d30fc34056467b91a627563c61b9a0046a2e084bb649791cd31887a6c76d8e DAV1D_LIB_SHA256: >- - 06f51b9660d413417827270b298e2ad541bd8ddaae7e027ebcb6bb7b6b1ad006 + 0c3debb3a926e10009503e639dddcfd4082ed6e012340ca49682b738c243dedc run: | echo "$LINK/libdav1d-dev_${DAV1D_VERSION}_amd64.deb" >> DEBS echo "$LINK/libdav1d5_${DAV1D_VERSION}_amd64.deb" >> DEBS diff --git a/.travis/install-dav1d.sh b/.travis/install-dav1d.sh index 5a2f2b1642..deddb47689 100755 --- a/.travis/install-dav1d.sh +++ b/.travis/install-dav1d.sh @@ -1,7 +1,7 @@ #!/bin/bash set -ex -DAV1D_VERSION="0.8.1-dmo1" +DAV1D_VERSION="0.8.2-dmo1" PKG_URL="https://www.deb-multimedia.org/pool/main/d/dav1d-dmo" case "$ARCH" in @@ -17,10 +17,10 @@ curl -O "$PKG_URL/libdav1d-dev_${DAV1D_VERSION}_$ARCH.deb" \ -O "$PKG_URL/libdav1d5_${DAV1D_VERSION}_$ARCH.deb" sha256sum --check --ignore-missing < Date: Fri, 22 Jan 2021 00:05:53 +0900 Subject: [PATCH 
044/155] Prepare to merge src/x86/cdef.asm Rename so that upstream patches can be replayed verbatim. --- build.rs | 2 +- src/x86/{cdef.asm => cdef_rav1e.asm} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/x86/{cdef.asm => cdef_rav1e.asm} (100%) diff --git a/build.rs b/build.rs index 3b31f7a855..22dbd222b9 100644 --- a/build.rs +++ b/build.rs @@ -99,7 +99,7 @@ fn build_nasm_files() { "src/x86/sad_avx.asm", "src/x86/satd.asm", "src/x86/sse.asm", - "src/x86/cdef.asm", + "src/x86/cdef_rav1e.asm", "src/x86/cdef_sse.asm", "src/x86/cdef16_avx2.asm", "src/x86/cdef16_sse.asm", diff --git a/src/x86/cdef.asm b/src/x86/cdef_rav1e.asm similarity index 100% rename from src/x86/cdef.asm rename to src/x86/cdef_rav1e.asm From 690bdc65c87bc1bc0e380db93f1f01a2035dd14d Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 27 Oct 2018 18:38:21 -0400 Subject: [PATCH 045/155] Add AVX2 SIMD implementation for cdef_dir cdef_dir_8bpc_c: 629.3 cdef_dir_8bpc_avx2: 82.4 First 1000 frames of Chimera 1080p: before: 0m23.084s after: 0m21.860s --- src/x86/cdef.asm | 264 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/x86/cdef.asm diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm new file mode 100644 index 0000000000..8ae4c3b476 --- /dev/null +++ b/src/x86/cdef.asm @@ -0,0 +1,264 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 + dd 420, 210, 140, 105 +pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7 +shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pw_128: times 2 dw 128 + +SECTION .text + +INIT_YMM avx2 +cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 + lea stride3q, [strideq*3] + movq xm0, [srcq+strideq*0] + movq xm1, [srcq+strideq*1] + movq xm2, [srcq+strideq*2] + movq xm3, [srcq+stride3q] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpbroadcastq m6, [srcq+strideq*2] + vpbroadcastq m7, [srcq+stride3q] + vpbroadcastd m8, [pw_128] + pxor m9, m9 + + vpblendd m0, m0, m7, 0xf0 + vpblendd m1, m1, m6, 0xf0 + vpblendd m2, m2, m5, 0xf0 + vpblendd m3, m3, m4, 0xf0 + + punpcklbw m0, m9 + punpcklbw m1, m9 + punpcklbw m2, m9 + punpcklbw m3, m9 + + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + + ; shuffle registers to generate partial_sum_diag[0-1] together + vpermq m7, m0, q1032 + vpermq m6, m1, q1032 + vpermq m5, m2, q1032 + vpermq m4, m3, q1032 + + ; start with partial_sum_hv[0-1] + paddw m8, m0, m1 + paddw m9, m2, m3 + phaddw m10, m0, m1 + phaddw m11, m2, m3 + paddw m8, m9 + phaddw m10, m11 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + paddw xm8, xm9 ; partial_sum_hv[1] + phaddw xm10, xm11 ; partial_sum_hv[0] + vinserti128 m8, xm10, 1 + vpbroadcastd m9, [div_table+44] + pmaddwd m8, m8 + pmulld m8, m9 ; cost6[2a-d] | cost2[a-d] + + ; create aggregates [lower half]: + ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+ + ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0 + ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+ + ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x + ; and [upper half]: + ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+ + ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567 + ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+ + ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx + ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero] + vbroadcasti128 m14, [shufw_6543210x] + vbroadcasti128 m13, [div_table+16] + vbroadcasti128 m12, [div_table+0] + paddw m9, m0 ; partial_sum_diag[0/1][0-7] + pshufb m10, m14 + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + pmulld m11, m13 + pmulld m9, m12 + paddd m9, m11 ; cost0[a-d] | cost4[a-d] + + ; merge horizontally and vertically for partial_sum_alt[0-3] + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; create aggregates [lower half]: + ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234 + ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx + ; and [upper half]: + ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 + ; m11= 
m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx + ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + + vbroadcasti128 m14, [shufw_210xxxxx] + pslldq m4, m11, 2 + psrldq m11, 14 + pslldq m5, m12, 4 + psrldq m12, 12 + pslldq m6, m13, 6 + psrldq m13, 10 + paddw m4, m10 + paddw m11, m12 + vpbroadcastd m12, [div_table+44] + paddw m5, m6 + paddw m11, m13 ; partial_sum_alt[3/2] right + vbroadcasti128 m13, [div_table+32] + paddw m4, m5 ; partial_sum_alt[3/2] left + pshufb m11, m14 + punpckhwd m6, m4, m11 + punpcklwd m4, m11 + pmaddwd m6, m6 + pmaddwd m4, m4 + pmulld m6, m12 + pmulld m4, m13 + paddd m4, m6 ; cost7[a-d] | cost5[a-d] + + ; create aggregates [lower half]: + ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234 + ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx + ; and [upper half]: + ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 + ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx + ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + + pslldq m5, m1, 2 + psrldq m1, 14 + pslldq m6, m2, 4 + psrldq m2, 12 + pslldq m7, m3, 6 + psrldq m3, 10 + paddw m5, m0 + paddw m1, m2 + paddw m6, m7 + paddw m1, m3 ; partial_sum_alt[0/1] right + paddw m5, m6 ; partial_sum_alt[0/1] left + pshufb m1, m14 + punpckhwd m6, m5, m1 + punpcklwd m5, m1 + pmaddwd m6, m6 + pmaddwd m5, m5 + pmulld m6, m12 + pmulld m5, m13 + paddd m5, m6 ; cost1[a-d] | cost3[a-d] + + mova xm0, [pd_04512763+ 0] + mova xm1, [pd_04512763+ 16] + phaddd m9, m8 + phaddd m5, m4 + phaddd m9, m5 + vpermd m0, m9 ; cost[0/4/2/6] + vpermd m1, m9 ; cost[1/5/3/7] + + ; now find the best cost, its idx^4 complement, and its idx + pcmpgtd xm2, xm1, xm0 ; [1/5/3/7] > [0/4/2/6] + pand xm3, xm2, xm1 + pandn xm4, xm2, xm0 + por xm3, xm4 ; higher 4 values + pshufd xm1, xm1, q2301 + pshufd xm0, xm0, q2301 + pand xm1, xm2, xm1 + pandn xm0, xm2, xm0 + por xm0, xm1 ; complementary 4 values at idx^4 offset + pand xm13, xm2, [pd_04261537+16] + pandn xm14, xm2, [pd_04261537+ 0] + por xm14, xm13 ; indices + + punpckhqdq xm4, xm3, xm0 + punpcklqdq xm3, xm0 + pcmpgtd xm5, xm4, xm3 ; [2or3-6or7] > [0or1/4or5] + punpcklqdq xm5, xm5 + pand xm6, xm5, xm4 + pandn xm7, xm5, xm3 + por xm6, xm7 ; { highest 2 values, complements at idx^4 } + movhlps xm13, xm14 + pand xm13, xm5, xm13 + pandn xm14, xm5, xm14 + por xm14, xm13 + + pshufd xm7, xm6, q3311 + pcmpgtd xm8, xm7, xm6 ; [4or5or6or7] > [0or1or2or3] + punpcklqdq xm8, xm8 + pand xm9, xm8, xm7 + pandn xm10, xm8, xm6 + por xm9, xm10 ; max + movhlps xm10, xm9 ; complement at idx^4 + psubd xm9, xm10 + psrld xm9, 10 + movd [varq], xm9 + pshufd xm13, xm14, q1111 + pand xm13, xm8, xm13 + pandn xm14, xm8, xm14 + por xm14, xm13 + movd eax, xm14 + RET +%endif ; ARCH_X86_64 From 3f74513d01c8d05c8fd6dddc0723652717af08d1 Mon Sep 17 00:00:00 2001 From: "Ronald S. 
Bultje" Date: Mon, 29 Oct 2018 09:26:24 -0400 Subject: [PATCH 046/155] Add 8x8 cdef_filter AVX2 implementation cdef_filter_8x8_8bpc_c: 7913.0 cdef_filter_8x8_8bpc_avx2: 309.9 First 1000 frames of Chimera 1080p: before: 0m23.100s after: 0m17.863s --- src/x86/cdef.asm | 312 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 312 insertions(+) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 8ae4c3b476..4956da8ede 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -36,9 +36,321 @@ pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_128: times 2 dw 128 +pw_2048: times 2 dw 2048 +tap_table: dw 4, 2, 3, 3, 2, 1, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 SECTION .text +INIT_YMM avx2 +cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ + pri, sec, stride3, dst4, edge +%define px rsp+32+2*32 + pcmpeqw m14, m14 + psrlw m14, 1 ; 0x7fff + mov edged, r8m + + ; prepare pixel buffers - body/right + lea dst4q, [dstq+strideq*4] + lea stride3q, [strideq*3] + test edged, 2 ; have_right + jz .no_right + pmovzxbw m1, [dstq+strideq*0] + pmovzxbw m2, [dstq+strideq*1] + pmovzxbw m3, [dstq+strideq*2] + pmovzxbw m4, [dstq+stride3q] + movu [px+0*32], m1 + movu [px+1*32], m2 + movu [px+2*32], m3 + movu [px+3*32], m4 + pmovzxbw m1, [dst4q+strideq*0] + pmovzxbw m2, [dst4q+strideq*1] + pmovzxbw m3, [dst4q+strideq*2] + pmovzxbw m4, [dst4q+stride3q] + movu [px+4*32], m1 + movu [px+5*32], m2 + movu [px+6*32], m3 + movu [px+7*32], m4 + jmp .body_done +.no_right: + pmovzxbw xm1, [dstq+strideq*0] + pmovzxbw xm2, [dstq+strideq*1] + pmovzxbw xm3, [dstq+strideq*2] + pmovzxbw xm4, [dstq+stride3q] + movu [px+0*32], xm1 + movu [px+1*32], xm2 + movu [px+2*32], xm3 + movu [px+3*32], xm4 + movd [px+0*32+16], xm14 + movd [px+1*32+16], xm14 + movd [px+2*32+16], xm14 + movd [px+3*32+16], xm14 + pmovzxbw xm1, [dst4q+strideq*0] + pmovzxbw xm2, [dst4q+strideq*1] + pmovzxbw xm3, [dst4q+strideq*2] + pmovzxbw xm4, [dst4q+stride3q] + movu [px+4*32], xm1 + movu [px+5*32], xm2 + movu [px+6*32], xm3 + movu [px+7*32], xm4 + movd [px+4*32+16], xm14 + movd [px+5*32+16], xm14 + movd [px+6*32+16], xm14 + movd [px+7*32+16], xm14 +.body_done: + + ; top + DEFINE_ARGS dst, stride, left, top2, pri, sec, top1, dummy, edge + test edged, 4 ; have_top + jz .no_top + mov top1q, [top2q+0*gprsize] + mov top2q, [top2q+1*gprsize] + test edged, 1 ; have_left + jz .top_no_left + test edged, 2 ; have_right + jz .top_no_right + pmovzxbw m1, [top1q-4] + pmovzxbw m2, [top2q-4] + movu [px-2*32-8], m1 + movu [px-1*32-8], m2 + jmp .top_done +.top_no_right: + pmovzxbw m1, [top1q-8] + pmovzxbw m2, [top2q-8] + movu [px-2*32-16], m1 + movu [px-1*32-16], m2 + movd [px-2*32+16], xm14 + movd [px-1*32+16], xm14 + jmp .top_done +.top_no_left: + test edged, 2 ; have_right + jz .top_no_left_right + pmovzxbw m1, [top1q] + pmovzxbw m2, [top2q] + movu [px-2*32+0], m1 + movu [px-1*32+0], m2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + jmp .top_done +.top_no_left_right: + pmovzxbw xm1, [top1q] 
+ pmovzxbw xm2, [top2q] + movu [px-2*32+0], xm1 + movu [px-1*32+0], xm2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + movd [px-2*32+16], xm14 + movd [px-1*32+16], xm14 + jmp .top_done +.no_top: + movu [px-2*32-8], m14 + movu [px-1*32-8], m14 +.top_done: + + ; left + test edged, 1 ; have_left + jz .no_left + pmovzxbw xm1, [leftq+ 0] + pmovzxbw xm2, [leftq+ 8] + movd [px+0*32-4], xm1 + pextrd [px+1*32-4], xm1, 1 + pextrd [px+2*32-4], xm1, 2 + pextrd [px+3*32-4], xm1, 3 + movd [px+4*32-4], xm2 + pextrd [px+5*32-4], xm2, 1 + pextrd [px+6*32-4], xm2, 2 + pextrd [px+7*32-4], xm2, 3 + jmp .left_done +.no_left: + movd [px+0*32-4], xm14 + movd [px+1*32-4], xm14 + movd [px+2*32-4], xm14 + movd [px+3*32-4], xm14 + movd [px+4*32-4], xm14 + movd [px+5*32-4], xm14 + movd [px+6*32-4], xm14 + movd [px+7*32-4], xm14 +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, dummy2, dummy3, edge + test edged, 8 ; have_bottom + jz .no_bottom + lea dst8q, [dstq+8*strideq] + test edged, 1 ; have_left + jz .bottom_no_left + test edged, 2 ; have_right + jz .bottom_no_right + pmovzxbw m1, [dst8q-4] + pmovzxbw m2, [dst8q+strideq-4] + movu [px+8*32-8], m1 + movu [px+9*32-8], m2 + jmp .bottom_done +.bottom_no_right: + pmovzxbw m1, [dst8q-8] + pmovzxbw m2, [dst8q+strideq-8] + movu [px+8*32-16], m1 + movu [px+9*32-16], m2 + movd [px+7*32+16], xm14 ; overwritten by previous movu + movd [px+8*32+16], xm14 + movd [px+9*32+16], xm14 + jmp .bottom_done +.bottom_no_left: + test edged, 2 ; have_right + jz .bottom_no_left_right + pmovzxbw m1, [dst8q] + pmovzxbw m2, [dst8q+strideq] + movu [px+8*32+0], m1 + movu [px+9*32+0], m2 + movd [px+8*32-4], xm14 + movd [px+9*32-4], xm14 + jmp .bottom_done +.bottom_no_left_right: + pmovzxbw xm1, [dst8q] + pmovzxbw xm2, [dst8q+strideq] + movu [px+8*32+0], xm1 + movu [px+9*32+0], xm2 + movd [px+8*32-4], xm14 + movd [px+9*32-4], xm14 + movd [px+8*32+16], xm14 + movd [px+9*32+16], xm14 + jmp .bottom_done +.no_bottom: + movu [px+8*32-8], m14 + movu [px+9*32-8], m14 +.bottom_done: + + ; actual filter + DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp +%undef edged + movifnidn prid, prim + movifnidn secd, secm + mov dampingd, r7m + + mov pridmpd, prid + mov secdmpd, secd + or pridmpd, 1 + or secdmpd, 1 + lzcnt pridmpd, pridmpd + lzcnt secdmpd, secdmpd + lea pridmpd, [pridmpd+dampingd-31] + lea secdmpd, [secdmpd+dampingd-31] + xor dampingd, dampingd + test pridmpd, pridmpd + cmovl pridmpd, dampingd + test secdmpd, secdmpd + cmovl secdmpd, dampingd + mov [rsp+0], pridmpq ; pri_shift + mov [rsp+8], secdmpq ; sec_shift + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, tap, dummy, pri, sec + movd xm0, prid + movd xm1, secd + vpbroadcastw m0, xm0 ; pri_strength + vpbroadcastw m1, xm1 ; sec_strength + and prid, 1 + and secd, 1 + lea tapq, [tap_table] + lea priq, [tapq+priq*4] ; pri_taps + lea secq, [tapq+secq*4+8] ; sec_taps + + ; off1/2/3[k] [6 total] from [tapq+16+(dir+0/2/6)*2+k] + DEFINE_ARGS dst, stride, tap, dir, pri, sec + mov dird, r6m + lea tapq, [tapq+dirq*2+16] + DEFINE_ARGS dst, stride, dir, h, pri, sec, stk, off, k + mov hd, 4 + lea stkq, [px] + pxor m13, m13 +.v_loop: + mov kd, 1 + mova xm4, [stkq+32*0] ; px + vinserti128 m4, [stkq+32*1], 1 + pxor m15, m15 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + vpbroadcastw m2, [priq+kq*2] ; pri_taps + vpbroadcastw m3, [secq+kq*2] ; sec_taps + +%macro ACCUMULATE_TAP 4 ; tap_offset, shift, strength, mul_tap + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 + movu xm5, [stkq+offq*2+32*0] ; p0 
+ vinserti128 m5, [stkq+offq*2+32*1], 1 + neg offq ; -off1 + movu xm6, [stkq+offq*2+32*0] ; p1 + vinserti128 m6, [stkq+offq*2+32*1], 1 + pcmpeqw m9, m14, m5 + pcmpeqw m10, m14, m6 + pandn m9, m5 + pandn m10, m6 + pmaxsw m7, m9 ; max after p0 + pminsw m8, m5 ; min after p0 + pmaxsw m7, m10 ; max after p1 + pminsw m8, m6 ; min after p1 + + ; accumulate sum[m15] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + pabsw m9, m5 + pabsw m10, m6 + psraw m11, m9, %2 + psraw m12, m10, %2 + psubw m11, %3, m11 + psubw m12, %3, m12 + pmaxsw m11, m13 + pmaxsw m12, m13 + pminsw m11, m9 + pminsw m12, m10 + psignw m11, m5 ; constrain(diff_p0) + psignw m12, m6 ; constrain(diff_p1) + pmullw m11, %4 ; constrain(diff_p0) * pri_taps + pmullw m12, %4 ; constrain(diff_p1) * pri_taps + paddw m15, m11 + paddw m15, m12 +%endmacro + + ACCUMULATE_TAP 0*2, [rsp+0], m0, m2 + ACCUMULATE_TAP 2*2, [rsp+8], m1, m3 + ACCUMULATE_TAP 6*2, [rsp+8], m1, m3 + + dec kq + jge .k_loop + + vpbroadcastd m12, [pw_2048] + pcmpgtw m11, m13, m15 + paddw m15, m11 + pmulhrsw m15, m12 + paddw m4, m15 + pminsw m4, m7 + pmaxsw m4, m8 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + lea dstq, [dstq+strideq*2] + add stkq, 32*2 + dec hd + jg .v_loop + + RET + INIT_YMM avx2 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 lea stride3q, [strideq*3] From d337f7230522ab63f1ef699939f5a98f17122189 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 26 Oct 2018 17:00:39 -0400 Subject: [PATCH 047/155] Add a 4x4 cdef_filter AVX2 implementation cdef_filter_4x4_8bpc_c: 2273.6 cdef_filter_4x4_8bpc_avx2: 113.6 Decoding time reduces to 15.51s for first 1000 frames of chimera 1080p, from 23.1 before cdef_filter SIMD or 17.86 with only 8x8 cdef_filter SIMD. 
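For reference, the scalar operation that the ACCUMULATE_TAP macro in these patches vectorizes is the AV1 CDEF constrain step. A minimal Rust sketch (illustrative only; the function name and signature are not the actual rav1e/dav1d API) of how strength and damping combine:

```
// constrain(): attenuate the neighbour/centre difference by the strength,
// with the attenuation shift derived from damping and floor(log2(strength)).
// This mirrors the lzcnt/damping arithmetic in the asm prologue
// (shift = max(0, damping - 31 + lzcnt(strength))).
fn constrain(diff: i32, strength: i32, damping: i32) -> i32 {
    if strength == 0 {
        return 0;
    }
    let log2_str = 31 - (strength as u32).leading_zeros() as i32;
    let shift = (damping - log2_str).max(0);
    let adiff = diff.abs();
    let clamped = adiff.min((strength - (adiff >> shift)).max(0));
    if diff < 0 { -clamped } else { clamped }
}
```

Each tap then adds tap_weight * constrain(p - px, strength, damping) to the running sum that the SIMD code keeps in m15.
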
--- src/x86/cdef.asm | 361 ++++++++++++++++++++++++++++++----------------- 1 file changed, 229 insertions(+), 132 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 4956da8ede..758cae27e5 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -56,16 +56,76 @@ tap_table: dw 4, 2, 3, 3, 2, 1, 2, 1 SECTION .text +%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride + ; load p0/p1 + movsx offq, byte [dirq+kq+%1] ; off1 +%if %5 == 4 + movq xm5, [stkq+offq*2+%6*0] ; p0 + movq xm6, [stkq+offq*2+%6*2] + movhps xm5, [stkq+offq*2+%6*1] + movhps xm6, [stkq+offq*2+%6*3] + vinserti128 m5, xm6, 1 +%else + movu xm5, [stkq+offq*2+%6*0] ; p0 + vinserti128 m5, [stkq+offq*2+%6*1], 1 +%endif + neg offq ; -off1 +%if %5 == 4 + movq xm6, [stkq+offq*2+%6*0] ; p1 + movq xm9, [stkq+offq*2+%6*2] + movhps xm6, [stkq+offq*2+%6*1] + movhps xm9, [stkq+offq*2+%6*3] + vinserti128 m6, xm9, 1 +%else + movu xm6, [stkq+offq*2+%6*0] ; p1 + vinserti128 m6, [stkq+offq*2+%6*1], 1 +%endif + pcmpeqw m9, m14, m5 + pcmpeqw m10, m14, m6 + pandn m9, m5 + pandn m10, m6 + pmaxsw m7, m9 ; max after p0 + pminsw m8, m5 ; min after p0 + pmaxsw m7, m10 ; max after p1 + pminsw m8, m6 ; min after p1 + + ; accumulate sum[m15] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + pabsw m9, m5 + pabsw m10, m6 + psraw m11, m9, %2 + psraw m12, m10, %2 + psubw m11, %3, m11 + psubw m12, %3, m12 + pmaxsw m11, m13 + pmaxsw m12, m13 + pminsw m11, m9 + pminsw m12, m10 + psignw m11, m5 ; constrain(diff_p0) + psignw m12, m6 ; constrain(diff_p1) + pmullw m11, %4 ; constrain(diff_p0) * pri_taps + pmullw m12, %4 ; constrain(diff_p1) * pri_taps + paddw m15, m11 + paddw m15, m12 +%endmacro + +%macro cdef_filter_fn 3 ; w, h, stride INIT_YMM avx2 -cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ - pri, sec, stride3, dst4, edge -%define px rsp+32+2*32 +cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, stride3, dst4, edge +%define px rsp+2*16+2*%3 pcmpeqw m14, m14 psrlw m14, 1 ; 0x7fff mov edged, r8m ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 lea dst4q, [dstq+strideq*4] +%endif lea stride3q, [strideq*3] test edged, 2 ; have_right jz .no_right @@ -73,48 +133,70 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ pmovzxbw m2, [dstq+strideq*1] pmovzxbw m3, [dstq+strideq*2] pmovzxbw m4, [dstq+stride3q] - movu [px+0*32], m1 - movu [px+1*32], m2 - movu [px+2*32], m3 - movu [px+3*32], m4 + mova [px+0*%3], m1 + mova [px+1*%3], m2 + mova [px+2*%3], m3 + mova [px+3*%3], m4 +%if %2 == 8 pmovzxbw m1, [dst4q+strideq*0] pmovzxbw m2, [dst4q+strideq*1] pmovzxbw m3, [dst4q+strideq*2] pmovzxbw m4, [dst4q+stride3q] - movu [px+4*32], m1 - movu [px+5*32], m2 - movu [px+6*32], m3 - movu [px+7*32], m4 + mova [px+4*%3], m1 + mova [px+5*%3], m2 + mova [px+6*%3], m3 + mova [px+7*%3], m4 +%endif jmp .body_done .no_right: +%if %1 == 4 + movd xm1, [dstq+strideq*0] + movd xm2, [dstq+strideq*2] + pinsrd xm1, [dstq+strideq*1], 1 + pinsrd xm2, [dstq+stride3q], 1 + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + movq [px+0*%3], xm1 + movhps [px+1*%3], xm1 + movq [px+2*%3], xm2 + movhps [px+3*%3], xm2 +%else pmovzxbw xm1, [dstq+strideq*0] pmovzxbw xm2, [dstq+strideq*1] pmovzxbw xm3, [dstq+strideq*2] pmovzxbw xm4, [dstq+stride3q] - movu [px+0*32], xm1 - movu [px+1*32], xm2 - movu [px+2*32], xm3 - movu [px+3*32], xm4 - movd [px+0*32+16], xm14 - movd [px+1*32+16], xm14 - movd [px+2*32+16], xm14 - movd [px+3*32+16], 
xm14 + mova [px+0*%3], xm1 + mova [px+1*%3], xm2 + mova [px+2*%3], xm3 + mova [px+3*%3], xm4 +%endif + movd [px+0*%3+%1*2], xm14 + movd [px+1*%3+%1*2], xm14 + movd [px+2*%3+%1*2], xm14 + movd [px+3*%3+%1*2], xm14 +%if %2 == 8 + ; FIXME w == 4 + movd [px+0*%3+%1*2], xm14 + movd [px+1*%3+%1*2], xm14 + movd [px+2*%3+%1*2], xm14 + movd [px+3*%3+%1*2], xm14 pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] pmovzxbw xm4, [dst4q+stride3q] - movu [px+4*32], xm1 - movu [px+5*32], xm2 - movu [px+6*32], xm3 - movu [px+7*32], xm4 - movd [px+4*32+16], xm14 - movd [px+5*32+16], xm14 - movd [px+6*32+16], xm14 - movd [px+7*32+16], xm14 + mova [px+4*%3], xm1 + mova [px+5*%3], xm2 + mova [px+6*%3], xm3 + mova [px+7*%3], xm4 + movd [px+4*%3+%1*2], xm14 + movd [px+5*%3+%1*2], xm14 + movd [px+6*%3+%1*2], xm14 + movd [px+7*%3+%1*2], xm14 +%endif .body_done: ; top - DEFINE_ARGS dst, stride, left, top2, pri, sec, top1, dummy, edge + DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge test edged, 4 ; have_top jz .no_top mov top1q, [top2q+0*gprsize] @@ -123,119 +205,144 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ jz .top_no_left test edged, 2 ; have_right jz .top_no_right - pmovzxbw m1, [top1q-4] - pmovzxbw m2, [top2q-4] - movu [px-2*32-8], m1 - movu [px-1*32-8], m2 + pmovzxbw m1, [top1q-(%1/2)] + pmovzxbw m2, [top2q-(%1/2)] + movu [px-2*%3-%1], m1 + movu [px-1*%3-%1], m2 jmp .top_done .top_no_right: - pmovzxbw m1, [top1q-8] - pmovzxbw m2, [top2q-8] - movu [px-2*32-16], m1 - movu [px-1*32-16], m2 - movd [px-2*32+16], xm14 - movd [px-1*32+16], xm14 + pmovzxbw m1, [top1q-%1] + pmovzxbw m2, [top2q-%1] + movu [px-2*%3-%1*2], m1 + movu [px-1*%3-%1*2], m2 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 jmp .top_done .top_no_left: test edged, 2 ; have_right jz .top_no_left_right pmovzxbw m1, [top1q] pmovzxbw m2, [top2q] - movu [px-2*32+0], m1 - movu [px-1*32+0], m2 - movd [px-2*32-4], xm14 - movd [px-1*32-4], xm14 + mova [px-2*%3+0], m1 + mova [px-1*%3+0], m2 + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], xm14 jmp .top_done .top_no_left_right: +%if %1 == 4 + movd xm1, [top1q] + pinsrd xm1, [top2q], 1 + pmovzxbw xm1, xm1 + movq [px-2*%3+0], xm1 + movhps [px-1*%3+0], xm1 +%else pmovzxbw xm1, [top1q] pmovzxbw xm2, [top2q] - movu [px-2*32+0], xm1 - movu [px-1*32+0], xm2 - movd [px-2*32-4], xm14 - movd [px-1*32-4], xm14 - movd [px-2*32+16], xm14 - movd [px-1*32+16], xm14 + mova [px-2*%3+0], xm1 + mova [px-1*%3+0], xm2 +%endif + movd [px-2*%3-4], xm14 + movd [px-1*%3-4], xm14 + movd [px-2*%3+%1*2], xm14 + movd [px-1*%3+%1*2], xm14 jmp .top_done .no_top: - movu [px-2*32-8], m14 - movu [px-1*32-8], m14 + movu [px-2*%3-%1], m14 + movu [px-1*%3-%1], m14 .top_done: ; left test edged, 1 ; have_left jz .no_left pmovzxbw xm1, [leftq+ 0] +%if %2 == 8 pmovzxbw xm2, [leftq+ 8] +%endif movd [px+0*32-4], xm1 pextrd [px+1*32-4], xm1, 1 pextrd [px+2*32-4], xm1, 2 pextrd [px+3*32-4], xm1, 3 +%if %2 == 8 movd [px+4*32-4], xm2 pextrd [px+5*32-4], xm2, 1 pextrd [px+6*32-4], xm2, 2 pextrd [px+7*32-4], xm2, 3 +%endif jmp .left_done .no_left: - movd [px+0*32-4], xm14 - movd [px+1*32-4], xm14 - movd [px+2*32-4], xm14 - movd [px+3*32-4], xm14 - movd [px+4*32-4], xm14 - movd [px+5*32-4], xm14 - movd [px+6*32-4], xm14 - movd [px+7*32-4], xm14 + movd [px+0*%3-4], xm14 + movd [px+1*%3-4], xm14 + movd [px+2*%3-4], xm14 + movd [px+3*%3-4], xm14 +%if %2 == 8 + movd [px+4*%3-4], xm14 + movd [px+5*%3-4], xm14 + movd [px+6*%3-4], xm14 + movd [px+7*%3-4], xm14 +%endif 
.left_done: ; bottom - DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, dummy2, dummy3, edge + DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge test edged, 8 ; have_bottom jz .no_bottom - lea dst8q, [dstq+8*strideq] + lea dst8q, [dstq+%2*strideq] test edged, 1 ; have_left jz .bottom_no_left test edged, 2 ; have_right jz .bottom_no_right - pmovzxbw m1, [dst8q-4] - pmovzxbw m2, [dst8q+strideq-4] - movu [px+8*32-8], m1 - movu [px+9*32-8], m2 + pmovzxbw m1, [dst8q-(%1/2)] + pmovzxbw m2, [dst8q+strideq-(%1/2)] + movu [px+(%2+0)*%3-%1], m1 + movu [px+(%2+1)*%3-%1], m2 jmp .bottom_done .bottom_no_right: - pmovzxbw m1, [dst8q-8] - pmovzxbw m2, [dst8q+strideq-8] - movu [px+8*32-16], m1 - movu [px+9*32-16], m2 - movd [px+7*32+16], xm14 ; overwritten by previous movu - movd [px+8*32+16], xm14 - movd [px+9*32+16], xm14 + pmovzxbw m1, [dst8q-%1] + pmovzxbw m2, [dst8q+strideq-%1] + movu [px+(%2+0)*%3-%1*2], m1 + movu [px+(%2+1)*%3-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*%3+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 jmp .bottom_done .bottom_no_left: test edged, 2 ; have_right jz .bottom_no_left_right pmovzxbw m1, [dst8q] pmovzxbw m2, [dst8q+strideq] - movu [px+8*32+0], m1 - movu [px+9*32+0], m2 - movd [px+8*32-4], xm14 - movd [px+9*32-4], xm14 + mova [px+(%2+0)*%3+0], m1 + mova [px+(%2+1)*%3+0], m2 + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 jmp .bottom_done .bottom_no_left_right: +%if %1 == 4 + movd xm1, [dst8q] + pinsrd xm1, [dst8q+strideq], 1 + pmovzxbw xm1, xm1 + movq [px+(%2+0)*%3+0], xm1 + movhps [px+(%2+1)*%3+0], xm1 +%else pmovzxbw xm1, [dst8q] pmovzxbw xm2, [dst8q+strideq] - movu [px+8*32+0], xm1 - movu [px+9*32+0], xm2 - movd [px+8*32-4], xm14 - movd [px+9*32-4], xm14 - movd [px+8*32+16], xm14 - movd [px+9*32+16], xm14 + mova [px+(%2+0)*%3+0], xm1 + mova [px+(%2+1)*%3+0], xm2 +%endif + movd [px+(%2+0)*%3-4], xm14 + movd [px+(%2+1)*%3-4], xm14 + movd [px+(%2+0)*%3+%1*2], xm14 + movd [px+(%2+1)*%3+%1*2], xm14 jmp .bottom_done .no_bottom: - movu [px+8*32-8], m14 - movu [px+9*32-8], m14 + movu [px+(%2+0)*%3-%1], m14 + movu [px+(%2+1)*%3-%1], m14 .bottom_done: ; actual filter - DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp + INIT_YMM avx2 + DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp %undef edged movifnidn prid, prim movifnidn secd, secm @@ -258,7 +365,7 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ mov [rsp+8], secdmpq ; sec_shift ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, tap, dummy, pri, sec + DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3 movd xm0, prid movd xm1, secd vpbroadcastw m0, xm0 ; pri_strength @@ -270,17 +377,31 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ lea secq, [tapq+secq*4+8] ; sec_taps ; off1/2/3[k] [6 total] from [tapq+16+(dir+0/2/6)*2+k] - DEFINE_ARGS dst, stride, tap, dir, pri, sec + DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 mov dird, r6m lea tapq, [tapq+dirq*2+16] - DEFINE_ARGS dst, stride, dir, h, pri, sec, stk, off, k - mov hd, 4 +%if %1*%2*2/mmsize > 1 + DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + mov hd, %1*%2*2/mmsize +%else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k +%endif lea stkq, [px] pxor m13, m13 +%if %1*%2*2/mmsize > 1 .v_loop: +%endif mov kd, 1 - mova xm4, [stkq+32*0] ; px - vinserti128 m4, [stkq+32*1], 1 +%if %1 == 4 + movq xm4, [stkq+%3*0] + movhps xm4, [stkq+%3*1] + movq xm5, [stkq+%3*2] + 
movhps xm5, [stkq+%3*3] + vinserti128 m4, xm5, 1 +%else + mova xm4, [stkq+%3*0] ; px + vinserti128 m4, [stkq+%3*1], 1 +%endif pxor m15, m15 ; sum mova m7, m4 ; max mova m8, m4 ; min @@ -288,47 +409,9 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ vpbroadcastw m2, [priq+kq*2] ; pri_taps vpbroadcastw m3, [secq+kq*2] ; sec_taps -%macro ACCUMULATE_TAP 4 ; tap_offset, shift, strength, mul_tap - ; load p0/p1 - movsx offq, byte [dirq+kq+%1] ; off1 - movu xm5, [stkq+offq*2+32*0] ; p0 - vinserti128 m5, [stkq+offq*2+32*1], 1 - neg offq ; -off1 - movu xm6, [stkq+offq*2+32*0] ; p1 - vinserti128 m6, [stkq+offq*2+32*1], 1 - pcmpeqw m9, m14, m5 - pcmpeqw m10, m14, m6 - pandn m9, m5 - pandn m10, m6 - pmaxsw m7, m9 ; max after p0 - pminsw m8, m5 ; min after p0 - pmaxsw m7, m10 ; max after p1 - pminsw m8, m6 ; min after p1 - - ; accumulate sum[m15] over p0/p1 - psubw m5, m4 ; diff_p0(p0 - px) - psubw m6, m4 ; diff_p1(p1 - px) - pabsw m9, m5 - pabsw m10, m6 - psraw m11, m9, %2 - psraw m12, m10, %2 - psubw m11, %3, m11 - psubw m12, %3, m12 - pmaxsw m11, m13 - pmaxsw m12, m13 - pminsw m11, m9 - pminsw m12, m10 - psignw m11, m5 ; constrain(diff_p0) - psignw m12, m6 ; constrain(diff_p1) - pmullw m11, %4 ; constrain(diff_p0) * pri_taps - pmullw m12, %4 ; constrain(diff_p1) * pri_taps - paddw m15, m11 - paddw m15, m12 -%endmacro - - ACCUMULATE_TAP 0*2, [rsp+0], m0, m2 - ACCUMULATE_TAP 2*2, [rsp+8], m1, m3 - ACCUMULATE_TAP 6*2, [rsp+8], m1, m3 + ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 + ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 + ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 dec kq jge .k_loop @@ -342,14 +425,28 @@ cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \ pmaxsw m4, m8 packuswb m4, m4 vextracti128 xm5, m4, 1 +%if %1 == 4 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+stride3q], xm5, 1 +%else movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 +%endif + +%if %1*%2*2/mmsize > 1 lea dstq, [dstq+strideq*2] - add stkq, 32*2 + add stkq, %3*2 dec hd jg .v_loop +%endif RET +%endmacro + +cdef_filter_fn 8, 8, 32 +cdef_filter_fn 4, 4, 32 INIT_YMM avx2 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 From c803b6778a6be82e6240d56acac454aea029de6e Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 8 Nov 2018 13:18:19 -0500 Subject: [PATCH 048/155] cdef: simplify sec_taps Also reduce scope of tables to inside the function where they are used. 
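For context, the parity indexing can be dropped because the secondary tap weights are the same for both strength parities, while the primary taps still depend on (pri_strength & 1). A rough scalar sketch of those constants and of the final rounding done with the pw_2048/pmulhrsw sequence (a sketch only; the constants follow the CDEF tap tables, the helper name is illustrative):

```
// Tap weights encoded in tap_table: pri taps are selected by the low bit of
// the primary strength, sec taps are fixed, hence the single [2, 1] entry.
const PRI_TAPS: [[i32; 2]; 2] = [[4, 2], [3, 3]];
const SEC_TAPS: [i32; 2] = [2, 1];

// Final per-pixel step: round the accumulated sum (with the extra -1 for
// negative sums, matching the pcmpgtw/paddw pair), add it to the centre
// pixel and clamp to the min/max of the pixels that contributed.
fn apply_sum(px: i32, sum: i32, min: i32, max: i32) -> i32 {
    (px + ((8 + sum - (sum < 0) as i32) >> 4)).clamp(min, max)
}
```
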
--- src/x86/cdef.asm | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 758cae27e5..6e60cadc31 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -37,7 +37,7 @@ shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 -tap_table: dw 4, 2, 3, 3, 2, 1, 2, 1 +tap_table: dw 4, 2, 3, 3, 2, 1 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 @@ -371,15 +371,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ vpbroadcastw m0, xm0 ; pri_strength vpbroadcastw m1, xm1 ; sec_strength and prid, 1 - and secd, 1 lea tapq, [tap_table] lea priq, [tapq+priq*4] ; pri_taps - lea secq, [tapq+secq*4+8] ; sec_taps + lea secq, [tapq+8] ; sec_taps - ; off1/2/3[k] [6 total] from [tapq+16+(dir+0/2/6)*2+k] + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 mov dird, r6m - lea tapq, [tapq+dirq*2+16] + lea tapq, [tapq+dirq*2+12] %if %1*%2*2/mmsize > 1 DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k mov hd, %1*%2*2/mmsize From 360b2f776eed05c9d0cdc1ed84b4553415c415e8 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Wed, 13 Feb 2019 17:36:19 +0100 Subject: [PATCH 049/155] x86: improve AVX2 cdef_filter macro consistency - consistently use %3 instead of hardcoded value for tmp stride - also correct a comment --- src/x86/cdef.asm | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 6e60cadc31..b278666fcf 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -104,8 +104,8 @@ SECTION .text pminsw m12, m10 psignw m11, m5 ; constrain(diff_p0) psignw m12, m6 ; constrain(diff_p1) - pmullw m11, %4 ; constrain(diff_p0) * pri_taps - pmullw m12, %4 ; constrain(diff_p1) * pri_taps + pmullw m11, %4 ; constrain(diff_p0) * taps + pmullw m12, %4 ; constrain(diff_p1) * taps paddw m15, m11 paddw m15, m12 %endmacro @@ -258,15 +258,15 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ %if %2 == 8 pmovzxbw xm2, [leftq+ 8] %endif - movd [px+0*32-4], xm1 - pextrd [px+1*32-4], xm1, 1 - pextrd [px+2*32-4], xm1, 2 - pextrd [px+3*32-4], xm1, 3 + movd [px+0*%3-4], xm1 + pextrd [px+1*%3-4], xm1, 1 + pextrd [px+2*%3-4], xm1, 2 + pextrd [px+3*%3-4], xm1, 3 %if %2 == 8 - movd [px+4*32-4], xm2 - pextrd [px+5*32-4], xm2, 1 - pextrd [px+6*32-4], xm2, 2 - pextrd [px+7*32-4], xm2, 3 + movd [px+4*%3-4], xm2 + pextrd [px+5*%3-4], xm2, 1 + pextrd [px+6*%3-4], xm2, 2 + pextrd [px+7*%3-4], xm2, 3 %endif jmp .left_done .no_left: From 608350cb0c7d2e7329a1e0d25d7513bdb856e61d Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Mon, 11 Feb 2019 16:02:56 +0100 Subject: [PATCH 050/155] x86: remove redundant code in cdef filter AVX2 --- src/x86/cdef.asm | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index b278666fcf..0340d4219f 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -176,10 +176,6 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ movd [px+3*%3+%1*2], xm14 %if %2 == 8 ; FIXME w == 4 - movd [px+0*%3+%1*2], xm14 - movd [px+1*%3+%1*2], xm14 - movd [px+2*%3+%1*2], xm14 - movd [px+3*%3+%1*2], xm14 pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] From 0b0ba72ae572ccd190fe878aafb37ba558e25b65 Mon Sep 17 00:00:00 2001 From: 
Victorien Le Couviour--Tuffet Date: Sun, 24 Feb 2019 15:14:15 +0100 Subject: [PATCH 051/155] x86: add AVX2 cdef_filter_4x8 used for YUV 422 chroma blocks cdef_filter_4x8_8bpc_c: 2711.6 cdef_filter_4x8_8bpc_avx2: 189.1 --- src/x86/cdef.asm | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 0340d4219f..5c830b1e6c 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -112,8 +112,13 @@ SECTION .text %macro cdef_filter_fn 3 ; w, h, stride INIT_YMM avx2 +%if %1 != 4 || %2 != 8 cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ dst, stride, left, top, pri, sec, stride3, dst4, edge +%else +cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ + dst, stride, left, top, pri, sec, stride3, dst4, edge +%endif %define px rsp+2*16+2*%3 pcmpeqw m14, m14 psrlw m14, 1 ; 0x7fff @@ -175,7 +180,20 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ movd [px+2*%3+%1*2], xm14 movd [px+3*%3+%1*2], xm14 %if %2 == 8 - ; FIXME w == 4 + %if %1 == 4 + movd xm1, [dst4q+strideq*0] + movd xm2, [dst4q+strideq*1] + movd xm3, [dst4q+strideq*2] + movd xm4, [dst4q+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+4*%3], xm1 + movq [px+5*%3], xm2 + movq [px+6*%3], xm3 + movq [px+7*%3], xm4 + %else pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] @@ -184,6 +202,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ mova [px+5*%3], xm2 mova [px+6*%3], xm3 mova [px+7*%3], xm4 + %endif movd [px+4*%3+%1*2], xm14 movd [px+5*%3+%1*2], xm14 movd [px+6*%3+%1*2], xm14 @@ -376,7 +395,11 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ mov dird, r6m lea tapq, [tapq+dirq*2+12] %if %1*%2*2/mmsize > 1 + %if %1 == 4 + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k + %else DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + %endif mov hd, %1*%2*2/mmsize %else DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k @@ -431,8 +454,9 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ %endif %if %1*%2*2/mmsize > 1 - lea dstq, [dstq+strideq*2] - add stkq, %3*2 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, %3*vloop_lines dec hd jg .v_loop %endif @@ -441,6 +465,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ %endmacro cdef_filter_fn 8, 8, 32 +cdef_filter_fn 4, 8, 32 cdef_filter_fn 4, 4, 32 INIT_YMM avx2 From e2cc4046b3bf07eae3ea1ba02c9314892b314e6b Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Sun, 24 Feb 2019 15:10:02 +0100 Subject: [PATCH 052/155] x86: optimize 4 by X cdef filters for HAVE_RIGHT=0 --- src/x86/cdef.asm | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 5c830b1e6c..96e6d10544 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -156,15 +156,17 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ .no_right: %if %1 == 4 movd xm1, [dstq+strideq*0] - movd xm2, [dstq+strideq*2] - pinsrd xm1, [dstq+strideq*1], 1 - pinsrd xm2, [dstq+stride3q], 1 + movd xm2, [dstq+strideq*1] + movd xm3, [dstq+strideq*2] + movd xm4, [dstq+stride3q] pmovzxbw xm1, xm1 pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 movq [px+0*%3], xm1 - movhps [px+1*%3], xm1 - movq [px+2*%3], xm2 - movhps [px+3*%3], xm2 + movq [px+1*%3], xm2 + movq [px+2*%3], xm3 + movq [px+3*%3], xm4 %else pmovzxbw xm1, [dstq+strideq*0] pmovzxbw xm2, 
[dstq+strideq*1] From 6adb1035ce4384c437122cc61ee2eb28868cede2 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Mon, 25 Feb 2019 17:42:51 +0100 Subject: [PATCH 053/155] x86: optimize AVX2 cdef filters before: cdef_filter_4x4_8bpc_avx2: 110.4 after: cdef_filter_4x4_8bpc_avx2: 106.0 before: cdef_filter_4x8_8bpc_avx2: 188.3 after: cdef_filter_4x8_8bpc_avx2: 182.2 before: cdef_filter_8x8_8bpc_avx2: 276.7 after: cdef_filter_8x8_8bpc_avx2: 252.5 Credit to Gramner. --- src/x86/cdef.asm | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 96e6d10544..bfca1d0c90 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -94,20 +94,18 @@ SECTION .text psubw m6, m4 ; diff_p1(p1 - px) pabsw m9, m5 pabsw m10, m6 - psraw m11, m9, %2 - psraw m12, m10, %2 - psubw m11, %3, m11 - psubw m12, %3, m12 - pmaxsw m11, m13 - pmaxsw m12, m13 - pminsw m11, m9 - pminsw m12, m10 - psignw m11, m5 ; constrain(diff_p0) - psignw m12, m6 ; constrain(diff_p1) - pmullw m11, %4 ; constrain(diff_p0) * taps - pmullw m12, %4 ; constrain(diff_p1) * taps - paddw m15, m11 - paddw m15, m12 + psignw m11, %4, m5 + psignw m12, %4, m6 + psrlw m5, m9, %2 + psrlw m6, m10, %2 + psubusw m5, %3, m5 + psubusw m6, %3, m6 + pminsw m5, m9 ; constrain(diff_p0) + pminsw m6, m10 ; constrain(diff_p1) + pmullw m5, m11 ; constrain(diff_p0) * taps + pmullw m6, m12 ; constrain(diff_p1) * taps + paddw m15, m5 + paddw m15, m6 %endmacro %macro cdef_filter_fn 3 ; w, h, stride From 4c21ff9e6cdac75d57662400642235c028948dac Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Sun, 3 Mar 2019 15:47:01 +0100 Subject: [PATCH 054/155] Speed up finding the best cost in avx2 cdef --- src/x86/cdef.asm | 80 +++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 48 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index bfca1d0c90..35283d704e 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -29,10 +29,10 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3 +pd_02564713: dd 0, 2, 5, 6, 4, 7, 1, 3 +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 -pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_128: times 2 dw 128 @@ -640,54 +640,38 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 pmulld m5, m13 paddd m5, m6 ; cost1[a-d] | cost3[a-d] - mova xm0, [pd_04512763+ 0] - mova xm1, [pd_04512763+ 16] + mova xm0, [pd_47130256+ 16] + mova m1, [pd_47130256] phaddd m9, m8 phaddd m5, m4 phaddd m9, m5 - vpermd m0, m9 ; cost[0/4/2/6] - vpermd m1, m9 ; cost[1/5/3/7] - - ; now find the best cost, its idx^4 complement, and its idx - pcmpgtd xm2, xm1, xm0 ; [1/5/3/7] > [0/4/2/6] - pand xm3, xm2, xm1 - pandn xm4, xm2, xm0 - por xm3, xm4 ; higher 4 values - pshufd xm1, xm1, q2301 - pshufd xm0, xm0, q2301 - pand xm1, xm2, xm1 - pandn xm0, xm2, xm0 - por xm0, xm1 ; complementary 4 values at idx^4 offset - pand xm13, xm2, [pd_04261537+16] - pandn xm14, xm2, [pd_04261537+ 0] - por xm14, xm13 ; indices - - punpckhqdq xm4, xm3, xm0 - punpcklqdq xm3, xm0 - pcmpgtd xm5, xm4, xm3 ; [2or3-6or7] > [0or1/4or5] - punpcklqdq xm5, xm5 - pand xm6, xm5, xm4 - pandn xm7, xm5, xm3 - por xm6, xm7 ; { highest 2 values, complements at idx^4 } - movhlps xm13, xm14 - pand xm13, xm5, xm13 - pandn xm14, xm5, xm14 - por xm14, xm13 - - pshufd xm7, 
xm6, q3311 - pcmpgtd xm8, xm7, xm6 ; [4or5or6or7] > [0or1or2or3] - punpcklqdq xm8, xm8 - pand xm9, xm8, xm7 - pandn xm10, xm8, xm6 - por xm9, xm10 ; max - movhlps xm10, xm9 ; complement at idx^4 - psubd xm9, xm10 - psrld xm9, 10 - movd [varq], xm9 - pshufd xm13, xm14, q1111 - pand xm13, xm8, xm13 - pandn xm14, xm8, xm14 - por xm14, xm13 - movd eax, xm14 + vpermd m0, m9 ; cost[0-3] + vpermd m1, m9 ; cost[4-7] | cost[0-3] + + ; now find the best cost + pmaxsd xm2, xm0, xm1 + pshufd xm3, xm2, q3232 + pmaxsd xm2, xm3 + pshufd xm3, xm2, q1111 + pmaxsd xm2, xm3 + pshufd xm2, xm2, q0000 ; best cost + + ; find the idx using minpos + ; make everything other than the best cost negative via subtraction + ; find the min of unsigned 16-bit ints to sort out the negative values + psubd xm4, xm1, xm2 + psubd xm3, xm0, xm2 + packssdw xm3, xm4 + phminposuw xm3, xm3 + + ; convert idx to 32-bits + psrldq xm3, 2 + movd eax, xm3 + + ; get idx^4 complement + vpermd m3, m1 + psubd xm2, xm3 + psrld xm2, 10 + movd [varq], xm2 RET %endif ; ARCH_X86_64 From 785835742706d93d4b3526a9c42b5066b40a038e Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Sun, 3 Mar 2019 10:37:28 -0500 Subject: [PATCH 055/155] Remove unused data from x86/cdef.asm --- src/x86/cdef.asm | 1 - 1 file changed, 1 deletion(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 35283d704e..3c6fdf6f11 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -29,7 +29,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_02564713: dd 0, 2, 5, 6, 4, 7, 1, 3 pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 From c0d08880a6a0b9bbd83ed36e589a4220fb1cab4b Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Tue, 5 Mar 2019 15:42:54 +0100 Subject: [PATCH 056/155] Utilize a better CDEF constant for avx2 Before: ``` cdef_filter_8x8_8bpc_avx2: 275.5 cdef_filter_4x8_8bpc_avx2: 193.3 cdef_filter_4x4_8bpc_avx2: 113.5 ``` After: ``` cdef_filter_8x8_8bpc_avx2: 252.3 cdef_filter_4x8_8bpc_avx2: 182.1 cdef_filter_4x4_8bpc_avx2: 105.7 ``` --- src/x86/cdef.asm | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 3c6fdf6f11..1b3c30cd6f 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -79,14 +79,13 @@ SECTION .text movu xm6, [stkq+offq*2+%6*0] ; p1 vinserti128 m6, [stkq+offq*2+%6*1], 1 %endif - pcmpeqw m9, m14, m5 - pcmpeqw m10, m14, m6 - pandn m9, m5 - pandn m10, m6 - pmaxsw m7, m9 ; max after p0 - pminsw m8, m5 ; min after p0 - pmaxsw m7, m10 ; max after p1 - pminsw m8, m6 ; min after p1 + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 ; max after p0 + pminuw m8, m5 ; min after p0 + pmaxsw m7, m6 ; max after p1 + pminuw m8, m6 ; min after p1 ; accumulate sum[m15] over p0/p1 psubw m5, m4 ; diff_p0(p0 - px) @@ -99,8 +98,10 @@ SECTION .text psrlw m6, m10, %2 psubusw m5, %3, m5 psubusw m6, %3, m6 - pminsw m5, m9 ; constrain(diff_p0) - pminsw m6, m10 ; constrain(diff_p1) + + ; use unsigned min since abs diff can equal 0x8000 + pminuw m5, m9 ; constrain(diff_p0) + pminuw m6, m10 ; constrain(diff_p1) pmullw m5, m11 ; constrain(diff_p0) * taps pmullw m6, m12 ; constrain(diff_p1) * taps paddw m15, m5 @@ -118,7 +119,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ %endif %define px rsp+2*16+2*%3 pcmpeqw m14, m14 - psrlw m14, 1 ; 0x7fff + psllw m14, 15 ; 0x8000 mov edged, r8m ; prepare pixel buffers - body/right From 1ba4a78dc0abd3f7927c061939bb40200858bb46 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Wed, 6 Mar 2019 20:03:51 +0100 Subject: [PATCH 057/155] Use some 8 bit arithmetic in AVX2 CDEF filter Before: cdef_filter_8x8_8bpc_avx2: 252.3 cdef_filter_4x8_8bpc_avx2: 182.1 cdef_filter_4x4_8bpc_avx2: 105.7 After: cdef_filter_8x8_8bpc_avx2: 235.5 cdef_filter_4x8_8bpc_avx2: 174.8 cdef_filter_4x4_8bpc_avx2: 101.8 --- src/x86/cdef.asm | 111 ++++++++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 49 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 1b3c30cd6f..2066adf0d6 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -34,9 +34,13 @@ div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 -tap_table: dw 4, 2, 3, 3, 2, 1 +tap_table: ; masks for 8 bit shifts + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights + db 4, 2, 3, 3, 2, 1 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 @@ -55,29 +59,29 @@ tap_table: dw 4, 2, 3, 3, 2, 1 SECTION .text -%macro ACCUMULATE_TAP 6 ; tap_offset, shift, strength, mul_tap, w, stride +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride ; load p0/p1 movsx offq, byte [dirq+kq+%1] ; off1 -%if %5 == 4 - movq xm5, [stkq+offq*2+%6*0] ; p0 - movq xm6, [stkq+offq*2+%6*2] - movhps xm5, [stkq+offq*2+%6*1] - movhps xm6, [stkq+offq*2+%6*3] +%if %6 == 4 + movq xm5, [stkq+offq*2+%7*0] ; p0 + movq xm6, [stkq+offq*2+%7*2] + movhps xm5, [stkq+offq*2+%7*1] + movhps xm6, [stkq+offq*2+%7*3] vinserti128 m5, xm6, 1 %else - movu xm5, [stkq+offq*2+%6*0] ; p0 - vinserti128 m5, [stkq+offq*2+%6*1], 1 + movu xm5, [stkq+offq*2+%7*0] ; p0 + vinserti128 m5, [stkq+offq*2+%7*1], 1 %endif neg offq ; -off1 -%if %5 == 4 - movq xm6, [stkq+offq*2+%6*0] ; p1 - movq xm9, [stkq+offq*2+%6*2] - movhps xm6, [stkq+offq*2+%6*1] - movhps xm9, [stkq+offq*2+%6*3] +%if %6 == 4 + movq xm6, [stkq+offq*2+%7*0] ; p1 + movq xm9, [stkq+offq*2+%7*2] + movhps xm6, [stkq+offq*2+%7*1] + movhps xm9, [stkq+offq*2+%7*3] vinserti128 m6, xm9, 1 %else - movu xm6, [stkq+offq*2+%6*0] ; p1 - vinserti128 m6, [stkq+offq*2+%6*1], 1 + movu xm6, [stkq+offq*2+%7*0] ; p1 + vinserti128 m6, [stkq+offq*2+%7*1], 1 %endif ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. 
@@ -88,24 +92,26 @@ SECTION .text pminuw m8, m6 ; min after p1 ; accumulate sum[m15] over p0/p1 + ; calculate difference before converting psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) - pabsw m9, m5 - pabsw m10, m6 - psignw m11, %4, m5 - psignw m12, %4, m6 - psrlw m5, m9, %2 - psrlw m6, m10, %2 - psubusw m5, %3, m5 - psubusw m6, %3, m6 - - ; use unsigned min since abs diff can equal 0x8000 - pminuw m5, m9 ; constrain(diff_p0) - pminuw m6, m10 ; constrain(diff_p1) - pmullw m5, m11 ; constrain(diff_p0) * taps - pmullw m6, m12 ; constrain(diff_p1) * taps + + ; convert to 8-bits with signed saturation + ; saturating to large diffs has no impact on the results + packsswb m5, m6 + + ; group into pairs so we can accumulate using maddubsw + pshufb m5, m12 + pabsb m9, m5 + psignb m10, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + + ; use unsigned min since abs diff can equal 0x80 + pminub m5, m9 + pmaddubsw m5, m10 paddw m15, m5 - paddw m15, m6 %endmacro %macro cdef_filter_fn 3 ; w, h, stride @@ -359,6 +365,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ INIT_YMM avx2 DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp %undef edged + ; register to shuffle values into after packing + vbroadcasti128 m12, [shufb_lohi] + movifnidn prid, prim movifnidn secd, secm mov dampingd, r7m @@ -379,21 +388,25 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, tap, dummy, pri, sec, stride3 + DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3 movd xm0, prid movd xm1, secd - vpbroadcastw m0, xm0 ; pri_strength - vpbroadcastw m1, xm1 ; sec_strength + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength and prid, 1 - lea tapq, [tap_table] - lea priq, [tapq+priq*4] ; pri_taps - lea secq, [tapq+8] ; sec_taps + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - DEFINE_ARGS dst, stride, tap, dir, pri, sec, stride3 + DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3 mov dird, r6m - lea tapq, [tapq+dirq*2+12] + lea dirq, [tapq+dirq*2+14] %if %1*%2*2/mmsize > 1 %if %1 == 4 DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k @@ -405,7 +418,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k %endif lea stkq, [px] - pxor m13, m13 + pxor m11, m11 %if %1*%2*2/mmsize > 1 .v_loop: %endif @@ -424,20 +437,20 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ mova m7, m4 ; max mova m8, m4 ; min .k_loop: - vpbroadcastw m2, [priq+kq*2] ; pri_taps - vpbroadcastw m3, [secq+kq*2] ; sec_taps + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps - ACCUMULATE_TAP 0*2, [rsp+0], m0, m2, %1, %3 - ACCUMULATE_TAP 2*2, [rsp+8], m1, m3, %1, %3 - ACCUMULATE_TAP 6*2, [rsp+8], m1, m3, %1, %3 + ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3 + ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3 + ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3 dec kq jge .k_loop - vpbroadcastd m12, [pw_2048] - pcmpgtw m11, m13, m15 - paddw m15, m11 - pmulhrsw m15, m12 + vpbroadcastd m10, [pw_2048] + pcmpgtw m9, m11, m15 + 
paddw m15, m9 + pmulhrsw m15, m10 paddw m4, m15 pminsw m4, m7 pmaxsw m4, m8 From 97afda9e9db40b1faa0d5b17579b4cfb62d7a3cc Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Fri, 8 Mar 2019 20:48:58 +0100 Subject: [PATCH 058/155] x86: optimize AVX2 cdef_dir This optimization is so tiny we can't even see it in checkasm. The only actual difference being the removal of a memory load, it has to be better. --- src/x86/cdef.asm | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 2066adf0d6..43f6196c63 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -33,7 +33,6 @@ pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 -shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 @@ -600,9 +599,8 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ; and [upper half]: ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx - ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd - vbroadcasti128 m14, [shufw_210xxxxx] pslldq m4, m11, 2 psrldq m11, 14 pslldq m5, m12, 4 @@ -616,7 +614,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 paddw m11, m13 ; partial_sum_alt[3/2] right vbroadcasti128 m13, [div_table+32] paddw m4, m5 ; partial_sum_alt[3/2] left - pshufb m11, m14 + pshuflw m11, m11, q3012 punpckhwd m6, m4, m11 punpcklwd m4, m11 pmaddwd m6, m6 @@ -631,7 +629,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ; and [upper half]: ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx - ; and then shuffle m11 [shufw_210xxxxx], unpcklwd, pmaddwd, pmulld, paddd + ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m5, m1, 2 psrldq m1, 14 @@ -644,7 +642,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 paddw m6, m7 paddw m1, m3 ; partial_sum_alt[0/1] right paddw m5, m6 ; partial_sum_alt[0/1] left - pshufb m1, m14 + pshuflw m1, m1, q3012 punpckhwd m6, m5, m1 punpcklwd m5, m1 pmaddwd m6, m6 From 66acb1b946ea835c18f3ef8728267bf7f1df1cba Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 20 Mar 2019 19:16:13 +0100 Subject: [PATCH 059/155] x86: Add minor CDEF AVX2 optimizations --- src/x86/cdef.asm | 90 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 43f6196c63..c8ca614001 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -135,7 +135,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .no_right pmovzxbw m1, [dstq+strideq*0] pmovzxbw m2, [dstq+strideq*1] @@ -217,13 +217,13 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ; top DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge - test edged, 4 ; have_top + test edgeb, 4 ; have_top jz .no_top mov top1q, [top2q+0*gprsize] mov top2q, [top2q+1*gprsize] - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .top_no_left - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .top_no_right pmovzxbw m1, [top1q-(%1/2)] 
pmovzxbw m2, [top2q-(%1/2)] @@ -239,7 +239,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ movd [px-1*%3+%1*2], xm14 jmp .top_done .top_no_left: - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .top_no_left_right pmovzxbw m1, [top1q] pmovzxbw m2, [top2q] @@ -272,7 +272,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ .top_done: ; left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left pmovzxbw xm1, [leftq+ 0] %if %2 == 8 @@ -304,12 +304,12 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ; bottom DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge - test edged, 8 ; have_bottom + test edgeb, 8 ; have_bottom jz .no_bottom lea dst8q, [dstq+%2*strideq] - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .bottom_no_left - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .bottom_no_right pmovzxbw m1, [dst8q-(%1/2)] pmovzxbw m2, [dst8q+strideq-(%1/2)] @@ -328,7 +328,7 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ movd [px+(%2+1)*%3+%1*2], xm14 jmp .bottom_done .bottom_no_left: - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .bottom_no_left_right pmovzxbw m1, [dst8q] pmovzxbw m2, [dst8q+strideq] @@ -362,50 +362,49 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ ; actual filter INIT_YMM avx2 - DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, stride3, secdmp + DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero %undef edged ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] movifnidn prid, prim - movifnidn secd, secm mov dampingd, r7m - - mov pridmpd, prid - mov secdmpd, secd - or pridmpd, 1 - or secdmpd, 1 - lzcnt pridmpd, pridmpd - lzcnt secdmpd, secdmpd - lea pridmpd, [pridmpd+dampingd-31] - lea secdmpd, [secdmpd+dampingd-31] - xor dampingd, dampingd - test pridmpd, pridmpd - cmovl pridmpd, dampingd - test secdmpd, secdmpd - cmovl secdmpd, dampingd + lzcnt pridmpd, prid +%if UNIX64 + movd xm0, prid + movd xm1, secdmpd +%endif + lzcnt secdmpd, secdmpm + sub dampingd, 31 + xor zerod, zerod + add pridmpd, dampingd + cmovl pridmpd, zerod + add secdmpd, dampingd + cmovl secdmpd, zerod mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift - DEFINE_ARGS dst, stride, pridmp, table, pri, sec, stride3, secdmp + DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] - DEFINE_ARGS dst, stride, dummy, table, pri, sec, stride3 - movd xm0, prid - movd xm1, secd + DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3 +%if UNIX64 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength +%else + vpbroadcastb m0, prim + vpbroadcastb m1, secm +%endif and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - DEFINE_ARGS dst, stride, dir, tap, pri, sec, stride3 mov dird, r6m - lea dirq, [tapq+dirq*2+14] + lea dirq, [tableq+dirq*2+14] %if %1*%2*2/mmsize > 1 %if %1 == 4 DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k @@ -614,9 +613,9 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 paddw m11, m13 ; partial_sum_alt[3/2] right vbroadcasti128 m13, [div_table+32] paddw m4, m5 ; partial_sum_alt[3/2] left - pshuflw m11, m11, q3012 - punpckhwd m6, m4, m11 - punpcklwd m4, m11 + pshuflw m5, m11, q3012 + punpckhwd m6, m11, m4 + 
punpcklwd m4, m5 pmaddwd m6, m6 pmaddwd m4, m4 pmulld m6, m12 @@ -642,14 +641,14 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 paddw m6, m7 paddw m1, m3 ; partial_sum_alt[0/1] right paddw m5, m6 ; partial_sum_alt[0/1] left - pshuflw m1, m1, q3012 - punpckhwd m6, m5, m1 - punpcklwd m5, m1 - pmaddwd m6, m6 + pshuflw m0, m1, q3012 + punpckhwd m1, m5 + punpcklwd m5, m0 + pmaddwd m1, m1 pmaddwd m5, m5 - pmulld m6, m12 + pmulld m1, m12 pmulld m5, m13 - paddd m5, m6 ; cost1[a-d] | cost3[a-d] + paddd m5, m1 ; cost1[a-d] | cost3[a-d] mova xm0, [pd_47130256+ 16] mova m1, [pd_47130256] @@ -661,11 +660,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 ; now find the best cost pmaxsd xm2, xm0, xm1 - pshufd xm3, xm2, q3232 - pmaxsd xm2, xm3 - pshufd xm3, xm2, q1111 + pshufd xm3, xm2, q1032 pmaxsd xm2, xm3 - pshufd xm2, xm2, q0000 ; best cost + pshufd xm3, xm2, q2301 + pmaxsd xm2, xm3 ; best cost ; find the idx using minpos ; make everything other than the best cost negative via subtraction @@ -676,7 +674,7 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 phminposuw xm3, xm3 ; convert idx to 32-bits - psrldq xm3, 2 + psrld xm3, 16 movd eax, xm3 ; get idx^4 complement From f9e330508ad3ae4acff847a03de10f2ca90f8927 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 26 Mar 2019 11:23:44 +0100 Subject: [PATCH 060/155] x86: cdef_filter: fix macro case (lower to upper) --- src/x86/cdef.asm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index c8ca614001..fe9821ce78 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -113,7 +113,7 @@ SECTION .text paddw m15, m5 %endmacro -%macro cdef_filter_fn 3 ; w, h, stride +%macro CDEF_FILTER 3 ; w, h, stride INIT_YMM avx2 %if %1 != 4 || %2 != 8 cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ @@ -475,9 +475,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ RET %endmacro -cdef_filter_fn 8, 8, 32 -cdef_filter_fn 4, 8, 32 -cdef_filter_fn 4, 4, 32 +CDEF_FILTER 8, 8, 32 +CDEF_FILTER 4, 8, 32 +CDEF_FILTER 4, 4, 32 INIT_YMM avx2 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 From 814984c54bcf10fb5e3b2f143b357601538e2e23 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 29 Jan 2020 14:17:11 +0100 Subject: [PATCH 061/155] Rework the CDEF top edge handling Avoids some pointer chasing and simplifies the DSP code, at the cost of making the initialization a little bit more complicated. Also reduces memory usage by a small amount due to properly sizing the buffers instead of always allocating enough space for 4:4:4. 
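The shape of this change on the caller side can be sketched in Rust as follows (hypothetical helpers, not the real rav1e interface): instead of dereferencing an array of two row pointers, the asm now indexes a single pointer with the picture stride, which is the pointer chasing the message refers to.

```
// Before: `top` was a pointer to two separate row pointers, so the asm had
// to load top1/top2 through an extra indirection first.
unsafe fn load_top_old(top: *const *const u8) -> (*const u8, *const u8) {
    (*top.offset(0), *top.offset(1))
}

// After: `top` points directly at the first of two rows spaced by `stride`,
// matching the [topq+strideq*0] / [topq+strideq*1] addressing in the diff.
unsafe fn load_top_new(top: *const u8, stride: isize) -> (*const u8, *const u8) {
    (top, top.offset(stride))
}
```
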
--- src/x86/cdef.asm | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index fe9821ce78..4fe6b7420f 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -216,23 +216,20 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ .body_done: ; top - DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge test edgeb, 4 ; have_top jz .no_top - mov top1q, [top2q+0*gprsize] - mov top2q, [top2q+1*gprsize] test edgeb, 1 ; have_left jz .top_no_left test edgeb, 2 ; have_right jz .top_no_right - pmovzxbw m1, [top1q-(%1/2)] - pmovzxbw m2, [top2q-(%1/2)] + pmovzxbw m1, [topq+strideq*0-(%1/2)] + pmovzxbw m2, [topq+strideq*1-(%1/2)] movu [px-2*%3-%1], m1 movu [px-1*%3-%1], m2 jmp .top_done .top_no_right: - pmovzxbw m1, [top1q-%1] - pmovzxbw m2, [top2q-%1] + pmovzxbw m1, [topq+strideq*0-%1] + pmovzxbw m2, [topq+strideq*1-%1] movu [px-2*%3-%1*2], m1 movu [px-1*%3-%1*2], m2 movd [px-2*%3+%1*2], xm14 @@ -241,8 +238,8 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right - pmovzxbw m1, [top1q] - pmovzxbw m2, [top2q] + pmovzxbw m1, [topq+strideq*0] + pmovzxbw m2, [topq+strideq*1] mova [px-2*%3+0], m1 mova [px-1*%3+0], m2 movd [px-2*%3-4], xm14 @@ -250,14 +247,14 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ jmp .top_done .top_no_left_right: %if %1 == 4 - movd xm1, [top1q] - pinsrd xm1, [top2q], 1 + movd xm1, [topq+strideq*0] + pinsrd xm1, [topq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px-2*%3+0], xm1 movhps [px-1*%3+0], xm1 %else - pmovzxbw xm1, [top1q] - pmovzxbw xm2, [top2q] + pmovzxbw xm1, [topq+strideq*0] + pmovzxbw xm2, [topq+strideq*1] mova [px-2*%3+0], xm1 mova [px-1*%3+0], xm2 %endif From 34d295536b263adbe75c9fefb9d62835acadf8e4 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 3 Feb 2020 23:56:12 +0100 Subject: [PATCH 062/155] x86: Avoid cmov instructions that depends on multiple flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On many AMD CPU:s cmov instructions that depends on multiple flags require an additional µop, so prefer using cmov variants that only depends on a single flag where possible. 
--- src/x86/cdef.asm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 4fe6b7420f..0cdaabfabf 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -375,9 +375,9 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ sub dampingd, 31 xor zerod, zerod add pridmpd, dampingd - cmovl pridmpd, zerod + cmovs pridmpd, zerod add secdmpd, dampingd - cmovl secdmpd, zerod + cmovs secdmpd, zerod mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift From 6c9d4104acb2408225ff38a5ba6a514566533ef6 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 28 Jan 2020 17:42:06 +0100 Subject: [PATCH 063/155] x86: Add cdef_filter_4x4 AVX-512 (Ice Lake) asm --- src/x86/cdef.asm | 258 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 231 insertions(+), 27 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 0cdaabfabf..a0e3655a98 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -28,33 +28,71 @@ %if ARCH_X86_64 -SECTION_RODATA 32 -pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 -div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 - dd 420, 210, 140, 105 -shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 -shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 -pw_128: times 2 dw 128 -pw_2048: times 2 dw 2048 -tap_table: ; masks for 8 bit shifts - db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 - ; weights - db 4, 2, 3, 3, 2, 1 - db -1 * 16 + 1, -2 * 16 + 2 - db 0 * 16 + 1, -1 * 16 + 2 - db 0 * 16 + 1, 0 * 16 + 2 - db 0 * 16 + 1, 1 * 16 + 2 - db 1 * 16 + 1, 2 * 16 + 2 - db 1 * 16 + 0, 2 * 16 + 1 - db 1 * 16 + 0, 2 * 16 + 0 - db 1 * 16 + 0, 2 * 16 - 1 - ; the last 6 are repeats of the first 6 so we don't need to & 7 - db -1 * 16 + 1, -2 * 16 + 2 - db 0 * 16 + 1, -1 * 16 + 2 - db 0 * 16 + 1, 0 * 16 + 2 - db 0 * 16 + 1, 1 * 16 + 2 - db 1 * 16 + 1, 2 * 16 + 2 - db 1 * 16 + 0, 2 * 16 + 1 +%macro DUP4 1-* + %rep %0 + times 4 db %1 + %rotate 1 + %endrep +%endmacro + +%macro DIRS 16 ; cdef_directions[] + %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 + ; masking away unused bits allows us to use a single vpaddd {1to16} + ; instruction instead of having to do vpbroadcastd + paddb + db %13 & 0x3f, -%13 & 0x3f + %rotate 1 + %endrep +%endmacro + +SECTION_RODATA 64 + +lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 + db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 + db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 + dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 + dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 + dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 + dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 + dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 + dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 + dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 +px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 +cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 +gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 + dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 + dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 + dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 +pri_tap: db 64, 64, 32, 32, 48, 48, 48, 
48 ; left-shifted by 4 +sec_tap: db 32, 32, 16, 16 +pd_268435568: dd 268435568 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 +shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_128: times 2 dw 128 +pw_2048: times 2 dw 2048 +tap_table: ; masks for 8 bit shifts + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights + db 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 SECTION .text @@ -680,4 +718,170 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 psrld xm2, 10 movd [varq], xm2 RET + +%if WIN64 +DECLARE_REG_TMP 5, 6 +%else +DECLARE_REG_TMP 8, 5 +%endif + +; lut: +; t0 t1 t2 t3 t4 t5 t6 t7 +; T0 T1 T2 T3 T4 T5 T6 T7 +; L0 L1 00 01 02 03 04 05 +; L2 L3 10 11 12 13 14 15 +; L4 L5 20 21 22 23 24 25 +; L6 L7 30 31 32 33 34 35 +; 4e 4f 40 41 42 43 44 45 +; 5e 5f 50 51 52 53 54 55 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +%define base r7-edge_mask + movq xmm0, [dstq+strideq*0] + movhps xmm0, [dstq+strideq*1] + lea r7, [edge_mask] + movq xmm1, [topq+strideq*0-2] + movhps xmm1, [topq+strideq*1-2] + mov r6d, edgem + vinserti32x4 ym0, ymm0, [leftq], 1 + lea r2, [strideq*3] + vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 + mova m5, [base+lut_perm_4x4] + vinserti32x4 m0, [dstq+r2], 2 + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r3, [dstq+strideq*4-4] + vinserti32x4 m1, [r3+strideq*0], 2 + vinserti32x4 m0, [r3+strideq*1], 3 +.main: + movifnidn prid, prim + mov t0d, dirm + mova m3, [base+px_idx] + mov r3d, dampingm + vpermi2b m5, m0, m1 ; lut + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m7, m7 + lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m6, m3, m5 ; px + cmp r6d, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 +%macro CDEF_FILTER_4x4_PRI 0 + vpcmpub k1, m6, m1, 6 ; px > pN + psubb m2, m1, m6 + lzcnt r6d, prid + vpsubb m2{k1}, m6, m1 ; abs(diff) + vpbroadcastb m4, prim + and prid, 1 + vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift + movifnidn t1d, secm + vpbroadcastd m10, [base+pri_tap+priq*4] + vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) + psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) + pminub m2, m4 + vpdpbusd m0, m2, m10 ; sum +%endmacro + CDEF_FILTER_4x4_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m4, m6, m1 + pmaxub m1, m6 + pminub m5, m2, m3 + pmaxub m2, m3 + pminub m4, m5 + pmaxub m2, m1 + psrldq m1, m4, 2 + psrldq m3, m2, 2 + pminub m1, m4 + vpcmpw k1, m0, m7, 1 + vpshldd m6, m0, 8 + pmaxub m2, m3 + pslldq m3, m1, 1 + psubw m7, m0 + paddusw m0, m6 ; clip >0xff + vpsubusw m0{k1}, m6, m7 ; clip <0x00 + pslldq m4, m2, 1 + pminub m1, m3 + pmaxub m2, m4 + pmaxub m0, m1 + pminub m0, m2 + jmp .end +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + vpshldd m6, m0, 8 ; (px << 
8) + ((sum > -8) << 4) + paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) +.end: + mova xm1, [base+end_perm] + vpermb m0, m1, m0 ; output in bits 8-15 of each dword + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + vpbroadcastq m8, [base+edge_mask+r6*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m8, m2 ; index in-range + mova m1, m6 + vpermb m1{k1}, m2, m5 + CDEF_FILTER_4x4_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m8, m4 + mova m2, m6 + vpermb m2{k1}, m4, m5 + vpshufbitqmb k1, m8, m9 + mova m3, m6 + vpermb m3{k1}, m9, m5 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 + vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 +.sec_main: + vpbroadcastd m8, [base+sec_tap] + vpcmpub k1, m6, m2, 6 + psubb m4, m2, m6 + vpbroadcastb m12, t1d + lzcnt t1d, t1d + vpsubb m4{k1}, m6, m2 + vpcmpub k2, m6, m3, 6 + vpbroadcastq m11, [r3+t1*8] + gf2p8affineqb m10, m4, m11, 0 + psubb m5, m3, m6 + mova m9, m8 + vpsubb m8{k1}, m7, m8 + psubusb m10, m12, m10 + vpsubb m5{k2}, m6, m3 + pminub m4, m10 + vpdpbusd m0, m4, m8 + gf2p8affineqb m11, m5, m11, 0 + vpsubb m9{k2}, m7, m9 + psubusb m12, m11 + pminub m5, m12 + vpdpbusd m0, m5, m9 + ret + %endif ; ARCH_X86_64 From dd1865d243f30504d3ba9da6e2df1c14802bb0f2 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Wed, 12 Feb 2020 14:25:21 +0100 Subject: [PATCH 064/155] x86: add a seperate fully edged case to cdef_filter_avx2 --------------------- fully edged blocks perf ------------------------------------------ before: cdef_filter_4x4_8bpc_avx2: 91.0 after: cdef_filter_4x4_8bpc_avx2: 75.7 --------------------- before: cdef_filter_4x8_8bpc_avx2: 154.6 after: cdef_filter_4x8_8bpc_avx2: 131.8 --------------------- before: cdef_filter_8x8_8bpc_avx2: 214.1 after: cdef_filter_8x8_8bpc_avx2: 195.9 ------------------------------------------ --- src/x86/cdef.asm | 1199 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 1070 insertions(+), 129 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index a0e3655a98..46c2ddb60a 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -44,6 +44,24 @@ %endrep %endmacro +%macro JMP_TABLE 2-* + %xdefine %1_jmptable %%table + %xdefine %%base mangle(private_prefix %+ _%1_avx2) + %%table: + %rep %0 - 1 + dd %%base %+ .%2 - %%table + %rotate 1 + %endrep +%endmacro + +%macro CDEF_FILTER_JMP_TABLE 1 +JMP_TABLE cdef_filter_%1, \ + d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ + d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1 +%endmacro + SECTION_RODATA 64 lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 @@ -68,8 +86,19 @@ end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 sec_tap: db 32, 32, 16, 16 pd_268435568: dd 268435568 -div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 +blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 
0x80, 0x00, 0x00 + dd 0x80, 0x00, 0x00 +blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + dd 0x00, 0x00 +blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000 +blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000, 0x0000 +blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 +blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_128: times 2 dw 128 @@ -94,31 +123,94 @@ tap_table: ; masks for 8 bit shifts db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 +CDEF_FILTER_JMP_TABLE 4x4 +CDEF_FILTER_JMP_TABLE 4x8 +CDEF_FILTER_JMP_TABLE 8x8 + SECTION .text -%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride +%macro ACCUMULATE_TAP_BYTE 7 ; tap_offset, shift, mask, strength, mul_tap, w, h + ; load p0/p1 + movsxd dirjmpq, [dirq+kq*4+%1*2*4] + add dirjmpq, tableq + call dirjmpq + + pmaxub m7, m5 + pminub m8, m5 + pmaxub m7, m6 + pminub m8, m6 + + ; accumulate sum[m15] over p0/p1 +%if %7 == 4 + punpcklbw m5, m6 + punpcklbw m6, m4, m4 + psubusb m9, m5, m6 + psubusb m5, m6, m5 + por m9, m5 ; abs_diff_p01(p01 - px) + pcmpeqb m5, m9 + por m5, m3 + psignb m6, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + pminub m5, m9 + pmaddubsw m5, m6 + paddw m15, m5 +%else + psubusb m9, m5, m4 + psubusb m5, m4, m5 + psubusb m11, m6, m4 + psubusb m6, m4, m6 + por m9, m5 ; abs_diff_p0(p0 - px) + por m11, m6 ; abs_diff_p1(p1 - px) + pcmpeqb m5, m9 + pcmpeqb m6, m11 + punpckhbw m10, m9, m11 + punpcklbw m9, m11 + por m5, m3 + por m11, m6, m3 + punpckhbw m6, m5, m11 + punpcklbw m5, m11 + psignb m11, %5, m6 + psrlw m6, m10, %2 ; emulate 8-bit shift + pand m6, %3 + psubusb m6, %4, m6 + pminub m6, m10 + pmaddubsw m6, m11 + paddw m12, m6 + psignb m11, %5, m5 + psrlw m5, m9, %2 ; emulate 8-bit shift + pand m5, %3 + psubusb m5, %4, m5 + pminub m5, m9 + pmaddubsw m5, m11 + paddw m15, m5 +%endif +%endmacro + +%macro ACCUMULATE_TAP_WORD 6 ; tap_offset, shift, mask, strength, mul_tap, w ; load p0/p1 movsx offq, byte [dirq+kq+%1] ; off1 %if %6 == 4 - movq xm5, [stkq+offq*2+%7*0] ; p0 - movq xm6, [stkq+offq*2+%7*2] - movhps xm5, [stkq+offq*2+%7*1] - movhps xm6, [stkq+offq*2+%7*3] + movq xm5, [stkq+offq*2+32*0] ; p0 + movq xm6, [stkq+offq*2+32*2] + movhps xm5, [stkq+offq*2+32*1] + movhps xm6, [stkq+offq*2+32*3] vinserti128 m5, xm6, 1 %else - movu xm5, [stkq+offq*2+%7*0] ; p0 - vinserti128 m5, [stkq+offq*2+%7*1], 1 + movu xm5, [stkq+offq*2+32*0] ; p0 + vinserti128 m5, [stkq+offq*2+32*1], 1 %endif neg offq ; -off1 %if %6 == 4 - movq xm6, [stkq+offq*2+%7*0] ; p1 - movq xm9, [stkq+offq*2+%7*2] - movhps xm6, [stkq+offq*2+%7*1] - movhps xm9, [stkq+offq*2+%7*3] + movq xm6, [stkq+offq*2+32*0] ; p1 + movq xm9, [stkq+offq*2+32*2] + movhps xm6, [stkq+offq*2+32*1] + movhps xm9, [stkq+offq*2+32*3] vinserti128 m6, xm9, 1 %else - movu xm6, [stkq+offq*2+%7*0] ; p1 - vinserti128 m6, [stkq+offq*2+%7*1], 1 + movu xm6, [stkq+offq*2+32*0] ; p1 + vinserti128 m6, [stkq+offq*2+32*1], 1 %endif ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. 
@@ -151,19 +243,868 @@ SECTION .text paddw m15, m5 %endmacro -%macro CDEF_FILTER 3 ; w, h, stride +%macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -%if %1 != 4 || %2 != 8 -cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \ - dst, stride, left, top, pri, sec, stride3, dst4, edge +cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%assign stack_offset_entry stack_offset + mov edged, edgem + cmp edged, 0xf + jne .border_block + + PUSH r9 + PUSH r10 + PUSH r11 +%if %2 == 4 + %assign regs_used 12 + %if WIN64 + PUSH r%+regs_used + %assign regs_used regs_used+1 + %endif + ALLOC_STACK 0x60, 16 + pmovzxbw xm0, [leftq+1] + vpermq m0, m0, q0110 + psrldq m1, m0, 4 + vpalignr m2, m0, m0, 12 + movu [rsp+0x10], m0 + movu [rsp+0x28], m1 + movu [rsp+0x40], m2 %else -cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ - dst, stride, left, top, pri, sec, stride3, dst4, edge + PUSH r12 + %if %1 == 4 + %assign regs_used 13 + %if WIN64 + PUSH r%+regs_used + %assign regs_used regs_used+1 + %endif + ALLOC_STACK 8*2+%1*%2*1, 16 + pmovzxwd m0, [leftq] + mova [rsp+0x10], m0 + %else + PUSH r13 + %assign regs_used 14 + %if WIN64 + PUSH r%+regs_used + %assign regs_used regs_used+1 + %endif + ALLOC_STACK 8*2+%1*%2*2+32, 16 + lea r11, [strideq*3] + movu xm4, [dstq+strideq*2] + pmovzxwq m0, [leftq+0] + pmovzxwq m1, [leftq+8] + vinserti128 m4, [dstq+r11], 1 + pmovzxbd m2, [leftq+1] + pmovzxbd m3, [leftq+9] + mova [rsp+0x10], m0 + mova [rsp+0x30], m1 + mova [rsp+0x50], m2 + mova [rsp+0x70], m3 + mova [rsp+0x90], m4 + %endif %endif -%define px rsp+2*16+2*%3 + + DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping + movifnidn prid, prim +%if UNIX64 + movd xm0, prid + movd xm1, secdmpd +%endif + mov dampingd, r7m + lzcnt pridmpd, prid + lzcnt secdmpd, secdmpm + sub dampingd, 31 + xor zerod, zerod + add pridmpd, dampingd + cmovs pridmpd, zerod + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir +%if UNIX64 + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength +%else + vpbroadcastb m0, prim + vpbroadcastb m1, secm +%endif + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] + mov dird, r6m + lea tableq, [cdef_filter_%1x%2_jmptable] + lea dirq, [tableq+dirq*2*4] +%if %1 == 4 + %if %2 == 4 + DEFINE_ARGS dst, stride, left, top, pri, sec, \ + table, dir, dirjmp, dst4, stride3, k + %else + DEFINE_ARGS dst, stride, left, top, pri, sec, \ + table, dir, dirjmp, dst4, dst8, stride3, k + lea dst8q, [dstq+strideq*8] + %endif +%else + DEFINE_ARGS dst, stride, h, top1, pri, sec, \ + table, dir, dirjmp, top2, dst4, stride3, k + mov hq, -8 + lea top1q, [top1q+strideq*0] + lea top2q, [top1q+strideq*1] +%endif + lea dst4q, [dstq+strideq*4] +%if %1 == 4 + lea stride3q, [strideq*3] +%endif +%if %1*%2 > mmsize +.v_loop: +%endif + mov kd, 1 + pxor m15, m15 ; sum +%if %2 == 8 + pxor m12, m12 + %if %1 == 4 + movd xm4, [dstq +strideq*0] + movd xm6, [dstq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm7, [dstq +stride3q ] + vinserti128 m4, [dst4q+strideq*0], 1 + vinserti128 m6, [dst4q+strideq*1], 1 + vinserti128 
m5, [dst4q+strideq*2], 1 + vinserti128 m7, [dst4q+stride3q ], 1 + punpckldq m4, m6 + punpckldq m5, m7 + %else + movq xm4, [dstq+strideq*0] + movq xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + %endif + punpcklqdq m4, m5 +%else + movd xm4, [dstq+strideq*0] + movd xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + punpckldq m4, m5 +%endif + mova m7, m4 ; min + mova m8, m4 ; max +.k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + dec kq + jge .k_loop + + vpbroadcastd m10, [pw_2048] + pxor m9, m9 +%if %2 == 4 + punpcklbw m4, m9 + pcmpgtw m9, m15 + paddw m15, m9 + pmulhrsw m15, m10 + paddw m4, m15 + packuswb m4, m4 ; clip px in [0x0,0xff] + pminub m4, m7 + pmaxub m4, m8 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + movd [dstq+strideq*2], xm5 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+stride3q ], xm5, 1 +%else + pcmpgtw m6, m9, m12 + pcmpgtw m5, m9, m15 + paddw m12, m6 + paddw m15, m5 + punpckhbw m5, m4, m9 + punpcklbw m4, m9 + pmulhrsw m12, m10 + pmulhrsw m15, m10 + paddw m5, m12 + paddw m4, m15 + packuswb m4, m5 ; clip px in [0x0,0xff] + pminub m4, m7 + pmaxub m4, m8 + vextracti128 xm5, m4, 1 + %if %1 == 4 + movd [dstq +strideq*0], xm4 + movd [dst4q+strideq*0], xm5 + pextrd [dstq +strideq*1], xm4, 1 + pextrd [dst4q+strideq*1], xm5, 1 + pextrd [dstq +strideq*2], xm4, 2 + pextrd [dst4q+strideq*2], xm5, 2 + pextrd [dstq +stride3q ], xm4, 3 + pextrd [dst4q+stride3q ], xm5, 3 + %else + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+stride3q ], xm5 + %endif +%endif +%if %1*%2 > mmsize + mov dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] + lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .v_loop +%endif + RET + +.d0k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-1] + vpbroadcastq m10, [dstq+strideq*2-1] + movd xm5, [topq+strideq*1+1] + movd xm9, [dstq+strideq*0+1] + psrldq m11, m6, 2 + psrldq m12, m10, 2 + vinserti128 m6, [dstq+stride3q -1], 1 + vinserti128 m10, [dstq+strideq*4-1], 1 + vpblendd m5, m11, 0x10 + vpblendd m9, m12, 0x10 + movu m11, [blend_4x4+16] + punpckldq m6, m10 + punpckldq m5, m9 + vpblendvb m6, [rsp+gprsize+0x28], m11 + %else + movd xm5, [topq +strideq*1+1] + movq xm6, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm5, [dstq +strideq*0+1], 1 + movhps xm6, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm9, xm6, 2 + shufps xm5, xm9, q2010 ; -1 +0 +1 +2 + shufps xm6, xm10, q2020 ; +1 +2 +3 +4 + psrldq xm9, xm11, 2 + psrldq xm10, 2 + shufps xm10, xm9, q2020 ; +3 +4 +5 +6 + movd xm9, [dst4q+stride3q -1] + pinsrd xm9, [dst4q+strideq*4-1], 1 + shufps xm11, xm9, q1020 ; +5 +6 +7 +8 + pmovzxbw m9, [leftq+3] + vinserti128 m6, xm11, 1 + movu m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [top2q +1] + vbroadcasti128 m10, [dstq+strideq*1-1] + vbroadcasti128 m11, [dstq+strideq*2-1] + movhps xm5, [dstq+strideq*0+1] + vinserti128 m6, m10, [dstq+stride3q -1], 1 + vinserti128 m9, m11, [dstq+strideq*4-1], 1 + psrldq m10, 2 + psrldq m11, 2 + punpcklqdq m6, m9 + movu m9, [r13+hq*2*1+16*1] + 
punpcklqdq m10, m11 + vpblendd m5, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9 +%endif + ret +.d1k0: +.d2k0: +.d3k0: +%if %1 == 4 + %if %2 == 4 + movq xm6, [dstq+strideq*0-1] + movq xm9, [dstq+strideq*1-1] + vinserti128 m6, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m11, [rsp+gprsize+0x10] + pcmpeqd m12, m12 + psrldq m5, m6, 2 + psrldq m10, m9, 2 + psrld m12, 24 + punpckldq m6, m9 + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movq xm6, [dstq +strideq*0-1] + movq xm9, [dstq +strideq*2-1] + movhps xm6, [dstq +strideq*1-1] + movhps xm9, [dstq +stride3q -1] + movq xm10, [dst4q+strideq*0-1] + movhps xm10, [dst4q+strideq*1-1] + psrldq xm5, xm6, 2 + psrldq xm11, xm9, 2 + shufps xm5, xm11, q2020 + movq xm11, [dst4q+strideq*2-1] + movhps xm11, [dst4q+stride3q -1] + shufps xm6, xm9, q2020 + shufps xm9, xm10, xm11, q2020 + vinserti128 m6, xm9, 1 + pmovzxbw m9, [leftq+1] + psrldq xm10, 2 + psrldq xm11, 2 + shufps xm10, xm11, q2020 + vpbroadcastd m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + movu xm5, [dstq+strideq*0-1] + movu xm9, [dstq+strideq*1-1] + vinserti128 m5, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + mova m10, [blend_8x8_0+16] + punpcklqdq m6, m5, m9 + vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d4k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m10, [dstq+strideq*1-1] + vpbroadcastq m11, [dstq+strideq*2-1] + movd xm6, [topq+strideq*1-1] + movd xm9, [dstq+strideq*0-1] + psrldq m5, m10, 2 + psrldq m12, m11, 2 + vpblendd m6, m10, 0x10 + vpblendd m9, m11, 0x10 + movu m10, [blend_4x4] + vinserti128 m5, [dstq+stride3q +1], 1 + vinserti128 m12, [dstq+strideq*4+1], 1 + punpckldq m6, m9 + punpckldq m5, m12 + vpblendvb m6, [rsp+gprsize+0x40], m10 + %else + movd xm6, [topq +strideq*1-1] + movq xm9, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm6, [dstq +strideq*0-1], 1 + movhps xm9, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm5, xm9, 2 + shufps xm6, xm9, q2010 + psrldq xm9, xm10, 2 + shufps xm5, xm9, q2020 + shufps xm10, xm11, q2020 + movd xm9, [dst4q+stride3q +1] + vinserti128 m6, xm10, 1 + pinsrd xm9, [dst4q+strideq*4+1], 1 + psrldq xm11, 2 + pmovzxbw m10, [leftq-1] + shufps xm11, xm9, q1020 + movu m9, [blend_4x8_0] + vinserti128 m5, xm11, 1 + vpblendvb m6, m10, m9 + %endif +%else + lea r13, [blend_8x8_0+8] + movq xm6, [top2q -1] + vbroadcasti128 m5, [dstq+strideq*1-1] + vbroadcasti128 m9, [dstq+strideq*2-1] + movhps xm6, [dstq+strideq*0-1] + movu m11, [r13+hq*2*1+16*1] + punpcklqdq m10, m5, m9 + vinserti128 m5, [dstq+stride3q -1], 1 + vinserti128 m9, [dstq+strideq*4-1], 1 + vpblendd m6, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d5k0: +.d6k0: +.d7k0: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*1 ] + vpbroadcastd m5, [dstq+strideq*1 ] + vpbroadcastd m9, [dstq+strideq*2 ] + vpblendd xm6, [dstq+strideq*0-4], 0x2 + vpblendd m5, m9, 0x22 + vpblendd m6, m5, 0x30 + vinserti128 m5, [dstq+stride3q ], 1 + vpblendd m5, [dstq+strideq*4-20], 0x20 + %else + movd xm6, [topq +strideq*1] + movd xm5, [dstq +strideq*1] + movd xm9, [dstq +stride3q ] + movd xm10, [dst4q+strideq*1] + movd xm11, [dst4q+stride3q ] + pinsrd xm6, [dstq +strideq*0], 1 + pinsrd xm5, [dstq +strideq*2], 1 + pinsrd xm9, [dst4q+strideq*0], 1 + pinsrd xm10, [dst4q+strideq*2], 1 
+ pinsrd xm11, [dst4q+strideq*4], 1 + punpcklqdq xm6, xm5 + punpcklqdq xm5, xm9 + punpcklqdq xm9, xm10 + punpcklqdq xm10, xm11 + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + %endif +%else + movq xm6, [top2q ] + movq xm5, [dstq+strideq*1] + movq xm9, [dstq+stride3q ] + movhps xm6, [dstq+strideq*0] + movhps xm5, [dstq+strideq*2] + movhps xm9, [dstq+strideq*4] + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 +%endif + ret +.d0k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [dstq +strideq*2-2] + movd xm9, [dstq +stride3q -2] + movd xm5, [topq +strideq*0+2] + movd xm10, [topq +strideq*1+2] + pinsrw xm6, [leftq+4], 0 + pinsrw xm9, [leftq+6], 0 + vinserti128 m5, [dstq +strideq*0+2], 1 + vinserti128 m10, [dstq +strideq*1+2], 1 + vinserti128 m6, [dst4q+strideq*0-2], 1 + vinserti128 m9, [dst4q+strideq*1-2], 1 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movq xm6, [dstq +strideq*2-2] + movd xm10, [dst4q+strideq*2-2] + movd xm5, [topq +strideq*0+2] + movq xm9, [dst4q+strideq*0-2] + movhps xm6, [dstq +stride3q -2] + pinsrw xm10, [dst4q+stride3q ], 3 + pinsrd xm5, [topq +strideq*1+2], 1 + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst8q+strideq*0-2], 2 + pinsrd xm5, [dstq +strideq*0+2], 2 + pinsrd xm10, [dst8q+strideq*1-2], 3 + pinsrd xm5, [dstq +strideq*1+2], 3 + shufps xm11, xm6, xm9, q3131 + shufps xm6, xm9, q2020 + movu m9, [blend_4x8_3+8] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm11, 1 + vpblendvb m6, [rsp+gprsize+16+8], m9 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm6, [dstq +strideq*2-2] + movq xm9, [dstq +stride3q -2] + movq xm5, [top1q +2] + movq xm10, [top2q +2] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m6, [dst4q+strideq*0-2], 1 + vinserti128 m9, [dst4q+strideq*1-2], 1 + vinserti128 m5, [dstq +strideq*0+2], 1 + vinserti128 m10, [dstq +strideq*1+2], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11 +%endif + ret +.d1k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-2] + vpbroadcastq m9, [dstq+strideq*2-2] + movd xm5, [topq+strideq*1+2] + movd xm10, [dstq+strideq*0+2] + psrldq m11, m6, 4 + psrldq m12, m9, 4 + vpblendd m5, m11, 0x10 + movq xm11, [leftq+2] + vinserti128 m6, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + vpblendd m10, m12, 0x10 + pcmpeqd m12, m12 + pmovzxwd m11, xm11 + psrld m12, 16 + punpckldq m6, m9 + vpbroadcastd m9, [dstq+strideq*4-2] + vpblendvb m6, m11, m12 + punpckldq m5, m10 + vpblendd m6, m9, 0x20 + %else + movd xm5, [topq +strideq*1+2] + movq xm6, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q -2] + pinsrd xm5, [dstq +strideq*0+2], 1 + movhps xm6, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [dst4q+strideq*4-2], 1 + shufps xm5, xm6, q3110 + shufps xm6, xm9, q2020 + shufps xm9, xm10, q3131 + shufps xm10, xm11, q1020 + movu m11, [blend_4x8_2+4] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm9, 1 + vpblendvb m6, [rsp+gprsize+16+4], m11 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm5, [top2q +2] + vbroadcasti128 m6, [dstq+strideq*1-2] + vbroadcasti128 m9, [dstq+strideq*2-2] + movhps xm5, [dstq+strideq*0+2] + shufps m10, m6, m9, q2121 + vinserti128 m6, [dstq+stride3q -2], 1 + vinserti128 m9, [dstq+strideq*4-2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m5, m10, 0xF0 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11 +%endif + ret +.d2k1: +%if %1 == 4 + %if %2 == 4 + movq xm11, [leftq] + movq xm6, [dstq+strideq*0-2] + movq xm9, 
[dstq+strideq*1-2] + vinserti128 m6, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + psrldq m5, m6, 4 + psrldq m10, m9, 4 + pmovzxwd m11, xm11 + punpckldq m6, m9 + punpckldq m5, m10 + pblendw m6, m11, 0x05 + %else + movq xm5, [dstq +strideq*0-2] + movq xm9, [dstq +strideq*2-2] + movq xm10, [dst4q+strideq*0-2] + movq xm11, [dst4q+strideq*2-2] + movhps xm5, [dstq +strideq*1-2] + movhps xm9, [dstq +stride3q -2] + movhps xm10, [dst4q+strideq*1-2] + movhps xm11, [dst4q+stride3q -2] + shufps xm6, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + shufps xm9, xm10, xm11, q2020 + shufps xm10, xm11, q3131 + pmovzxwd m11, [leftq] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + pblendw m6, m11, 0x55 + %endif +%else + mova m11, [rsp+gprsize+16+hq*8+64] + movu xm5, [dstq+strideq*0-2] + movu xm9, [dstq+strideq*1-2] + vinserti128 m5, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + shufps m6, m5, m9, q1010 + shufps m5, m9, q2121 + pblendw m6, m11, 0x11 +%endif + ret +.d3k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m11, [dstq+strideq*1-2] + vpbroadcastq m12, [dstq+strideq*2-2] + movd xm6, [topq+strideq*1-2] + movd xm9, [dstq+strideq*0-2] + pblendw m11, [leftq-16+2], 0x01 + pblendw m12, [leftq-16+4], 0x01 + pinsrw xm9, [leftq- 0+0], 0 + psrldq m5, m11, 4 + psrldq m10, m12, 4 + vinserti128 m5, [dstq+stride3q +2], 1 + vinserti128 m10, [dstq+strideq*4+2], 1 + vpblendd m6, m11, 0x10 + vpblendd m9, m12, 0x10 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm6, [topq +strideq*1-2] + movq xm5, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q +2] + pinsrw xm6, [dstq +strideq*0 ], 3 + movhps xm5, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [dst4q+strideq*4+2], 1 + shufps xm6, xm5, q2010 + shufps xm5, xm9, q3131 + shufps xm9, xm10, q2020 + shufps xm10, xm11, q1031 + movu m11, [blend_4x8_2] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+16-4], m11 + %endif +%else + lea r13, [blend_8x8_1+8] + movq xm6, [top2q -2] + vbroadcasti128 m5, [dstq+strideq*1-2] + vbroadcasti128 m10, [dstq+strideq*2-2] + movhps xm6, [dstq+strideq*0-2] + punpcklqdq m9, m5, m10 + vinserti128 m5, [dstq+stride3q -2], 1 + vinserti128 m10, [dstq+strideq*4-2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m6, m9, 0xF0 + shufps m5, m10, q2121 + vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11 +%endif + ret +.d4k1: +%if %1 == 4 + %if %2 == 4 + vinserti128 m6, [dstq +strideq*0-2], 1 + vinserti128 m9, [dstq +strideq*1-2], 1 + movd xm5, [dstq +strideq*2+2] + movd xm10, [dstq +stride3q +2] + pblendw m6, [leftq-16+0], 0x01 + pblendw m9, [leftq-16+2], 0x01 + vinserti128 m5, [dst4q+strideq*0+2], 1 + vinserti128 m10, [dst4q+strideq*1+2], 1 + vpblendd m6, [topq +strideq*0-2], 0x01 + vpblendd m9, [topq +strideq*1-2], 0x01 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movd xm6, [topq +strideq*0-2] + movq xm5, [dstq +strideq*2-2] + movq xm9, [dst4q+strideq*0-2] + movd xm10, [dst4q+strideq*2+2] + pinsrd xm6, [topq +strideq*1-2], 1 + movhps xm5, [dstq +stride3q -2] + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst4q+stride3q +2], 1 + pinsrd xm6, [dstq +strideq*0-2], 2 + pinsrd xm10, [dst8q+strideq*0+2], 2 + pinsrd xm6, [dstq +strideq*1-2], 3 + pinsrd xm10, [dst8q+strideq*1+2], 3 + shufps xm11, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + movu m9, [blend_4x8_3] + vinserti128 m6, xm11, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+16-8], m9 + 
%endif +%else + lea r13, [blend_8x8_1] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -2] + movq xm9, [top2q -2] + movq xm5, [dstq +strideq*2+2] + movq xm10, [dstq +stride3q +2] + vinserti128 m6, [dstq +strideq*0-2], 1 + vinserti128 m9, [dstq +strideq*1-2], 1 + vinserti128 m5, [dst4q+strideq*0+2], 1 + vinserti128 m10, [dst4q+strideq*1+2], 1 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11 + punpcklqdq m5, m10 +%endif + ret +.d5k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq +strideq*0-1] + movd xm9, [topq +strideq*1-1] + movd xm5, [dstq +strideq*2+1] + movd xm10, [dstq +stride3q +1] + pcmpeqd m12, m12 + pmovzxbw m11, [leftq-8+1] + psrld m12, 24 + vinserti128 m6, [dstq +strideq*0-1], 1 + vinserti128 m9, [dstq +strideq*1-1], 1 + vinserti128 m5, [dst4q+strideq*0+1], 1 + vinserti128 m10, [dst4q+strideq*1+1], 1 + punpckldq m6, m9 + pxor m9, m9 + vpblendd m12, m9, 0x0F + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movd xm6, [topq +strideq*0-1] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movd xm10, [dst4q+strideq*2+1] + pinsrd xm6, [topq +strideq*1-1], 1 + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + pinsrd xm10, [dst4q+stride3q +1], 1 + pinsrd xm6, [dstq +strideq*0-1], 2 + pinsrd xm10, [dst8q+strideq*0+1], 2 + pinsrd xm6, [dstq +strideq*1-1], 3 + pinsrd xm10, [dst8q+strideq*1+1], 3 + shufps xm11, xm5, xm9, q2020 + vinserti128 m6, xm11, 1 + pmovzxbw m11, [leftq-3] + psrldq xm5, 2 + psrldq xm9, 2 + shufps xm5, xm9, q2020 + movu m9, [blend_4x8_1] + vinserti128 m5, xm10, 1 + vpblendvb m6, m11, m9 + %endif +%else + lea r13, [blend_8x8_0] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -1] + movq xm9, [top2q -1] + movq xm5, [dstq +strideq*2+1] + movq xm10, [dstq +stride3q +1] + vinserti128 m6, [dstq +strideq*0-1], 1 + vinserti128 m9, [dstq +strideq*1-1], 1 + vinserti128 m5, [dst4q+strideq*0+1], 1 + vinserti128 m10, [dst4q+strideq*1+1], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11 +%endif + ret +.d6k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq +strideq*0] + movd xm9, [topq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm10, [dstq +stride3q ] + vinserti128 m6, [dstq +strideq*0], 1 + vinserti128 m9, [dstq +strideq*1], 1 + vinserti128 m5, [dst4q+strideq*0], 1 + vinserti128 m10, [dst4q+strideq*1], 1 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm5, [dstq +strideq*2] + movd xm6, [topq +strideq*0] + movd xm9, [dst4q+strideq*2] + pinsrd xm5, [dstq +stride3q ], 1 + pinsrd xm6, [topq +strideq*1], 1 + pinsrd xm9, [dst4q+stride3q ], 1 + pinsrd xm5, [dst4q+strideq*0], 2 + pinsrd xm6, [dstq +strideq*0], 2 + pinsrd xm9, [dst8q+strideq*0], 2 + pinsrd xm5, [dst4q+strideq*1], 3 + pinsrd xm6, [dstq +strideq*1], 3 + pinsrd xm9, [dst8q+strideq*1], 3 + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 + %endif +%else + movq xm5, [dstq +strideq*2] + movq xm9, [dst4q+strideq*0] + movq xm6, [top1q ] + movq xm10, [dstq +strideq*0] + movhps xm5, [dstq +stride3q ] + movhps xm9, [dst4q+strideq*1] + movhps xm6, [top2q ] + movhps xm10, [dstq +strideq*1] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 +%endif + ret +.d7k1: +%if %1 == 4 + %if %2 == 4 + movd xm5, [dstq +strideq*2-1] + movd xm9, [dstq +stride3q -1] + movd xm6, [topq +strideq*0+1] + movd xm10, [topq +strideq*1+1] + pinsrb xm5, [leftq+ 5], 0 + pinsrb xm9, [leftq+ 7], 0 + vinserti128 m6, [dstq +strideq*0+1], 1 + vinserti128 m10, [dstq +strideq*1+1], 1 + vinserti128 m5, [dst4q+strideq*0-1], 1 + vinserti128 m9, [dst4q+strideq*1-1], 1 
+ punpckldq m6, m10 + punpckldq m5, m9 + %else + movd xm6, [topq +strideq*0+1] + movq xm9, [dstq +strideq*2-1] + movq xm10, [dst4q+strideq*0-1] + movd xm11, [dst4q+strideq*2-1] + pinsrd xm6, [topq +strideq*1+1], 1 + movhps xm9, [dstq +stride3q -1] + movhps xm10, [dst4q+strideq*1-1] + pinsrd xm11, [dst4q+stride3q -1], 1 + pinsrd xm6, [dstq +strideq*0+1], 2 + pinsrd xm11, [dst8q+strideq*0-1], 2 + pinsrd xm6, [dstq +strideq*1+1], 3 + pinsrd xm11, [dst8q+strideq*1-1], 3 + shufps xm5, xm9, xm10, q2020 + vinserti128 m5, xm11, 1 + pmovzxbw m11, [leftq+5] + psrldq xm9, 2 + psrldq xm10, 2 + shufps xm9, xm10, q2020 + movu m10, [blend_4x8_1+8] + vinserti128 m6, xm9, 1 + vpblendvb m5, m11, m10 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movq xm6, [top1q +1] + movq xm10, [dstq +strideq*0+1] + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + movhps xm6, [top2q +1] + movhps xm10, [dstq +strideq*1+1] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 + vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11 +%endif + ret + +.border_block: + DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge +%define rstk rsp +%assign stack_offset stack_offset_entry +%if %1 == 4 && %2 == 8 + PUSH r9 + %assign regs_used 10 +%else + %assign regs_used 9 +%endif +%if WIN64 + PUSH r%+regs_used + %assign regs_used regs_used+1 +%endif + ALLOC_STACK 2*16+(%2+4)*32, 16 +%define px rsp+2*16+2*32 + pcmpeqw m14, m14 psllw m14, 15 ; 0x8000 - mov edged, r8m ; prepare pixel buffers - body/right %if %1 == 4 @@ -179,19 +1120,19 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ pmovzxbw m2, [dstq+strideq*1] pmovzxbw m3, [dstq+strideq*2] pmovzxbw m4, [dstq+stride3q] - mova [px+0*%3], m1 - mova [px+1*%3], m2 - mova [px+2*%3], m3 - mova [px+3*%3], m4 + mova [px+0*32], m1 + mova [px+1*32], m2 + mova [px+2*32], m3 + mova [px+3*32], m4 %if %2 == 8 pmovzxbw m1, [dst4q+strideq*0] pmovzxbw m2, [dst4q+strideq*1] pmovzxbw m3, [dst4q+strideq*2] pmovzxbw m4, [dst4q+stride3q] - mova [px+4*%3], m1 - mova [px+5*%3], m2 - mova [px+6*%3], m3 - mova [px+7*%3], m4 + mova [px+4*32], m1 + mova [px+5*32], m2 + mova [px+6*32], m3 + mova [px+7*32], m4 %endif jmp .body_done .no_right: @@ -204,24 +1145,24 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 - movq [px+0*%3], xm1 - movq [px+1*%3], xm2 - movq [px+2*%3], xm3 - movq [px+3*%3], xm4 + movq [px+0*32], xm1 + movq [px+1*32], xm2 + movq [px+2*32], xm3 + movq [px+3*32], xm4 %else pmovzxbw xm1, [dstq+strideq*0] pmovzxbw xm2, [dstq+strideq*1] pmovzxbw xm3, [dstq+strideq*2] pmovzxbw xm4, [dstq+stride3q] - mova [px+0*%3], xm1 - mova [px+1*%3], xm2 - mova [px+2*%3], xm3 - mova [px+3*%3], xm4 -%endif - movd [px+0*%3+%1*2], xm14 - movd [px+1*%3+%1*2], xm14 - movd [px+2*%3+%1*2], xm14 - movd [px+3*%3+%1*2], xm14 + mova [px+0*32], xm1 + mova [px+1*32], xm2 + mova [px+2*32], xm3 + mova [px+3*32], xm4 +%endif + movd [px+0*32+%1*2], xm14 + movd [px+1*32+%1*2], xm14 + movd [px+2*32+%1*2], xm14 + movd [px+3*32+%1*2], xm14 %if %2 == 8 %if %1 == 4 movd xm1, [dst4q+strideq*0] @@ -232,24 +1173,24 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 - movq [px+4*%3], xm1 - movq [px+5*%3], xm2 - movq [px+6*%3], xm3 - movq [px+7*%3], xm4 + movq [px+4*32], xm1 + movq [px+5*32], xm2 + movq [px+6*32], xm3 + movq [px+7*32], xm4 %else pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw 
xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] pmovzxbw xm4, [dst4q+stride3q] - mova [px+4*%3], xm1 - mova [px+5*%3], xm2 - mova [px+6*%3], xm3 - mova [px+7*%3], xm4 + mova [px+4*32], xm1 + mova [px+5*32], xm2 + mova [px+6*32], xm3 + mova [px+7*32], xm4 %endif - movd [px+4*%3+%1*2], xm14 - movd [px+5*%3+%1*2], xm14 - movd [px+6*%3+%1*2], xm14 - movd [px+7*%3+%1*2], xm14 + movd [px+4*32+%1*2], xm14 + movd [px+5*32+%1*2], xm14 + movd [px+6*32+%1*2], xm14 + movd [px+7*32+%1*2], xm14 %endif .body_done: @@ -262,48 +1203,48 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ jz .top_no_right pmovzxbw m1, [topq+strideq*0-(%1/2)] pmovzxbw m2, [topq+strideq*1-(%1/2)] - movu [px-2*%3-%1], m1 - movu [px-1*%3-%1], m2 + movu [px-2*32-%1], m1 + movu [px-1*32-%1], m2 jmp .top_done .top_no_right: pmovzxbw m1, [topq+strideq*0-%1] pmovzxbw m2, [topq+strideq*1-%1] - movu [px-2*%3-%1*2], m1 - movu [px-1*%3-%1*2], m2 - movd [px-2*%3+%1*2], xm14 - movd [px-1*%3+%1*2], xm14 + movu [px-2*32-%1*2], m1 + movu [px-1*32-%1*2], m2 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 jmp .top_done .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right pmovzxbw m1, [topq+strideq*0] pmovzxbw m2, [topq+strideq*1] - mova [px-2*%3+0], m1 - mova [px-1*%3+0], m2 - movd [px-2*%3-4], xm14 - movd [px-1*%3-4], xm14 + mova [px-2*32+0], m1 + mova [px-1*32+0], m2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 jmp .top_done .top_no_left_right: %if %1 == 4 movd xm1, [topq+strideq*0] pinsrd xm1, [topq+strideq*1], 1 pmovzxbw xm1, xm1 - movq [px-2*%3+0], xm1 - movhps [px-1*%3+0], xm1 + movq [px-2*32+0], xm1 + movhps [px-1*32+0], xm1 %else pmovzxbw xm1, [topq+strideq*0] pmovzxbw xm2, [topq+strideq*1] - mova [px-2*%3+0], xm1 - mova [px-1*%3+0], xm2 + mova [px-2*32+0], xm1 + mova [px-1*32+0], xm2 %endif - movd [px-2*%3-4], xm14 - movd [px-1*%3-4], xm14 - movd [px-2*%3+%1*2], xm14 - movd [px-1*%3+%1*2], xm14 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 jmp .top_done .no_top: - movu [px-2*%3-%1], m14 - movu [px-1*%3-%1], m14 + movu [px-2*32-%1], m14 + movu [px-1*32-%1], m14 .top_done: ; left @@ -313,27 +1254,27 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ %if %2 == 8 pmovzxbw xm2, [leftq+ 8] %endif - movd [px+0*%3-4], xm1 - pextrd [px+1*%3-4], xm1, 1 - pextrd [px+2*%3-4], xm1, 2 - pextrd [px+3*%3-4], xm1, 3 + movd [px+0*32-4], xm1 + pextrd [px+1*32-4], xm1, 1 + pextrd [px+2*32-4], xm1, 2 + pextrd [px+3*32-4], xm1, 3 %if %2 == 8 - movd [px+4*%3-4], xm2 - pextrd [px+5*%3-4], xm2, 1 - pextrd [px+6*%3-4], xm2, 2 - pextrd [px+7*%3-4], xm2, 3 + movd [px+4*32-4], xm2 + pextrd [px+5*32-4], xm2, 1 + pextrd [px+6*32-4], xm2, 2 + pextrd [px+7*32-4], xm2, 3 %endif jmp .left_done .no_left: - movd [px+0*%3-4], xm14 - movd [px+1*%3-4], xm14 - movd [px+2*%3-4], xm14 - movd [px+3*%3-4], xm14 + movd [px+0*32-4], xm14 + movd [px+1*32-4], xm14 + movd [px+2*32-4], xm14 + movd [px+3*32-4], xm14 %if %2 == 8 - movd [px+4*%3-4], xm14 - movd [px+5*%3-4], xm14 - movd [px+6*%3-4], xm14 - movd [px+7*%3-4], xm14 + movd [px+4*32-4], xm14 + movd [px+5*32-4], xm14 + movd [px+6*32-4], xm14 + movd [px+7*32-4], xm14 %endif .left_done: @@ -348,51 +1289,51 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ jz .bottom_no_right pmovzxbw m1, [dst8q-(%1/2)] pmovzxbw m2, [dst8q+strideq-(%1/2)] - movu [px+(%2+0)*%3-%1], m1 - movu [px+(%2+1)*%3-%1], m2 + movu [px+(%2+0)*32-%1], m1 + movu [px+(%2+1)*32-%1], m2 jmp .bottom_done .bottom_no_right: pmovzxbw m1, 
[dst8q-%1] pmovzxbw m2, [dst8q+strideq-%1] - movu [px+(%2+0)*%3-%1*2], m1 - movu [px+(%2+1)*%3-%1*2], m2 + movu [px+(%2+0)*32-%1*2], m1 + movu [px+(%2+1)*32-%1*2], m2 %if %1 == 8 - movd [px+(%2-1)*%3+%1*2], xm14 ; overwritten by previous movu + movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu %endif - movd [px+(%2+0)*%3+%1*2], xm14 - movd [px+(%2+1)*%3+%1*2], xm14 + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .bottom_no_left: test edgeb, 2 ; have_right jz .bottom_no_left_right pmovzxbw m1, [dst8q] pmovzxbw m2, [dst8q+strideq] - mova [px+(%2+0)*%3+0], m1 - mova [px+(%2+1)*%3+0], m2 - movd [px+(%2+0)*%3-4], xm14 - movd [px+(%2+1)*%3-4], xm14 + mova [px+(%2+0)*32+0], m1 + mova [px+(%2+1)*32+0], m2 + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 jmp .bottom_done .bottom_no_left_right: %if %1 == 4 movd xm1, [dst8q] pinsrd xm1, [dst8q+strideq], 1 pmovzxbw xm1, xm1 - movq [px+(%2+0)*%3+0], xm1 - movhps [px+(%2+1)*%3+0], xm1 + movq [px+(%2+0)*32+0], xm1 + movhps [px+(%2+1)*32+0], xm1 %else pmovzxbw xm1, [dst8q] pmovzxbw xm2, [dst8q+strideq] - mova [px+(%2+0)*%3+0], xm1 - mova [px+(%2+1)*%3+0], xm2 + mova [px+(%2+0)*32+0], xm1 + mova [px+(%2+1)*32+0], xm2 %endif - movd [px+(%2+0)*%3-4], xm14 - movd [px+(%2+1)*%3-4], xm14 - movd [px+(%2+0)*%3+%1*2], xm14 - movd [px+(%2+1)*%3+%1*2], xm14 + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .no_bottom: - movu [px+(%2+0)*%3-%1], m14 - movu [px+(%2+1)*%3-%1], m14 + movu [px+(%2+0)*32-%1], m14 + movu [px+(%2+1)*32-%1], m14 .bottom_done: ; actual filter @@ -453,32 +1394,32 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ lea stkq, [px] pxor m11, m11 %if %1*%2*2/mmsize > 1 -.v_loop: +.border_v_loop: %endif mov kd, 1 %if %1 == 4 - movq xm4, [stkq+%3*0] - movhps xm4, [stkq+%3*1] - movq xm5, [stkq+%3*2] - movhps xm5, [stkq+%3*3] + movq xm4, [stkq+32*0] + movhps xm4, [stkq+32*1] + movq xm5, [stkq+32*2] + movhps xm5, [stkq+32*3] vinserti128 m4, xm5, 1 %else - mova xm4, [stkq+%3*0] ; px - vinserti128 m4, [stkq+%3*1], 1 + mova xm4, [stkq+32*0] ; px + vinserti128 m4, [stkq+32*1], 1 %endif pxor m15, m15 ; sum mova m7, m4 ; max mova m8, m4 ; min -.k_loop: +.border_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps - ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3 - ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3 - ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3 + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 dec kq - jge .k_loop + jge .border_k_loop vpbroadcastd m10, [pw_2048] pcmpgtw m9, m11, m15 @@ -502,17 +1443,17 @@ cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \ %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] - add stkq, %3*vloop_lines + add stkq, 32*vloop_lines dec hd - jg .v_loop + jg .border_v_loop %endif RET %endmacro -CDEF_FILTER 8, 8, 32 -CDEF_FILTER 4, 8, 32 -CDEF_FILTER 4, 4, 32 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 INIT_YMM avx2 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 From bf2dfd36346038f77fb5731041a07d0bbbb74cd8 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Fri, 14 Feb 2020 15:46:20 +0100 Subject: [PATCH 065/155] x86: optimize cdef_filter_{4x{4,8},8x8}_avx2 Add 2 seperate code paths for pri/sec strengths equal 0. 
Having both strengths not equal to 0 is uncommon, branching to skip unnecessary computations is therefore beneficial. ------------------------------------------ before: cdef_filter_4x4_8bpc_avx2: 93.8 after: cdef_filter_4x4_8bpc_avx2: 71.7 --------------------- before: cdef_filter_4x8_8bpc_avx2: 161.5 after: cdef_filter_4x8_8bpc_avx2: 116.3 --------------------- before: cdef_filter_8x8_8bpc_avx2: 221.8 after: cdef_filter_8x8_8bpc_avx2: 156.4 ------------------------------------------ --- src/x86/cdef.asm | 553 +++++++++++++++++++++++++++++++---------------- 1 file changed, 364 insertions(+), 189 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 46c2ddb60a..9f4aa1f4cd 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -129,16 +129,82 @@ CDEF_FILTER_JMP_TABLE 8x8 SECTION .text -%macro ACCUMULATE_TAP_BYTE 7 ; tap_offset, shift, mask, strength, mul_tap, w, h +%macro PREP_REGS 2 ; w, h + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] + mov dird, r6m + lea tableq, [cdef_filter_%1x%2_jmptable] + lea dirq, [tableq+dirq*2*4] +%if %1 == 4 + %if %2 == 4 + DEFINE_ARGS dst, stride, left, top, pri, sec, \ + table, dir, dirjmp, dst4, stride3, k + %else + DEFINE_ARGS dst, stride, left, top, pri, sec, \ + table, dir, dirjmp, dst4, dst8, stride3, k + lea dst8q, [dstq+strideq*8] + %endif +%else + DEFINE_ARGS dst, stride, h, top1, pri, sec, \ + table, dir, dirjmp, top2, dst4, stride3, k + mov hq, -8 + lea top1q, [top1q+strideq*0] + lea top2q, [top1q+strideq*1] +%endif + lea dst4q, [dstq+strideq*4] +%if %1 == 4 + lea stride3q, [strideq*3] +%endif +%endmacro + +%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max + mov kd, 1 + pxor m15, m15 ; sum +%if %2 == 8 + pxor m12, m12 + %if %1 == 4 + movd xm4, [dstq +strideq*0] + movd xm6, [dstq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm7, [dstq +stride3q ] + vinserti128 m4, [dst4q+strideq*0], 1 + vinserti128 m6, [dst4q+strideq*1], 1 + vinserti128 m5, [dst4q+strideq*2], 1 + vinserti128 m7, [dst4q+stride3q ], 1 + punpckldq m4, m6 + punpckldq m5, m7 + %else + movq xm4, [dstq+strideq*0] + movq xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + %endif + punpcklqdq m4, m5 +%else + movd xm4, [dstq+strideq*0] + movd xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + punpckldq m4, m5 +%endif +%if %3 == 1 + mova m7, m4 ; min + mova m8, m4 ; max +%endif +%endmacro + +%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength + ; mul_tap, w, h, clip ; load p0/p1 movsxd dirjmpq, [dirq+kq*4+%1*2*4] add dirjmpq, tableq call dirjmpq +%if %8 == 1 pmaxub m7, m5 pminub m8, m5 pmaxub m7, m6 pminub m8, m6 +%endif ; accumulate sum[m15] over p0/p1 %if %7 == 4 @@ -148,7 +214,7 @@ SECTION .text psubusb m5, m6, m5 por m9, m5 ; abs_diff_p01(p01 - px) pcmpeqb m5, m9 - por m5, m3 + por m5, %5 psignb m6, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 @@ -167,8 +233,8 @@ SECTION .text pcmpeqb m6, m11 punpckhbw m10, m9, m11 punpcklbw m9, m11 - por m5, m3 - por m11, m6, m3 + por m5, %5 + por m11, m6, %5 punpckhbw m6, m5, m11 punpcklbw m5, m11 psignb m11, %5, m6 @@ -188,7 +254,107 @@ SECTION .text %endif %endmacro -%macro ACCUMULATE_TAP_WORD 6 ; tap_offset, shift, mask, strength, mul_tap, w +%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip +%if %2 == 4 + %if %5 == 1 + punpcklbw m4, %3 + %endif + pcmpgtw %3, m15 + paddw m15, %3 + pmulhrsw m15, %4 + %if %5 == 0 + packsswb m15, m15 + paddb m4, m15 + %else + paddw m4, m15 + packuswb m4, m4 ; clip px in 
[0x0,0xff] + pminub m4, m7 + pmaxub m4, m8 + %endif + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + movd [dstq+strideq*2], xm5 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+stride3q ], xm5, 1 +%else + pcmpgtw m6, %3, m12 + pcmpgtw m5, %3, m15 + paddw m12, m6 + paddw m15, m5 + %if %5 == 1 + punpckhbw m5, m4, %3 + punpcklbw m4, %3 + %endif + pmulhrsw m12, %4 + pmulhrsw m15, %4 + %if %5 == 0 + packsswb m15, m12 + paddb m4, m15 + %else + paddw m5, m12 + paddw m4, m15 + packuswb m4, m5 ; clip px in [0x0,0xff] + pminub m4, m7 + pmaxub m4, m8 + %endif + vextracti128 xm5, m4, 1 + %if %1 == 4 + movd [dstq +strideq*0], xm4 + movd [dst4q+strideq*0], xm5 + pextrd [dstq +strideq*1], xm4, 1 + pextrd [dst4q+strideq*1], xm5, 1 + pextrd [dstq +strideq*2], xm4, 2 + pextrd [dst4q+strideq*2], xm5, 2 + pextrd [dstq +stride3q ], xm4, 3 + pextrd [dst4q+stride3q ], xm5, 3 + %else + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+stride3q ], xm5 + %endif +%endif +%endmacro + +%macro BORDER_PREP_REGS 2 ; w, h + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] + mov dird, r6m + lea dirq, [tableq+dirq*2+14] +%if %1*%2*2/mmsize > 1 + %if %1 == 4 + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k + %else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k + %endif + mov hd, %1*%2*2/mmsize +%else + DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k +%endif + lea stkq, [px] + pxor m11, m11 +%endmacro + +%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max + mov kd, 1 +%if %1 == 4 + movq xm4, [stkq+32*0] + movhps xm4, [stkq+32*1] + movq xm5, [stkq+32*2] + movhps xm5, [stkq+32*3] + vinserti128 m4, xm5, 1 +%else + mova xm4, [stkq+32*0] ; px + vinserti128 m4, [stkq+32*1], 1 +%endif + pxor m15, m15 ; sum +%if %3 == 1 + mova m7, m4 ; max + mova m8, m4 ; min +%endif +%endmacro + +%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength + ; mul_tap, w, clip ; load p0/p1 movsx offq, byte [dirq+kq+%1] ; off1 %if %6 == 4 @@ -212,6 +378,7 @@ SECTION .text movu xm6, [stkq+offq*2+32*0] ; p1 vinserti128 m6, [stkq+offq*2+32*1], 1 %endif +%if %7 == 1 ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. 
; use signed max and unsigned min to remove them @@ -219,6 +386,7 @@ SECTION .text pminuw m8, m5 ; min after p0 pmaxsw m7, m6 ; max after p1 pminuw m8, m6 ; min after p1 +%endif ; accumulate sum[m15] over p0/p1 ; calculate difference before converting @@ -243,6 +411,28 @@ SECTION .text paddw m15, m5 %endmacro +%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip + pcmpgtw m9, m11, m15 + paddw m15, m9 + pmulhrsw m15, %2 + paddw m4, m15 +%if %3 == 1 + pminsw m4, m7 + pmaxsw m4, m8 +%endif + packuswb m4, m4 + vextracti128 xm5, m4, 1 +%if %1 == 4 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+stride3q], xm5, 1 +%else + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 +%endif +%endmacro + %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ @@ -304,21 +494,24 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ %endif DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping + mov dampingd, r7m + xor zerod, zerod movifnidn prid, prim -%if UNIX64 + sub dampingd, 31 + movifnidn secdmpd, secdmpm + or prid, 0 + jz .sec_only movd xm0, prid - movd xm1, secdmpd -%endif - mov dampingd, r7m lzcnt pridmpd, prid - lzcnt secdmpd, secdmpm - sub dampingd, 31 - xor zerod, zerod add pridmpd, dampingd cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + or secdmpd, 0 + jz .pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd add secdmpd, dampingd cmovs secdmpd, zerod - mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp @@ -328,132 +521,29 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir -%if UNIX64 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength -%else - vpbroadcastb m0, prim - vpbroadcastb m1, secm -%endif and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps - ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - mov dird, r6m - lea tableq, [cdef_filter_%1x%2_jmptable] - lea dirq, [tableq+dirq*2*4] -%if %1 == 4 - %if %2 == 4 - DEFINE_ARGS dst, stride, left, top, pri, sec, \ - table, dir, dirjmp, dst4, stride3, k - %else - DEFINE_ARGS dst, stride, left, top, pri, sec, \ - table, dir, dirjmp, dst4, dst8, stride3, k - lea dst8q, [dstq+strideq*8] - %endif -%else - DEFINE_ARGS dst, stride, h, top1, pri, sec, \ - table, dir, dirjmp, top2, dst4, stride3, k - mov hq, -8 - lea top1q, [top1q+strideq*0] - lea top2q, [top1q+strideq*1] -%endif - lea dst4q, [dstq+strideq*4] -%if %1 == 4 - lea stride3q, [strideq*3] -%endif + PREP_REGS %1, %2 %if %1*%2 > mmsize .v_loop: %endif - mov kd, 1 - pxor m15, m15 ; sum -%if %2 == 8 - pxor m12, m12 - %if %1 == 4 - movd xm4, [dstq +strideq*0] - movd xm6, [dstq +strideq*1] - movd xm5, [dstq +strideq*2] - movd xm7, [dstq +stride3q ] - vinserti128 m4, [dst4q+strideq*0], 1 - vinserti128 m6, [dst4q+strideq*1], 1 - vinserti128 m5, [dst4q+strideq*2], 1 - vinserti128 m7, [dst4q+stride3q ], 1 - punpckldq m4, m6 - punpckldq m5, m7 - %else - movq xm4, [dstq+strideq*0] - movq xm5, [dstq+strideq*1] - vinserti128 m4, [dstq+strideq*2], 1 - vinserti128 m5, [dstq+stride3q ], 1 - %endif - punpcklqdq m4, m5 -%else - movd xm4, [dstq+strideq*0] - movd xm5, [dstq+strideq*1] - vinserti128 m4, [dstq+strideq*2], 1 - vinserti128 m5, [dstq+stride3q ], 1 - punpckldq m4, m5 -%endif - mova m7, m4 ; min - mova m8, m4 ; max + LOAD_BLOCK %1, 
%2, 1 .k_loop: - vpbroadcastb m2, [priq+kq] ; pri_taps - vpbroadcastb m3, [secq+kq] ; sec_taps - - ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 - ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 - ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 dec kq jge .k_loop vpbroadcastd m10, [pw_2048] pxor m9, m9 -%if %2 == 4 - punpcklbw m4, m9 - pcmpgtw m9, m15 - paddw m15, m9 - pmulhrsw m15, m10 - paddw m4, m15 - packuswb m4, m4 ; clip px in [0x0,0xff] - pminub m4, m7 - pmaxub m4, m8 - vextracti128 xm5, m4, 1 - movd [dstq+strideq*0], xm4 - movd [dstq+strideq*2], xm5 - pextrd [dstq+strideq*1], xm4, 1 - pextrd [dstq+stride3q ], xm5, 1 -%else - pcmpgtw m6, m9, m12 - pcmpgtw m5, m9, m15 - paddw m12, m6 - paddw m15, m5 - punpckhbw m5, m4, m9 - punpcklbw m4, m9 - pmulhrsw m12, m10 - pmulhrsw m15, m10 - paddw m5, m12 - paddw m4, m15 - packuswb m4, m5 ; clip px in [0x0,0xff] - pminub m4, m7 - pmaxub m4, m8 - vextracti128 xm5, m4, 1 - %if %1 == 4 - movd [dstq +strideq*0], xm4 - movd [dst4q+strideq*0], xm5 - pextrd [dstq +strideq*1], xm4, 1 - pextrd [dst4q+strideq*1], xm5, 1 - pextrd [dstq +strideq*2], xm4, 2 - pextrd [dst4q+strideq*2], xm5, 2 - pextrd [dstq +stride3q ], xm4, 3 - pextrd [dst4q+stride3q ], xm5, 3 - %else - movq [dstq+strideq*0], xm4 - movq [dstq+strideq*2], xm5 - movhps [dstq+strideq*1], xm4 - movhps [dstq+stride3q ], xm5 - %endif -%endif + ADJUST_PIXEL %1, %2, m9, m10, 1 %if %1*%2 > mmsize mov dstq, dst4q lea top1q, [rsp+0x90] @@ -464,6 +554,76 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ %endif RET +.pri_only: + DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, pri, _, table, dir + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + PREP_REGS %1, %2 + vpbroadcastd m3, [pw_2048] + pxor m1, m1 +%if %1*%2 > mmsize +.pri_v_loop: +%endif + LOAD_BLOCK %1, %2 +.pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 + dec kq + jge .pri_k_loop + ADJUST_PIXEL %1, %2, m1, m3 +%if %1*%2 > mmsize + mov dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] + lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .pri_v_loop +%endif + RET + +.sec_only: + DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, left, top, _, secdmp, table + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, _, sec, table, dir + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + PREP_REGS %1, %2 + vpbroadcastd m2, [pw_2048] + pxor m0, m0 +%if %1*%2 > mmsize +.sec_v_loop: +%endif + LOAD_BLOCK %1, %2 +.sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + dec kq + jge .sec_k_loop + ADJUST_PIXEL %1, %2, m0, m2 +%if %1*%2 > mmsize + mov 
dstq, dst4q + lea top1q, [rsp+0x90] + lea top2q, [rsp+0xA0] + lea dst4q, [dst4q+strideq*4] + add hq, 4 + jl .sec_v_loop +%endif + RET + .d0k0: %if %1 == 4 %if %2 == 4 @@ -1343,21 +1503,24 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] - movifnidn prid, prim mov dampingd, r7m - lzcnt pridmpd, prid -%if UNIX64 - movd xm0, prid - movd xm1, secdmpd -%endif - lzcnt secdmpd, secdmpm - sub dampingd, 31 xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + or prid, 0 + jz .border_sec_only + movd xm0, prid + lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + or secdmpd, 0 + jz .border_pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd add secdmpd, dampingd cmovs secdmpd, zerod - mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3 @@ -1367,87 +1530,99 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3 -%if UNIX64 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength -%else - vpbroadcastb m0, prim - vpbroadcastb m1, secm -%endif and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps - ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] - mov dird, r6m - lea dirq, [tableq+dirq*2+14] -%if %1*%2*2/mmsize > 1 - %if %1 == 4 - DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k - %else - DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k - %endif - mov hd, %1*%2*2/mmsize -%else - DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k -%endif - lea stkq, [px] - pxor m11, m11 + BORDER_PREP_REGS %1, %2 %if %1*%2*2/mmsize > 1 .border_v_loop: %endif - mov kd, 1 -%if %1 == 4 - movq xm4, [stkq+32*0] - movhps xm4, [stkq+32*1] - movq xm5, [stkq+32*2] - movhps xm5, [stkq+32*3] - vinserti128 m4, xm5, 1 -%else - mova xm4, [stkq+32*0] ; px - vinserti128 m4, [stkq+32*1], 1 -%endif - pxor m15, m15 ; sum - mova m7, m4 ; max - mova m8, m4 ; min + BORDER_LOAD_BLOCK %1, %2, 1 .border_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps - - ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 - ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 - ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 - + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 dec kq jge .border_k_loop vpbroadcastd m10, [pw_2048] - pcmpgtw m9, m11, m15 - paddw m15, m9 - pmulhrsw m15, m10 - paddw m4, m15 - pminsw m4, m7 - pmaxsw m4, m8 - packuswb m4, m4 - vextracti128 xm5, m4, 1 -%if %1 == 4 - movd [dstq+strideq*0], xm4 - pextrd [dstq+strideq*1], xm4, 1 - movd [dstq+strideq*2], xm5 - pextrd [dstq+stride3q], xm5, 1 -%else - movq [dstq+strideq*0], xm4 - movq [dstq+strideq*1], xm5 + BORDER_ADJUST_PIXEL %1, m10, 1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_v_loop %endif + RET +.border_pri_only: + DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + DEFINE_ARGS dst, stride, dir, table, pri, _, stride3 + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + BORDER_PREP_REGS 
%1, %2 + vpbroadcastd m1, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_pri_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 + dec kq + jge .border_pri_k_loop + BORDER_ADJUST_PIXEL %1, m1 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd - jg .border_v_loop + jg .border_pri_v_loop %endif + RET +.border_sec_only: + DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + cmovs secdmpd, zerod + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + DEFINE_ARGS dst, stride, dir, table, _, sec, stride3 + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + BORDER_PREP_REGS %1, %2 + vpbroadcastd m0, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_sec_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 + dec kq + jge .border_sec_k_loop + BORDER_ADJUST_PIXEL %1, m0 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_sec_v_loop +%endif RET %endmacro From 7d1284608ca640709e0fb8c6d32e18a20a09898d Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 4 Mar 2020 23:05:45 +0100 Subject: [PATCH 066/155] x86: Fix crash in AVX2 cdef_filter with <32-byte stack alignment --- src/x86/cdef.asm | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 9f4aa1f4cd..6ef8326cee 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -447,7 +447,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ PUSH r11 %if %2 == 4 %assign regs_used 12 - %if WIN64 + %if STACK_ALIGNMENT < 32 PUSH r%+regs_used %assign regs_used regs_used+1 %endif @@ -459,24 +459,24 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movu [rsp+0x10], m0 movu [rsp+0x28], m1 movu [rsp+0x40], m2 -%else +%elif %1 == 4 PUSH r12 - %if %1 == 4 - %assign regs_used 13 - %if WIN64 + %assign regs_used 13 + %if STACK_ALIGNMENT < 32 PUSH r%+regs_used %assign regs_used regs_used+1 - %endif + %endif ALLOC_STACK 8*2+%1*%2*1, 16 pmovzxwd m0, [leftq] mova [rsp+0x10], m0 - %else +%else + PUSH r12 PUSH r13 - %assign regs_used 14 - %if WIN64 + %assign regs_used 14 + %if STACK_ALIGNMENT < 32 PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif + %assign regs_used regs_used+1 + %endif ALLOC_STACK 8*2+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] @@ -490,7 +490,6 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ mova [rsp+0x50], m2 mova [rsp+0x70], m3 mova [rsp+0x90], m4 - %endif %endif DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping @@ -1256,7 +1255,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ %else %assign regs_used 9 %endif -%if WIN64 +%if STACK_ALIGNMENT < 32 PUSH r%+regs_used %assign regs_used regs_used+1 %endif From 5c9295f4d3b2423bafa49b213f2c4f17d65ae24b Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Thu, 26 Mar 2020 21:44:45 +0100 Subject: [PATCH 067/155] meson/x86: add option to disable AVX-512 asm Allows building with nasm < 2.14. 
--- src/x86/cdef.asm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 6ef8326cee..0052557504 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -1850,6 +1850,8 @@ DECLARE_REG_TMP 8, 5 ; 4e 4f 40 41 42 43 44 45 ; 5e 5f 50 51 52 53 54 55 +%if HAVE_AVX512ICL + INIT_ZMM avx512icl cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge %define base r7-edge_mask @@ -1999,4 +2001,6 @@ ALIGN function_align vpdpbusd m0, m5, m9 ret +%endif ; HAVE_AVX512ICL + %endif ; ARCH_X86_64 From d9b83f231d8710143c6a03495ae365694caeeb5b Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 18 Feb 2020 14:41:22 +0100 Subject: [PATCH 068/155] x86: Add cdef_filter_{4,8}x8 AVX-512 (Ice Lake) asm cdef_filter_4x8_8bpc_avx2: 54.0 cdef_filter_4x8_8bpc_avx512icl: 35.5 => +52.1% cdef_filter_8x8_8bpc_avx2: 71.0 cdef_filter_8x8_8bpc_avx512icl: 49.0 => +44.9% --- src/x86/cdef.asm | 632 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 630 insertions(+), 2 deletions(-) diff --git a/src/x86/cdef.asm b/src/x86/cdef.asm index 0052557504..bd2dd8bb5c 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef.asm @@ -68,6 +68,21 @@ lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 +lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 + db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 + db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 + db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 +pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 +lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 + db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 + db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 +lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 + db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 + db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 + db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 @@ -82,7 +97,13 @@ gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 + times 16 db 0 ; realign (introduced by cdef_dirs) +end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 sec_tap: db 32, 32, 16, 16 pd_268435568: dd 268435568 @@ -1892,7 +1913,7 @@ cglobal 
cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, dampin psubb m2, m1, m6 lzcnt r6d, prid vpsubb m2{k1}, m6, m1 ; abs(diff) - vpbroadcastb m4, prim + vpbroadcastb m4, prid and prid, 1 vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift movifnidn t1d, secm @@ -2001,6 +2022,613 @@ ALIGN function_align vpdpbusd m0, m5, m9 ret -%endif ; HAVE_AVX512ICL +DECLARE_REG_TMP 2, 7 + +; lut top lut bottom +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 +; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 +; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 + +cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + vpbroadcastd ym21, strided + mov r6d, edgem + lea r8, [edge_mask] + movq xm1, [topq+strideq*0-2] + pmulld ym21, [base+pd_01234567] + kxnorb k1, k1, k1 + movq xm2, [topq+strideq*1-2] + vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 + mova m14, [base+lut_perm_4x8a] + movu m15, [base+lut_perm_4x8b] + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r7, [dstq+strideq*8-2] + vinserti32x4 ym1, [r7+strideq*0], 1 + vinserti32x4 ym2, [r7+strideq*1], 1 +.main: + punpcklqdq ym1, ym2 + vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ + movifnidn prid, prim + mov t0d, dirm + mova m16, [base+px_idx] + mov r3d, dampingm + vpermi2b m14, m0, m1 ; lut top + vpermi2b m15, m0, m1 ; lut bottom + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m20, m20 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m2, m16, m14 ; pxt + vpermb m3, m16, m15 ; pxb + mova m1, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 + vpermb m5, m6, m15 ; pNb +%macro CDEF_FILTER_4x8_PRI 0 + vpcmpub k1, m2, m4, 6 ; pxt > pNt + vpcmpub k2, m3, m5, 6 ; pxb > pNb + psubb m6, m4, m2 + psubb m7, m5, m3 + lzcnt r6d, prid + vpsubb m6{k1}, m2, m4 ; abs(diff_top) + vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) + vpbroadcastb m13, prid + vpbroadcastq m9, [r3+r6*8] + and prid, 1 + vpbroadcastd m11, [base+pri_tap+priq*4] + vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift + vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift + mova m10, m11 + movifnidn t1d, secm + vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) + vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) + psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) + psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) + pminub m6, m12 + pminub m7, m13 + vpdpbusd m0, m6, m10 ; sum top + vpdpbusd m1, m7, m11 ; sum bottom +%endmacro + CDEF_FILTER_4x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m10, m4, m2 + pminub m12, m6, m8 + pminub m11, m5, m3 + pminub m13, m7, m9 + pmaxub m4, m2 + pmaxub m6, m8 + pmaxub m5, m3 + pmaxub m7, m9 + pminub m10, m12 + pminub m11, m13 + pmaxub m4, m6 + pmaxub m5, m7 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + kxnorb k2, k2, k2 ; hw lw + vpshrdd m12, m0, m1, 16 ; m1lw m0hw + vpshrdd m6, m10, m11, 16 ; m11lw m10hw + vpshrdd m8, m4, m5, 16 ; m5lw m4hw + vpblendmw m7{k1}, m10, m11 ; m11hw m10lw + vpblendmw m9{k1}, m4, m5 ; m5hw m4lw + vpblendmw m4{k1}, m0, m12 ; m1lw m0lw + vpblendmw m5{k1}, 
m12, m1 ; m1hw m0hw + vpshrdd m2, m3, 16 + pminub m6, m7 + pmaxub m8, m9 + mova ym14, [base+end_perm] + vpcmpw k1, m4, m20, 1 + vpshldw m2, m5, 8 + pslldq m7, m6, 1 + pslldq m9, m8, 1 + psubw m5, m20, m4 + paddusw m0, m4, m2 ; clip >0xff + pminub m6, m7 + pmaxub m8, m9 + psubusw m0{k1}, m2, m5 ; clip <0x00 + pmaxub m0, m6 + pminub m0, m8 + vpermb m0, m14, m0 + vpscatterdd [dstq+ym21]{k2}, ym0 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova ym4, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m3, m1, 8 + paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m3 + pslld m0, 16 + vpshrdd m0, m1, 16 + vpermb m0, m4, m0 ; output in bits 8-15 of each word + vpscatterdd [dstq+ym21]{k1}, ym0 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t1d, r6d + or r6d, 8 ; top 4x4 has bottom + or t1d, 4 ; bottom 4x4 has top + vpbroadcastq m17, [base+edge_mask+r6*8] + vpbroadcastq m18, [base+edge_mask+t1*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m17, m6 ; index in-range + vpshufbitqmb k2, m18, m6 + mova m4, m2 + mova m5, m3 + vpermb m4{k1}, m6, m14 + vpermb m5{k2}, m6, m15 + CDEF_FILTER_4x8_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m17, m10 + vpshufbitqmb k2, m18, m10 + vpshufbitqmb k3, m17, m11 + vpshufbitqmb k4, m18, m11 + mova m6, m2 + mova m7, m3 + mova m8, m2 + mova m9, m3 + vpermb m6{k1}, m10, m14 + vpermb m7{k2}, m10, m15 + vpermb m8{k3}, m11, m14 + vpermb m9{k4}, m11, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 + vpermb m7, m8, m15 ; pNb + vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 + vpermb m9, m9, m15 ; pNb +.sec_main: + vpbroadcastb m18, t1d + lzcnt t1d, t1d + vpcmpub k1, m2, m6, 6 + vpcmpub k2, m3, m7, 6 + vpcmpub k3, m2, m8, 6 + vpcmpub k4, m3, m9, 6 + vpbroadcastq m17, [r3+t1*8] + psubb m10, m6, m2 + psubb m11, m7, m3 + psubb m12, m8, m2 + psubb m13, m9, m3 + vpsubb m10{k1}, m2, m6 ; abs(dt0) + vpsubb m11{k2}, m3, m7 ; abs(db0) + vpsubb m12{k3}, m2, m8 ; abs(dt1) + vpsubb m13{k4}, m3, m9 ; abs(db1) + vpbroadcastd m19, [base+sec_tap] + gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift + gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift + gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift + gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift + psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) + psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) + psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) + psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) + pminub m10, m14 + pminub m11, m15 + pminub m12, m16 + pminub m13, m17 + mova m14, m19 + mova m15, m19 + mova m16, m19 + vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) + vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) + vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) + vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) + vpdpbusd m0, m10, m14 + vpdpbusd m1, m11, m15 + vpdpbusd m0, m12, m16 + vpdpbusd m1, m13, m19 + ret +; lut tl lut tr +; t0 t1 t2 t3 t4 t5 t6 
t7 t6 t7 t8 t9 ta tb tc td +; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD +; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b +; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; lut bl lut br +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b +; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b +; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b +; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b + +cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + mov r6d, edgem + lea r10, [dstq+strideq*4-2] + movu xmm0, [topq+strideq*0-2] + movu xmm1, [dstq+strideq*2-2] + movu xmm2, [r10 +strideq*2 ] + lea r8, [edge_mask] + lea r9, [strideq*3] + pmovzxwq m10, [leftq-4] + vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 + vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 + vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 + lea r7, [r10 +strideq*4 ] + pmovzxwq m11, [leftq+4] + vinserti32x4 m0, [dstq+strideq*0-2], 2 + vinserti32x4 m1, [r10 +strideq*0 ], 2 + mova m12, [base+lut_perm_8x8a] + movu m13, [base+lut_perm_8x8b] + vinserti32x4 m0, [dstq+strideq*1-2], 3 + vinserti32x4 m1, [r10 +strideq*1 ], 3 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m2, [r7 +strideq*0], 2 + vinserti32x4 m2, [r7 +strideq*1], 3 +.main: + mov t1d, 0x11111100 + mova m14, m12 + mova m15, m13 + kmovd k1, t1d + kshiftrd k2, k1, 8 + movifnidn prid, prim + mov t0d, dirm + mova m30, [base+px_idx] + mov r3d, dampingm + vpermi2b m12, m0, m1 ; lut tl + vpermi2b m14, m1, m2 ; lut bl + vpermi2b m13, m0, m1 ; lut tr + vpermi2b m15, m1, m2 ; lut br + vpblendmw m12{k1}, m12, m10 + vpblendmw m14{k2}, m14, m11 + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m31, m31 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m4, m30, m12 ; pxtl + vpermb m5, m30, m13 ; pxtr + vpermb m6, m30, m14 ; pxbl + vpermb m7, m30, m15 ; pxbr + mova m1, m0 + mova m2, m0 + mova m3, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 + vpermb m9, m11, m13 ; pNtr + vpermb m10, m11, m14 ; pNbl + vpermb m11, m11, m15 ; pNbr +%macro CDEF_FILTER_8x8_PRI 0 + vpcmpub k1, m4, m8, 6 ; pxtl > pNtl + vpcmpub k2, m5, m9, 6 ; pxtr > pNtr + vpcmpub k3, m6, m10, 6 ; pxbl > pNbl + vpcmpub k4, m7, m11, 6 ; pxbr > pNbr + psubb m16, m8, m4 + psubb m17, m9, m5 + psubb m18, m10, m6 + psubb m19, m11, m7 + lzcnt r6d, prid + vpsubb m16{k1}, m4, m8 ; abs(diff_tl) + vpsubb m17{k2}, m5, m9 ; abs(diff_tr) + vpsubb m18{k3}, m6, m10 ; abs(diff_bl) + vpsubb m19{k4}, m7, m11 ; abs(diff_br) + vpbroadcastq m28, [r3+r6*8] + vpbroadcastb m29, prid + and prid, 1 + vpbroadcastd m27, [base+pri_tap+priq*4] + vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift + vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift + vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift + vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift + mova m24, m27 + mova m25, m27 + mova m26, m27 + movifnidn t1d, secm + vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m25{k2}, 
m31, m27 ; apply_sign(pri_tap_tr) + vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) + psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) + psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) + psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) + psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) + pminub m16, m20 + pminub m17, m21 + pminub m18, m22 + pminub m19, m23 + vpdpbusd m0, m16, m24 ; sum tl + vpdpbusd m1, m17, m25 ; sum tr + vpdpbusd m2, m18, m26 ; sum bl + vpdpbusd m3, m19, m27 ; sum br +%endmacro + CDEF_FILTER_8x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m20, m8, m4 + pminub m24, m12, m16 + pminub m21, m9, m5 + pminub m25, m13, m17 + pminub m22, m10, m6 + pminub m26, m14, m18 + pminub m23, m11, m7 + pminub m27, m15, m19 + pmaxub m8, m4 + pmaxub m12, m16 + pmaxub m9, m5 + pmaxub m13, m17 + pmaxub m10, m6 + pmaxub m14, m18 + pmaxub m11, m7 + pmaxub m15, m19 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + pmaxub m8, m12 + pmaxub m9, m13 + pmaxub m10, m14 + pmaxub m11, m15 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + vpshrdd m24, m0, m1, 16 + vpshrdd m25, m2, m3, 16 + vpshrdd m12, m20, m21, 16 + vpshrdd m14, m22, m23, 16 + vpshrdd m16, m8, m9, 16 + vpshrdd m18, m10, m11, 16 + vpblendmw m13{k1}, m20, m21 + vpblendmw m15{k1}, m22, m23 + vpblendmw m17{k1}, m8, m9 + vpblendmw m19{k1}, m10, m11 + vpblendmw m20{k1}, m0, m24 + vpblendmw m21{k1}, m24, m1 + vpblendmw m22{k1}, m2, m25 + vpblendmw m23{k1}, m25, m3 + vpshrdd m4, m5, 16 + vpshrdd m6, m7, 16 + pminub m12, m13 + pminub m14, m15 + pmaxub m16, m17 + pmaxub m18, m19 + mova m8, [base+end_perm_w8clip] + vpcmpw k2, m20, m31, 1 + vpcmpw k3, m22, m31, 1 + vpshldw m4, m21, 8 + vpshldw m6, m23, 8 + kunpckdq k1, k1, k1 + kxnorb k4, k4, k4 + vpshrdw m11, m12, m14, 8 + vpshrdw m15, m16, m18, 8 + vpblendmb m13{k1}, m12, m14 + vpblendmb m17{k1}, m16, m18 + psubw m21, m31, m20 + psubw m23, m31, m22 + paddusw m0, m20, m4 ; clip >0xff + paddusw m1, m22, m6 + pminub m11, m13 + pmaxub m15, m17 + psubusw m0{k2}, m4, m21 ; clip <0x00 + psubusw m1{k3}, m6, m23 + psrlw m0, 8 + vmovdqu8 m0{k1}, m1 + pmaxub m0, m11 + pminub m0, m15 + vpermb m0, m8, m0 + add r10, 2 + vextracti32x4 xm1, m0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movq [r10 +strideq*0], xm2 + movq [r10 +strideq*2], xm3 + movhps [dstq+strideq*1], xm0 + movhps [dstq+r9 ], xm1 + movhps [r10 +strideq*1], xm2 + movhps [r10 +r9 ], xm3 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova xm8, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m5, m1, 8 + vpshldd m6, m2, 8 + vpshldd m7, m3, 8 + paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + vpermb m0, m8, m0 + vpermb m1, m8, m1 + vpermb m2, m8, m2 + vpermb m3, m8, m3 + add r10, 2 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm0 + movq [r10 +strideq*0], xm5 + movq [r10 +strideq*2], xm2 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r9 ], xm0 + movhps [r10 +strideq*1], xm5 + movhps [r10 +r9 ], xm2 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t0d, r6d + mov t1d, r6d + or t0d, 0xA ; top-left 4x4 has 
bottom and right + or t1d, 0x9 ; top-right 4x4 has bottom and left + vpbroadcastq m26, [base+edge_mask+t0*8] + vpbroadcastq m27, [base+edge_mask+t1*8] + mov t1d, r6d + or r6d, 0x6 ; bottom-left 4x4 has top and right + or t1d, 0x5 ; bottom-right 4x4 has top and left + vpbroadcastq m28, [base+edge_mask+r6*8] + vpbroadcastq m29, [base+edge_mask+t1*8] + mov t0d, dirm + test prid, prid + jz .mask_edges_sec_only + vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m26, m20 ; index in-range + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m8, m4 + mova m9, m5 + mova m10, m6 + mova m11, m7 + vpermb m8{k1}, m20, m12 + vpermb m9{k2}, m20, m13 + vpermb m10{k3}, m20, m14 + vpermb m11{k4}, m20, m15 + mova [rsp+0x00], m26 + mova [rsp+0x40], m27 + mova [rsp+0x80], m28 + mova [rsp+0xC0], m29 + CDEF_FILTER_8x8_PRI + test t1d, t1d + jz .end_no_clip + mova m26, [rsp+0x00] + mova m27, [rsp+0x40] + mova m28, [rsp+0x80] + mova m29, [rsp+0xC0] + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m26, m20 + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m16, m4 + mova m17, m5 + mova m18, m6 + mova m19, m7 + vpermb m16{k1}, m20, m12 + vpermb m17{k2}, m20, m13 + vpermb m18{k3}, m20, m14 + vpermb m19{k4}, m20, m15 + vpshufbitqmb k1, m26, m21 + vpshufbitqmb k2, m27, m21 + vpshufbitqmb k3, m28, m21 + vpshufbitqmb k4, m29, m21 + vpermb m12, m21, m12 + vpermb m13, m21, m13 + vpermb m14, m21, m14 + vpermb m15, m21, m15 + vpblendmb m12{k1}, m4, m12 + vpblendmb m13{k2}, m5, m13 + vpblendmb m14{k3}, m6, m14 + vpblendmb m15{k4}, m7, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 + vpermb m17, m20, m13 ; pNtr + vpermb m18, m20, m14 ; pNbl + vpermb m19, m20, m15 ; pNbr + vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 + vpermb m13, m21, m13 ; pNtr + vpermb m14, m21, m14 ; pNbl + vpermb m15, m21, m15 ; pNbr +.sec_main: +%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants + vpcmpub k1, m4, %1, 6 + vpcmpub k2, m5, %2, 6 + vpcmpub k3, m6, %3, 6 + vpcmpub k4, m7, %4, 6 + psubb m20, %1, m4 + psubb m21, %2, m5 + psubb m22, %3, m6 + psubb m23, %4, m7 +%if %5 + vpbroadcastb m28, t1d + lzcnt t1d, t1d + vpbroadcastq m29, [r3+t1*8] +%endif + vpsubb m20{k1}, m4, %1 + vpsubb m21{k2}, m5, %2 + vpsubb m22{k3}, m6, %3 + vpsubb m23{k4}, m7, %4 + gf2p8affineqb m24, m20, m29, 0 + gf2p8affineqb m25, m21, m29, 0 + gf2p8affineqb m26, m22, m29, 0 + gf2p8affineqb m27, m23, m29, 0 +%if %5 + vpbroadcastd m30, [base+sec_tap] +%endif + psubusb m24, m28, m24 + psubusb m25, m28, m25 + psubusb m26, m28, m26 + psubusb m27, m28, m27 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + mova m24, m30 + mova m25, m30 + mova m26, m30 + mova m27, m30 + vpsubb m24{k1}, m31, m30 + vpsubb m25{k2}, m31, m30 + vpsubb m26{k3}, m31, m30 + vpsubb m27{k4}, m31, m30 + vpdpbusd m0, m20, m24 + vpdpbusd m1, m21, m25 + vpdpbusd m2, m22, m26 + vpdpbusd m3, m23, m27 +%endmacro + CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 + CDEF_FILTER_8x8_SEC m12, m13, m14, m15 + ret + +%endif ; HAVE_AVX512ICL %endif ; ARCH_X86_64 From f9b8572c7e2c765658749b6cdc08714064cf0d97 Mon Sep 17 00:00:00 2001 From: Victorien Le Couviour--Tuffet Date: Tue, 7 Apr 2020 15:51:36 +0200 
Subject: [PATCH 069/155] x86: Split AVX2 / AVX-512 CDEF into dedicated files --- src/x86/{cdef.asm => cdef_avx2.asm} | 843 +-------------------------- src/x86/cdef_avx512.asm | 868 ++++++++++++++++++++++++++++ 2 files changed, 872 insertions(+), 839 deletions(-) rename src/x86/{cdef.asm => cdef_avx2.asm} (65%) create mode 100644 src/x86/cdef_avx512.asm diff --git a/src/x86/cdef.asm b/src/x86/cdef_avx2.asm similarity index 65% rename from src/x86/cdef.asm rename to src/x86/cdef_avx2.asm index bd2dd8bb5c..0eafdb1eff 100644 --- a/src/x86/cdef.asm +++ b/src/x86/cdef_avx2.asm @@ -28,22 +28,6 @@ %if ARCH_X86_64 -%macro DUP4 1-* - %rep %0 - times 4 db %1 - %rotate 1 - %endrep -%endmacro - -%macro DIRS 16 ; cdef_directions[] - %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 - ; masking away unused bits allows us to use a single vpaddd {1to16} - ; instruction instead of having to do vpbroadcastd + paddb - db %13 & 0x3f, -%13 & 0x3f - %rotate 1 - %endrep -%endmacro - %macro JMP_TABLE 2-* %xdefine %1_jmptable %%table %xdefine %%base mangle(private_prefix %+ _%1_avx2) @@ -62,51 +46,9 @@ JMP_TABLE cdef_filter_%1, \ d0k0, d0k1, d1k0, d1k1 %endmacro -SECTION_RODATA 64 - -lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 - db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 - db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 - db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 -lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 - db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 -lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 - db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 - db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 - db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 -pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 -lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 - db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 - db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 -lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 - db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 - db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 - db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 -edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 - dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 - dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 - dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 - dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 - dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 - dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 - dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 -px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 -cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 -gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 - dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 - dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 - dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 - times 16 db 0 ; realign (introduced by cdef_dirs) -end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 - db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 - db 1, 5, 9, 13, 3, 7, 11, 15, 
17, 21, 25, 29, 19, 23, 27, 31 - db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 -end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 - db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 -pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 -sec_tap: db 32, 32, 16, 16 -pd_268435568: dd 268435568 +SECTION_RODATA 32 + +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 dd 0x80, 0x00, 0x00 blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 @@ -118,7 +60,6 @@ blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 dd 0x0000, 0x0000 blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 -pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 @@ -747,7 +688,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movu xm9, [dstq+strideq*1-1] vinserti128 m5, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 - mova m10, [blend_8x8_0+16] + movu m10, [blend_8x8_0+16] punpcklqdq m6, m5, m9 vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10 psrldq m5, 2 @@ -1855,780 +1796,4 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 movd [varq], xm2 RET -%if WIN64 -DECLARE_REG_TMP 5, 6 -%else -DECLARE_REG_TMP 8, 5 -%endif - -; lut: -; t0 t1 t2 t3 t4 t5 t6 t7 -; T0 T1 T2 T3 T4 T5 T6 T7 -; L0 L1 00 01 02 03 04 05 -; L2 L3 10 11 12 13 14 15 -; L4 L5 20 21 22 23 24 25 -; L6 L7 30 31 32 33 34 35 -; 4e 4f 40 41 42 43 44 45 -; 5e 5f 50 51 52 53 54 55 - -%if HAVE_AVX512ICL - -INIT_ZMM avx512icl -cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge -%define base r7-edge_mask - movq xmm0, [dstq+strideq*0] - movhps xmm0, [dstq+strideq*1] - lea r7, [edge_mask] - movq xmm1, [topq+strideq*0-2] - movhps xmm1, [topq+strideq*1-2] - mov r6d, edgem - vinserti32x4 ym0, ymm0, [leftq], 1 - lea r2, [strideq*3] - vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 - mova m5, [base+lut_perm_4x4] - vinserti32x4 m0, [dstq+r2], 2 - test r6b, 0x08 ; avoid buffer overread - jz .main - lea r3, [dstq+strideq*4-4] - vinserti32x4 m1, [r3+strideq*0], 2 - vinserti32x4 m0, [r3+strideq*1], 3 -.main: - movifnidn prid, prim - mov t0d, dirm - mova m3, [base+px_idx] - mov r3d, dampingm - vpermi2b m5, m0, m1 ; lut - vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) - pxor m7, m7 - lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 - vpermb m6, m3, m5 ; px - cmp r6d, 0x0f - jne .mask_edges ; mask edges only if required - test prid, prid - jz .sec_only - vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir - vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 -%macro CDEF_FILTER_4x4_PRI 0 - vpcmpub k1, m6, m1, 6 ; px > pN - psubb m2, m1, m6 - lzcnt r6d, prid - vpsubb m2{k1}, m6, m1 ; abs(diff) - vpbroadcastb m4, prid - and prid, 1 - vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift - movifnidn t1d, secm - vpbroadcastd m10, [base+pri_tap+priq*4] - vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) - psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) - pminub m2, m4 - vpdpbusd m0, m2, m10 ; sum -%endmacro - CDEF_FILTER_4x4_PRI - test t1d, t1d ; sec - jz .end_no_clip - call .sec -.end_clip: - pminub m4, m6, m1 - pmaxub m1, m6 - pminub m5, m2, m3 - pmaxub m2, m3 - pminub m4, m5 - pmaxub 
m2, m1 - psrldq m1, m4, 2 - psrldq m3, m2, 2 - pminub m1, m4 - vpcmpw k1, m0, m7, 1 - vpshldd m6, m0, 8 - pmaxub m2, m3 - pslldq m3, m1, 1 - psubw m7, m0 - paddusw m0, m6 ; clip >0xff - vpsubusw m0{k1}, m6, m7 ; clip <0x00 - pslldq m4, m2, 1 - pminub m1, m3 - pmaxub m2, m4 - pmaxub m0, m1 - pminub m0, m2 - jmp .end -.sec_only: - movifnidn t1d, secm - call .sec -.end_no_clip: - vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) - paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) -.end: - mova xm1, [base+end_perm] - vpermb m0, m1, m0 ; output in bits 8-15 of each dword - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r2 ], xm0, 3 - RET -.mask_edges_sec_only: - movifnidn t1d, secm - call .mask_edges_sec - jmp .end_no_clip -ALIGN function_align -.mask_edges: - vpbroadcastq m8, [base+edge_mask+r6*8] - test prid, prid - jz .mask_edges_sec_only - vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} - vpshufbitqmb k1, m8, m2 ; index in-range - mova m1, m6 - vpermb m1{k1}, m2, m5 - CDEF_FILTER_4x4_PRI - test t1d, t1d - jz .end_no_clip - call .mask_edges_sec - jmp .end_clip -.mask_edges_sec: - vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} - vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} - vpshufbitqmb k1, m8, m4 - mova m2, m6 - vpermb m2{k1}, m4, m5 - vpshufbitqmb k1, m8, m9 - mova m3, m6 - vpermb m3{k1}, m9, m5 - jmp .sec_main -ALIGN function_align -.sec: - vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 - vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 - vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 - vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 -.sec_main: - vpbroadcastd m8, [base+sec_tap] - vpcmpub k1, m6, m2, 6 - psubb m4, m2, m6 - vpbroadcastb m12, t1d - lzcnt t1d, t1d - vpsubb m4{k1}, m6, m2 - vpcmpub k2, m6, m3, 6 - vpbroadcastq m11, [r3+t1*8] - gf2p8affineqb m10, m4, m11, 0 - psubb m5, m3, m6 - mova m9, m8 - vpsubb m8{k1}, m7, m8 - psubusb m10, m12, m10 - vpsubb m5{k2}, m6, m3 - pminub m4, m10 - vpdpbusd m0, m4, m8 - gf2p8affineqb m11, m5, m11, 0 - vpsubb m9{k2}, m7, m9 - psubusb m12, m11 - pminub m5, m12 - vpdpbusd m0, m5, m9 - ret - -DECLARE_REG_TMP 2, 7 - -; lut top lut bottom -; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 -; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 -; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 -; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 -; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 -; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 -; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 -; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 - -cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ - pri, sec, dir, damping, edge -%define base r8-edge_mask - vpbroadcastd ym21, strided - mov r6d, edgem - lea r8, [edge_mask] - movq xm1, [topq+strideq*0-2] - pmulld ym21, [base+pd_01234567] - kxnorb k1, k1, k1 - movq xm2, [topq+strideq*1-2] - vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 - mova m14, [base+lut_perm_4x8a] - movu m15, [base+lut_perm_4x8b] - test r6b, 0x08 ; avoid buffer overread - jz .main - lea r7, [dstq+strideq*8-2] - vinserti32x4 ym1, [r7+strideq*0], 1 - vinserti32x4 ym2, [r7+strideq*1], 1 -.main: - punpcklqdq ym1, ym2 - vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ - movifnidn prid, prim - mov t0d, dirm - mova m16, [base+px_idx] - mov r3d, dampingm - vpermi2b m14, m0, m1 ; lut top - vpermi2b m15, m0, m1 ; lut bottom - vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) - pxor m20, m20 - lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 - 
vpermb m2, m16, m14 ; pxt - vpermb m3, m16, m15 ; pxb - mova m1, m0 - cmp r6b, 0x0f - jne .mask_edges ; mask edges only if required - test prid, prid - jz .sec_only - vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir - vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 - vpermb m5, m6, m15 ; pNb -%macro CDEF_FILTER_4x8_PRI 0 - vpcmpub k1, m2, m4, 6 ; pxt > pNt - vpcmpub k2, m3, m5, 6 ; pxb > pNb - psubb m6, m4, m2 - psubb m7, m5, m3 - lzcnt r6d, prid - vpsubb m6{k1}, m2, m4 ; abs(diff_top) - vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) - vpbroadcastb m13, prid - vpbroadcastq m9, [r3+r6*8] - and prid, 1 - vpbroadcastd m11, [base+pri_tap+priq*4] - vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift - vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift - mova m10, m11 - movifnidn t1d, secm - vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) - vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) - psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) - psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) - pminub m6, m12 - pminub m7, m13 - vpdpbusd m0, m6, m10 ; sum top - vpdpbusd m1, m7, m11 ; sum bottom -%endmacro - CDEF_FILTER_4x8_PRI - test t1d, t1d ; sec - jz .end_no_clip - call .sec -.end_clip: - pminub m10, m4, m2 - pminub m12, m6, m8 - pminub m11, m5, m3 - pminub m13, m7, m9 - pmaxub m4, m2 - pmaxub m6, m8 - pmaxub m5, m3 - pmaxub m7, m9 - pminub m10, m12 - pminub m11, m13 - pmaxub m4, m6 - pmaxub m5, m7 - mov r2d, 0xAAAAAAAA - kmovd k1, r2d - kxnorb k2, k2, k2 ; hw lw - vpshrdd m12, m0, m1, 16 ; m1lw m0hw - vpshrdd m6, m10, m11, 16 ; m11lw m10hw - vpshrdd m8, m4, m5, 16 ; m5lw m4hw - vpblendmw m7{k1}, m10, m11 ; m11hw m10lw - vpblendmw m9{k1}, m4, m5 ; m5hw m4lw - vpblendmw m4{k1}, m0, m12 ; m1lw m0lw - vpblendmw m5{k1}, m12, m1 ; m1hw m0hw - vpshrdd m2, m3, 16 - pminub m6, m7 - pmaxub m8, m9 - mova ym14, [base+end_perm] - vpcmpw k1, m4, m20, 1 - vpshldw m2, m5, 8 - pslldq m7, m6, 1 - pslldq m9, m8, 1 - psubw m5, m20, m4 - paddusw m0, m4, m2 ; clip >0xff - pminub m6, m7 - pmaxub m8, m9 - psubusw m0{k1}, m2, m5 ; clip <0x00 - pmaxub m0, m6 - pminub m0, m8 - vpermb m0, m14, m0 - vpscatterdd [dstq+ym21]{k2}, ym0 - RET -.sec_only: - movifnidn t1d, secm - call .sec -.end_no_clip: - mova ym4, [base+end_perm] - kxnorb k1, k1, k1 - vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) - vpshldd m3, m1, 8 - paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) - paddw m1, m3 - pslld m0, 16 - vpshrdd m0, m1, 16 - vpermb m0, m4, m0 ; output in bits 8-15 of each word - vpscatterdd [dstq+ym21]{k1}, ym0 - RET -.mask_edges_sec_only: - movifnidn t1d, secm - call .mask_edges_sec - jmp .end_no_clip -ALIGN function_align -.mask_edges: - mov t1d, r6d - or r6d, 8 ; top 4x4 has bottom - or t1d, 4 ; bottom 4x4 has top - vpbroadcastq m17, [base+edge_mask+r6*8] - vpbroadcastq m18, [base+edge_mask+t1*8] - test prid, prid - jz .mask_edges_sec_only - vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} - vpshufbitqmb k1, m17, m6 ; index in-range - vpshufbitqmb k2, m18, m6 - mova m4, m2 - mova m5, m3 - vpermb m4{k1}, m6, m14 - vpermb m5{k2}, m6, m15 - CDEF_FILTER_4x8_PRI - test t1d, t1d - jz .end_no_clip - call .mask_edges_sec - jmp .end_clip -.mask_edges_sec: - vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} - vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} - vpshufbitqmb k1, m17, m10 - vpshufbitqmb k2, m18, m10 - vpshufbitqmb k3, m17, m11 - vpshufbitqmb k4, m18, m11 - mova m6, m2 - mova m7, m3 - mova m8, m2 - mova m9, m3 - vpermb m6{k1}, m10, m14 - vpermb m7{k2}, m10, m15 - vpermb m8{k3}, 
m11, m14 - vpermb m9{k4}, m11, m15 - jmp .sec_main -ALIGN function_align -.sec: - vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 - vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 - vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 - vpermb m7, m8, m15 ; pNb - vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 - vpermb m9, m9, m15 ; pNb -.sec_main: - vpbroadcastb m18, t1d - lzcnt t1d, t1d - vpcmpub k1, m2, m6, 6 - vpcmpub k2, m3, m7, 6 - vpcmpub k3, m2, m8, 6 - vpcmpub k4, m3, m9, 6 - vpbroadcastq m17, [r3+t1*8] - psubb m10, m6, m2 - psubb m11, m7, m3 - psubb m12, m8, m2 - psubb m13, m9, m3 - vpsubb m10{k1}, m2, m6 ; abs(dt0) - vpsubb m11{k2}, m3, m7 ; abs(db0) - vpsubb m12{k3}, m2, m8 ; abs(dt1) - vpsubb m13{k4}, m3, m9 ; abs(db1) - vpbroadcastd m19, [base+sec_tap] - gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift - gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift - gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift - gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift - psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) - psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) - psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) - psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) - pminub m10, m14 - pminub m11, m15 - pminub m12, m16 - pminub m13, m17 - mova m14, m19 - mova m15, m19 - mova m16, m19 - vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) - vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) - vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) - vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) - vpdpbusd m0, m10, m14 - vpdpbusd m1, m11, m15 - vpdpbusd m0, m12, m16 - vpdpbusd m1, m13, m19 - ret - -; lut tl lut tr -; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td -; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD -; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b -; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b -; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b -; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b -; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b -; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b -; lut bl lut br -; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b -; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b -; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b -; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b -; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b -; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b -; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b -; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b - -cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ - pri, sec, dir, damping, edge -%define base r8-edge_mask - mov r6d, edgem - lea r10, [dstq+strideq*4-2] - movu xmm0, [topq+strideq*0-2] - movu xmm1, [dstq+strideq*2-2] - movu xmm2, [r10 +strideq*2 ] - lea r8, [edge_mask] - lea r9, [strideq*3] - pmovzxwq m10, [leftq-4] - vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 - vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 - vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 - lea r7, [r10 +strideq*4 ] - pmovzxwq m11, [leftq+4] - vinserti32x4 m0, [dstq+strideq*0-2], 2 - vinserti32x4 m1, [r10 +strideq*0 ], 2 - mova m12, [base+lut_perm_8x8a] - movu m13, [base+lut_perm_8x8b] - vinserti32x4 m0, [dstq+strideq*1-2], 3 - vinserti32x4 m1, [r10 +strideq*1 ], 3 - test r6b, 0x08 ; avoid buffer overread - jz .main - vinserti32x4 m2, [r7 +strideq*0], 2 - vinserti32x4 m2, [r7 +strideq*1], 3 -.main: - mov t1d, 0x11111100 - mova m14, m12 - mova m15, m13 - kmovd k1, t1d - 
kshiftrd k2, k1, 8 - movifnidn prid, prim - mov t0d, dirm - mova m30, [base+px_idx] - mov r3d, dampingm - vpermi2b m12, m0, m1 ; lut tl - vpermi2b m14, m1, m2 ; lut bl - vpermi2b m13, m0, m1 ; lut tr - vpermi2b m15, m1, m2 ; lut br - vpblendmw m12{k1}, m12, m10 - vpblendmw m14{k2}, m14, m11 - vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) - pxor m31, m31 - lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 - vpermb m4, m30, m12 ; pxtl - vpermb m5, m30, m13 ; pxtr - vpermb m6, m30, m14 ; pxbl - vpermb m7, m30, m15 ; pxbr - mova m1, m0 - mova m2, m0 - mova m3, m0 - cmp r6b, 0x0f - jne .mask_edges ; mask edges only if required - test prid, prid - jz .sec_only - vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir - vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 - vpermb m9, m11, m13 ; pNtr - vpermb m10, m11, m14 ; pNbl - vpermb m11, m11, m15 ; pNbr -%macro CDEF_FILTER_8x8_PRI 0 - vpcmpub k1, m4, m8, 6 ; pxtl > pNtl - vpcmpub k2, m5, m9, 6 ; pxtr > pNtr - vpcmpub k3, m6, m10, 6 ; pxbl > pNbl - vpcmpub k4, m7, m11, 6 ; pxbr > pNbr - psubb m16, m8, m4 - psubb m17, m9, m5 - psubb m18, m10, m6 - psubb m19, m11, m7 - lzcnt r6d, prid - vpsubb m16{k1}, m4, m8 ; abs(diff_tl) - vpsubb m17{k2}, m5, m9 ; abs(diff_tr) - vpsubb m18{k3}, m6, m10 ; abs(diff_bl) - vpsubb m19{k4}, m7, m11 ; abs(diff_br) - vpbroadcastq m28, [r3+r6*8] - vpbroadcastb m29, prid - and prid, 1 - vpbroadcastd m27, [base+pri_tap+priq*4] - vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift - vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift - vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift - vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift - mova m24, m27 - mova m25, m27 - mova m26, m27 - movifnidn t1d, secm - vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) - vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) - vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) - vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) - psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) - psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) - psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) - psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) - pminub m16, m20 - pminub m17, m21 - pminub m18, m22 - pminub m19, m23 - vpdpbusd m0, m16, m24 ; sum tl - vpdpbusd m1, m17, m25 ; sum tr - vpdpbusd m2, m18, m26 ; sum bl - vpdpbusd m3, m19, m27 ; sum br -%endmacro - CDEF_FILTER_8x8_PRI - test t1d, t1d ; sec - jz .end_no_clip - call .sec -.end_clip: - pminub m20, m8, m4 - pminub m24, m12, m16 - pminub m21, m9, m5 - pminub m25, m13, m17 - pminub m22, m10, m6 - pminub m26, m14, m18 - pminub m23, m11, m7 - pminub m27, m15, m19 - pmaxub m8, m4 - pmaxub m12, m16 - pmaxub m9, m5 - pmaxub m13, m17 - pmaxub m10, m6 - pmaxub m14, m18 - pmaxub m11, m7 - pmaxub m15, m19 - pminub m20, m24 - pminub m21, m25 - pminub m22, m26 - pminub m23, m27 - pmaxub m8, m12 - pmaxub m9, m13 - pmaxub m10, m14 - pmaxub m11, m15 - mov r2d, 0xAAAAAAAA - kmovd k1, r2d - vpshrdd m24, m0, m1, 16 - vpshrdd m25, m2, m3, 16 - vpshrdd m12, m20, m21, 16 - vpshrdd m14, m22, m23, 16 - vpshrdd m16, m8, m9, 16 - vpshrdd m18, m10, m11, 16 - vpblendmw m13{k1}, m20, m21 - vpblendmw m15{k1}, m22, m23 - vpblendmw m17{k1}, m8, m9 - vpblendmw m19{k1}, m10, m11 - vpblendmw m20{k1}, m0, m24 - vpblendmw m21{k1}, m24, m1 - vpblendmw m22{k1}, m2, m25 - vpblendmw m23{k1}, m25, m3 - vpshrdd m4, m5, 16 - vpshrdd m6, m7, 16 - pminub m12, m13 - pminub m14, m15 - pmaxub m16, m17 - pmaxub m18, m19 - mova m8, [base+end_perm_w8clip] - 
vpcmpw k2, m20, m31, 1 - vpcmpw k3, m22, m31, 1 - vpshldw m4, m21, 8 - vpshldw m6, m23, 8 - kunpckdq k1, k1, k1 - kxnorb k4, k4, k4 - vpshrdw m11, m12, m14, 8 - vpshrdw m15, m16, m18, 8 - vpblendmb m13{k1}, m12, m14 - vpblendmb m17{k1}, m16, m18 - psubw m21, m31, m20 - psubw m23, m31, m22 - paddusw m0, m20, m4 ; clip >0xff - paddusw m1, m22, m6 - pminub m11, m13 - pmaxub m15, m17 - psubusw m0{k2}, m4, m21 ; clip <0x00 - psubusw m1{k3}, m6, m23 - psrlw m0, 8 - vmovdqu8 m0{k1}, m1 - pmaxub m0, m11 - pminub m0, m15 - vpermb m0, m8, m0 - add r10, 2 - vextracti32x4 xm1, m0, 1 - vextracti32x4 xm2, m0, 2 - vextracti32x4 xm3, m0, 3 - movq [dstq+strideq*0], xm0 - movq [dstq+strideq*2], xm1 - movq [r10 +strideq*0], xm2 - movq [r10 +strideq*2], xm3 - movhps [dstq+strideq*1], xm0 - movhps [dstq+r9 ], xm1 - movhps [r10 +strideq*1], xm2 - movhps [r10 +r9 ], xm3 - RET -.sec_only: - movifnidn t1d, secm - call .sec -.end_no_clip: - mova xm8, [base+end_perm] - kxnorb k1, k1, k1 - vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) - vpshldd m5, m1, 8 - vpshldd m6, m2, 8 - vpshldd m7, m3, 8 - paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - vpermb m0, m8, m0 - vpermb m1, m8, m1 - vpermb m2, m8, m2 - vpermb m3, m8, m3 - add r10, 2 - punpckldq m4, m0, m1 - punpckhdq m0, m1 - punpckldq m5, m2, m3 - punpckhdq m2, m3 - movq [dstq+strideq*0], xm4 - movq [dstq+strideq*2], xm0 - movq [r10 +strideq*0], xm5 - movq [r10 +strideq*2], xm2 - movhps [dstq+strideq*1], xm4 - movhps [dstq+r9 ], xm0 - movhps [r10 +strideq*1], xm5 - movhps [r10 +r9 ], xm2 - RET -.mask_edges_sec_only: - movifnidn t1d, secm - call .mask_edges_sec - jmp .end_no_clip -ALIGN function_align -.mask_edges: - mov t0d, r6d - mov t1d, r6d - or t0d, 0xA ; top-left 4x4 has bottom and right - or t1d, 0x9 ; top-right 4x4 has bottom and left - vpbroadcastq m26, [base+edge_mask+t0*8] - vpbroadcastq m27, [base+edge_mask+t1*8] - mov t1d, r6d - or r6d, 0x6 ; bottom-left 4x4 has top and right - or t1d, 0x5 ; bottom-right 4x4 has top and left - vpbroadcastq m28, [base+edge_mask+r6*8] - vpbroadcastq m29, [base+edge_mask+t1*8] - mov t0d, dirm - test prid, prid - jz .mask_edges_sec_only - vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} - vpshufbitqmb k1, m26, m20 ; index in-range - vpshufbitqmb k2, m27, m20 - vpshufbitqmb k3, m28, m20 - vpshufbitqmb k4, m29, m20 - mova m8, m4 - mova m9, m5 - mova m10, m6 - mova m11, m7 - vpermb m8{k1}, m20, m12 - vpermb m9{k2}, m20, m13 - vpermb m10{k3}, m20, m14 - vpermb m11{k4}, m20, m15 - mova [rsp+0x00], m26 - mova [rsp+0x40], m27 - mova [rsp+0x80], m28 - mova [rsp+0xC0], m29 - CDEF_FILTER_8x8_PRI - test t1d, t1d - jz .end_no_clip - mova m26, [rsp+0x00] - mova m27, [rsp+0x40] - mova m28, [rsp+0x80] - mova m29, [rsp+0xC0] - call .mask_edges_sec - jmp .end_clip -.mask_edges_sec: - vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} - vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} - vpshufbitqmb k1, m26, m20 - vpshufbitqmb k2, m27, m20 - vpshufbitqmb k3, m28, m20 - vpshufbitqmb k4, m29, m20 - mova m16, m4 - mova m17, m5 - mova m18, m6 - mova m19, m7 - vpermb m16{k1}, m20, m12 - vpermb m17{k2}, m20, m13 - vpermb m18{k3}, m20, m14 - vpermb m19{k4}, m20, m15 - vpshufbitqmb k1, m26, m21 - vpshufbitqmb k2, m27, m21 - vpshufbitqmb k3, m28, m21 - vpshufbitqmb k4, m29, m21 - vpermb m12, m21, m12 - vpermb m13, m21, m13 - vpermb m14, m21, m14 - vpermb m15, m21, m15 - vpblendmb m12{k1}, m4, m12 - vpblendmb m13{k2}, m5, m13 - vpblendmb m14{k3}, m6, m14 - vpblendmb m15{k4}, m7, m15 - jmp 
.sec_main -ALIGN function_align -.sec: - vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 - vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 - vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 - vpermb m17, m20, m13 ; pNtr - vpermb m18, m20, m14 ; pNbl - vpermb m19, m20, m15 ; pNbr - vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 - vpermb m13, m21, m13 ; pNtr - vpermb m14, m21, m14 ; pNbl - vpermb m15, m21, m15 ; pNbr -.sec_main: -%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants - vpcmpub k1, m4, %1, 6 - vpcmpub k2, m5, %2, 6 - vpcmpub k3, m6, %3, 6 - vpcmpub k4, m7, %4, 6 - psubb m20, %1, m4 - psubb m21, %2, m5 - psubb m22, %3, m6 - psubb m23, %4, m7 -%if %5 - vpbroadcastb m28, t1d - lzcnt t1d, t1d - vpbroadcastq m29, [r3+t1*8] -%endif - vpsubb m20{k1}, m4, %1 - vpsubb m21{k2}, m5, %2 - vpsubb m22{k3}, m6, %3 - vpsubb m23{k4}, m7, %4 - gf2p8affineqb m24, m20, m29, 0 - gf2p8affineqb m25, m21, m29, 0 - gf2p8affineqb m26, m22, m29, 0 - gf2p8affineqb m27, m23, m29, 0 -%if %5 - vpbroadcastd m30, [base+sec_tap] -%endif - psubusb m24, m28, m24 - psubusb m25, m28, m25 - psubusb m26, m28, m26 - psubusb m27, m28, m27 - pminub m20, m24 - pminub m21, m25 - pminub m22, m26 - pminub m23, m27 - mova m24, m30 - mova m25, m30 - mova m26, m30 - mova m27, m30 - vpsubb m24{k1}, m31, m30 - vpsubb m25{k2}, m31, m30 - vpsubb m26{k3}, m31, m30 - vpsubb m27{k4}, m31, m30 - vpdpbusd m0, m20, m24 - vpdpbusd m1, m21, m25 - vpdpbusd m2, m22, m26 - vpdpbusd m3, m23, m27 -%endmacro - CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 - CDEF_FILTER_8x8_SEC m12, m13, m14, m15 - ret - -%endif ; HAVE_AVX512ICL %endif ; ARCH_X86_64 diff --git a/src/x86/cdef_avx512.asm b/src/x86/cdef_avx512.asm new file mode 100644 index 0000000000..b1fa1ad16f --- /dev/null +++ b/src/x86/cdef_avx512.asm @@ -0,0 +1,868 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if HAVE_AVX512ICL && ARCH_X86_64 + +%macro DUP4 1-* + %rep %0 + times 4 db %1 + %rotate 1 + %endrep +%endmacro + +%macro DIRS 16 ; cdef_directions[] + %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 + ; masking away unused bits allows us to use a single vpaddd {1to16} + ; instruction instead of having to do vpbroadcastd + paddb + db %13 & 0x3f, -%13 & 0x3f + %rotate 1 + %endrep +%endmacro + +SECTION_RODATA 64 + +lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 + db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 + db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 +lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 + db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 + db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 + db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 +pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 +lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 + db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 + db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119 +lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 + db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 + db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 + db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 +edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 + dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 + dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 + dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 + dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 + dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 + dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 + dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 +px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 +cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 +gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 + dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 + dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 + dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 + times 16 db 0 ; realign (introduced by cdef_dirs) +end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 +pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 +sec_tap: db 32, 32, 16, 16 +pd_268435568: dd 268435568 + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 5, 6 +%else +DECLARE_REG_TMP 8, 5 +%endif + +; lut: +; t0 t1 t2 t3 t4 t5 t6 t7 +; T0 T1 T2 T3 T4 T5 T6 T7 +; L0 L1 00 01 02 03 04 05 +; L2 L3 10 11 12 13 14 15 +; L4 L5 20 21 22 23 24 25 +; L6 L7 30 31 32 33 34 35 +; 4e 4f 40 41 42 43 44 45 +; 5e 5f 50 51 52 53 54 55 + +INIT_ZMM avx512icl 
+cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +%define base r7-edge_mask + movq xmm0, [dstq+strideq*0] + movhps xmm0, [dstq+strideq*1] + lea r7, [edge_mask] + movq xmm1, [topq+strideq*0-2] + movhps xmm1, [topq+strideq*1-2] + mov r6d, edgem + vinserti32x4 ym0, ymm0, [leftq], 1 + lea r2, [strideq*3] + vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 + mova m5, [base+lut_perm_4x4] + vinserti32x4 m0, [dstq+r2], 2 + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r3, [dstq+strideq*4-4] + vinserti32x4 m1, [r3+strideq*0], 2 + vinserti32x4 m0, [r3+strideq*1], 3 +.main: + movifnidn prid, prim + mov t0d, dirm + mova m3, [base+px_idx] + mov r3d, dampingm + vpermi2b m5, m0, m1 ; lut + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m7, m7 + lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m6, m3, m5 ; px + cmp r6d, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 +%macro CDEF_FILTER_4x4_PRI 0 + vpcmpub k1, m6, m1, 6 ; px > pN + psubb m2, m1, m6 + lzcnt r6d, prid + vpsubb m2{k1}, m6, m1 ; abs(diff) + vpbroadcastb m4, prid + and prid, 1 + vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift + movifnidn t1d, secm + vpbroadcastd m10, [base+pri_tap+priq*4] + vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) + psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) + pminub m2, m4 + vpdpbusd m0, m2, m10 ; sum +%endmacro + CDEF_FILTER_4x4_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m4, m6, m1 + pmaxub m1, m6 + pminub m5, m2, m3 + pmaxub m2, m3 + pminub m4, m5 + pmaxub m2, m1 + psrldq m1, m4, 2 + psrldq m3, m2, 2 + pminub m1, m4 + vpcmpw k1, m0, m7, 1 + vpshldd m6, m0, 8 + pmaxub m2, m3 + pslldq m3, m1, 1 + psubw m7, m0 + paddusw m0, m6 ; clip >0xff + vpsubusw m0{k1}, m6, m7 ; clip <0x00 + pslldq m4, m2, 1 + pminub m1, m3 + pmaxub m2, m4 + pmaxub m0, m1 + pminub m0, m2 + jmp .end +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) +.end: + mova xm1, [base+end_perm] + vpermb m0, m1, m0 ; output in bits 8-15 of each dword + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + vpbroadcastq m8, [base+edge_mask+r6*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m8, m2 ; index in-range + mova m1, m6 + vpermb m1{k1}, m2, m5 + CDEF_FILTER_4x4_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m8, m4 + mova m2, m6 + vpermb m2{k1}, m4, m5 + vpshufbitqmb k1, m8, m9 + mova m3, m6 + vpermb m3{k1}, m9, m5 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 + vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 +.sec_main: + vpbroadcastd m8, [base+sec_tap] + vpcmpub k1, m6, m2, 6 + psubb m4, m2, m6 + vpbroadcastb m12, t1d + lzcnt t1d, t1d + vpsubb m4{k1}, m6, m2 + vpcmpub k2, m6, m3, 6 + 
vpbroadcastq m11, [r3+t1*8] + gf2p8affineqb m10, m4, m11, 0 + psubb m5, m3, m6 + mova m9, m8 + vpsubb m8{k1}, m7, m8 + psubusb m10, m12, m10 + vpsubb m5{k2}, m6, m3 + pminub m4, m10 + vpdpbusd m0, m4, m8 + gf2p8affineqb m11, m5, m11, 0 + vpsubb m9{k2}, m7, m9 + psubusb m12, m11 + pminub m5, m12 + vpdpbusd m0, m5, m9 + ret + +DECLARE_REG_TMP 2, 7 + +; lut top lut bottom +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 +; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 +; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 + +cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + vpbroadcastd ym21, strided + mov r6d, edgem + lea r8, [edge_mask] + movq xm1, [topq+strideq*0-2] + pmulld ym21, [base+pd_01234567] + kxnorb k1, k1, k1 + movq xm2, [topq+strideq*1-2] + vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 + mova m14, [base+lut_perm_4x8a] + movu m15, [base+lut_perm_4x8b] + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r7, [dstq+strideq*8-2] + vinserti32x4 ym1, [r7+strideq*0], 1 + vinserti32x4 ym2, [r7+strideq*1], 1 +.main: + punpcklqdq ym1, ym2 + vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ + movifnidn prid, prim + mov t0d, dirm + mova m16, [base+px_idx] + mov r3d, dampingm + vpermi2b m14, m0, m1 ; lut top + vpermi2b m15, m0, m1 ; lut bottom + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m20, m20 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m2, m16, m14 ; pxt + vpermb m3, m16, m15 ; pxb + mova m1, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 + vpermb m5, m6, m15 ; pNb +%macro CDEF_FILTER_4x8_PRI 0 + vpcmpub k1, m2, m4, 6 ; pxt > pNt + vpcmpub k2, m3, m5, 6 ; pxb > pNb + psubb m6, m4, m2 + psubb m7, m5, m3 + lzcnt r6d, prid + vpsubb m6{k1}, m2, m4 ; abs(diff_top) + vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) + vpbroadcastb m13, prid + vpbroadcastq m9, [r3+r6*8] + and prid, 1 + vpbroadcastd m11, [base+pri_tap+priq*4] + vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift + vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift + mova m10, m11 + movifnidn t1d, secm + vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) + vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) + psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) + psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) + pminub m6, m12 + pminub m7, m13 + vpdpbusd m0, m6, m10 ; sum top + vpdpbusd m1, m7, m11 ; sum bottom +%endmacro + CDEF_FILTER_4x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m10, m4, m2 + pminub m12, m6, m8 + pminub m11, m5, m3 + pminub m13, m7, m9 + pmaxub m4, m2 + pmaxub m6, m8 + pmaxub m5, m3 + pmaxub m7, m9 + pminub m10, m12 + pminub m11, m13 + pmaxub m4, m6 + pmaxub m5, m7 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + kxnorb k2, k2, k2 ; hw lw + vpshrdd m12, m0, m1, 16 ; m1lw m0hw + vpshrdd m6, m10, m11, 16 ; m11lw m10hw + vpshrdd m8, m4, m5, 16 ; m5lw m4hw + vpblendmw m7{k1}, m10, m11 ; m11hw m10lw + vpblendmw m9{k1}, m4, m5 ; m5hw m4lw + vpblendmw m4{k1}, m0, m12 ; m1lw m0lw + vpblendmw m5{k1}, m12, m1 ; m1hw m0hw + vpshrdd m2, m3, 16 + pminub 
m6, m7 + pmaxub m8, m9 + mova ym14, [base+end_perm] + vpcmpw k1, m4, m20, 1 + vpshldw m2, m5, 8 + pslldq m7, m6, 1 + pslldq m9, m8, 1 + psubw m5, m20, m4 + paddusw m0, m4, m2 ; clip >0xff + pminub m6, m7 + pmaxub m8, m9 + psubusw m0{k1}, m2, m5 ; clip <0x00 + pmaxub m0, m6 + pminub m0, m8 + vpermb m0, m14, m0 + vpscatterdd [dstq+ym21]{k2}, ym0 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova ym4, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m3, m1, 8 + paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m3 + pslld m0, 16 + vpshrdd m0, m1, 16 + vpermb m0, m4, m0 ; output in bits 8-15 of each word + vpscatterdd [dstq+ym21]{k1}, ym0 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t1d, r6d + or r6d, 8 ; top 4x4 has bottom + or t1d, 4 ; bottom 4x4 has top + vpbroadcastq m17, [base+edge_mask+r6*8] + vpbroadcastq m18, [base+edge_mask+t1*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m17, m6 ; index in-range + vpshufbitqmb k2, m18, m6 + mova m4, m2 + mova m5, m3 + vpermb m4{k1}, m6, m14 + vpermb m5{k2}, m6, m15 + CDEF_FILTER_4x8_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m17, m10 + vpshufbitqmb k2, m18, m10 + vpshufbitqmb k3, m17, m11 + vpshufbitqmb k4, m18, m11 + mova m6, m2 + mova m7, m3 + mova m8, m2 + mova m9, m3 + vpermb m6{k1}, m10, m14 + vpermb m7{k2}, m10, m15 + vpermb m8{k3}, m11, m14 + vpermb m9{k4}, m11, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 + vpermb m7, m8, m15 ; pNb + vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 + vpermb m9, m9, m15 ; pNb +.sec_main: + vpbroadcastb m18, t1d + lzcnt t1d, t1d + vpcmpub k1, m2, m6, 6 + vpcmpub k2, m3, m7, 6 + vpcmpub k3, m2, m8, 6 + vpcmpub k4, m3, m9, 6 + vpbroadcastq m17, [r3+t1*8] + psubb m10, m6, m2 + psubb m11, m7, m3 + psubb m12, m8, m2 + psubb m13, m9, m3 + vpsubb m10{k1}, m2, m6 ; abs(dt0) + vpsubb m11{k2}, m3, m7 ; abs(db0) + vpsubb m12{k3}, m2, m8 ; abs(dt1) + vpsubb m13{k4}, m3, m9 ; abs(db1) + vpbroadcastd m19, [base+sec_tap] + gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift + gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift + gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift + gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift + psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) + psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) + psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) + psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) + pminub m10, m14 + pminub m11, m15 + pminub m12, m16 + pminub m13, m17 + mova m14, m19 + mova m15, m19 + mova m16, m19 + vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) + vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) + vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) + vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) + vpdpbusd m0, m10, m14 + vpdpbusd m1, m11, m15 + vpdpbusd m0, m12, m16 + vpdpbusd m1, m13, m19 + ret + +; lut tl lut tr +; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td +; T0 T1 T2 T3 T4 T5 
T6 T7 T6 T7 T8 T9 TA TB TC TD +; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b +; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; lut bl lut br +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b +; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b +; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b +; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b + +cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + mov r6d, edgem + lea r10, [dstq+strideq*4-2] + movu xmm0, [topq+strideq*0-2] + movu xmm1, [dstq+strideq*2-2] + movu xmm2, [r10 +strideq*2 ] + lea r8, [edge_mask] + lea r9, [strideq*3] + pmovzxwq m10, [leftq-4] + vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 + vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 + vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 + lea r7, [r10 +strideq*4 ] + pmovzxwq m11, [leftq+4] + vinserti32x4 m0, [dstq+strideq*0-2], 2 + vinserti32x4 m1, [r10 +strideq*0 ], 2 + mova m12, [base+lut_perm_8x8a] + movu m13, [base+lut_perm_8x8b] + vinserti32x4 m0, [dstq+strideq*1-2], 3 + vinserti32x4 m1, [r10 +strideq*1 ], 3 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m2, [r7 +strideq*0], 2 + vinserti32x4 m2, [r7 +strideq*1], 3 +.main: + mov t1d, 0x11111100 + mova m14, m12 + mova m15, m13 + kmovd k1, t1d + kshiftrd k2, k1, 8 + movifnidn prid, prim + mov t0d, dirm + mova m30, [base+px_idx] + mov r3d, dampingm + vpermi2b m12, m0, m1 ; lut tl + vpermi2b m14, m1, m2 ; lut bl + vpermi2b m13, m0, m1 ; lut tr + vpermi2b m15, m1, m2 ; lut br + vpblendmw m12{k1}, m12, m10 + vpblendmw m14{k2}, m14, m11 + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m31, m31 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m4, m30, m12 ; pxtl + vpermb m5, m30, m13 ; pxtr + vpermb m6, m30, m14 ; pxbl + vpermb m7, m30, m15 ; pxbr + mova m1, m0 + mova m2, m0 + mova m3, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 + vpermb m9, m11, m13 ; pNtr + vpermb m10, m11, m14 ; pNbl + vpermb m11, m11, m15 ; pNbr +%macro CDEF_FILTER_8x8_PRI 0 + vpcmpub k1, m4, m8, 6 ; pxtl > pNtl + vpcmpub k2, m5, m9, 6 ; pxtr > pNtr + vpcmpub k3, m6, m10, 6 ; pxbl > pNbl + vpcmpub k4, m7, m11, 6 ; pxbr > pNbr + psubb m16, m8, m4 + psubb m17, m9, m5 + psubb m18, m10, m6 + psubb m19, m11, m7 + lzcnt r6d, prid + vpsubb m16{k1}, m4, m8 ; abs(diff_tl) + vpsubb m17{k2}, m5, m9 ; abs(diff_tr) + vpsubb m18{k3}, m6, m10 ; abs(diff_bl) + vpsubb m19{k4}, m7, m11 ; abs(diff_br) + vpbroadcastq m28, [r3+r6*8] + vpbroadcastb m29, prid + and prid, 1 + vpbroadcastd m27, [base+pri_tap+priq*4] + vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift + vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift + vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift + vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift + mova m24, m27 + mova m25, m27 + mova m26, m27 + movifnidn t1d, secm + vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) + vpsubb 
m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) + psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) + psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) + psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) + psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) + pminub m16, m20 + pminub m17, m21 + pminub m18, m22 + pminub m19, m23 + vpdpbusd m0, m16, m24 ; sum tl + vpdpbusd m1, m17, m25 ; sum tr + vpdpbusd m2, m18, m26 ; sum bl + vpdpbusd m3, m19, m27 ; sum br +%endmacro + CDEF_FILTER_8x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m20, m8, m4 + pminub m24, m12, m16 + pminub m21, m9, m5 + pminub m25, m13, m17 + pminub m22, m10, m6 + pminub m26, m14, m18 + pminub m23, m11, m7 + pminub m27, m15, m19 + pmaxub m8, m4 + pmaxub m12, m16 + pmaxub m9, m5 + pmaxub m13, m17 + pmaxub m10, m6 + pmaxub m14, m18 + pmaxub m11, m7 + pmaxub m15, m19 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + pmaxub m8, m12 + pmaxub m9, m13 + pmaxub m10, m14 + pmaxub m11, m15 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + vpshrdd m24, m0, m1, 16 + vpshrdd m25, m2, m3, 16 + vpshrdd m12, m20, m21, 16 + vpshrdd m14, m22, m23, 16 + vpshrdd m16, m8, m9, 16 + vpshrdd m18, m10, m11, 16 + vpblendmw m13{k1}, m20, m21 + vpblendmw m15{k1}, m22, m23 + vpblendmw m17{k1}, m8, m9 + vpblendmw m19{k1}, m10, m11 + vpblendmw m20{k1}, m0, m24 + vpblendmw m21{k1}, m24, m1 + vpblendmw m22{k1}, m2, m25 + vpblendmw m23{k1}, m25, m3 + vpshrdd m4, m5, 16 + vpshrdd m6, m7, 16 + pminub m12, m13 + pminub m14, m15 + pmaxub m16, m17 + pmaxub m18, m19 + mova m8, [base+end_perm_w8clip] + vpcmpw k2, m20, m31, 1 + vpcmpw k3, m22, m31, 1 + vpshldw m4, m21, 8 + vpshldw m6, m23, 8 + kunpckdq k1, k1, k1 + kxnorb k4, k4, k4 + vpshrdw m11, m12, m14, 8 + vpshrdw m15, m16, m18, 8 + vpblendmb m13{k1}, m12, m14 + vpblendmb m17{k1}, m16, m18 + psubw m21, m31, m20 + psubw m23, m31, m22 + paddusw m0, m20, m4 ; clip >0xff + paddusw m1, m22, m6 + pminub m11, m13 + pmaxub m15, m17 + psubusw m0{k2}, m4, m21 ; clip <0x00 + psubusw m1{k3}, m6, m23 + psrlw m0, 8 + vmovdqu8 m0{k1}, m1 + pmaxub m0, m11 + pminub m0, m15 + vpermb m0, m8, m0 + add r10, 2 + vextracti32x4 xm1, m0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movq [r10 +strideq*0], xm2 + movq [r10 +strideq*2], xm3 + movhps [dstq+strideq*1], xm0 + movhps [dstq+r9 ], xm1 + movhps [r10 +strideq*1], xm2 + movhps [r10 +r9 ], xm3 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova xm8, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m5, m1, 8 + vpshldd m6, m2, 8 + vpshldd m7, m3, 8 + paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + vpermb m0, m8, m0 + vpermb m1, m8, m1 + vpermb m2, m8, m2 + vpermb m3, m8, m3 + add r10, 2 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm0 + movq [r10 +strideq*0], xm5 + movq [r10 +strideq*2], xm2 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r9 ], xm0 + movhps [r10 +strideq*1], xm5 + movhps [r10 +r9 ], xm2 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t0d, r6d + mov t1d, r6d + or t0d, 0xA ; top-left 4x4 has bottom and right + or t1d, 0x9 ; top-right 
4x4 has bottom and left + vpbroadcastq m26, [base+edge_mask+t0*8] + vpbroadcastq m27, [base+edge_mask+t1*8] + mov t1d, r6d + or r6d, 0x6 ; bottom-left 4x4 has top and right + or t1d, 0x5 ; bottom-right 4x4 has top and left + vpbroadcastq m28, [base+edge_mask+r6*8] + vpbroadcastq m29, [base+edge_mask+t1*8] + mov t0d, dirm + test prid, prid + jz .mask_edges_sec_only + vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m26, m20 ; index in-range + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m8, m4 + mova m9, m5 + mova m10, m6 + mova m11, m7 + vpermb m8{k1}, m20, m12 + vpermb m9{k2}, m20, m13 + vpermb m10{k3}, m20, m14 + vpermb m11{k4}, m20, m15 + mova [rsp+0x00], m26 + mova [rsp+0x40], m27 + mova [rsp+0x80], m28 + mova [rsp+0xC0], m29 + CDEF_FILTER_8x8_PRI + test t1d, t1d + jz .end_no_clip + mova m26, [rsp+0x00] + mova m27, [rsp+0x40] + mova m28, [rsp+0x80] + mova m29, [rsp+0xC0] + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m26, m20 + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m16, m4 + mova m17, m5 + mova m18, m6 + mova m19, m7 + vpermb m16{k1}, m20, m12 + vpermb m17{k2}, m20, m13 + vpermb m18{k3}, m20, m14 + vpermb m19{k4}, m20, m15 + vpshufbitqmb k1, m26, m21 + vpshufbitqmb k2, m27, m21 + vpshufbitqmb k3, m28, m21 + vpshufbitqmb k4, m29, m21 + vpermb m12, m21, m12 + vpermb m13, m21, m13 + vpermb m14, m21, m14 + vpermb m15, m21, m15 + vpblendmb m12{k1}, m4, m12 + vpblendmb m13{k2}, m5, m13 + vpblendmb m14{k3}, m6, m14 + vpblendmb m15{k4}, m7, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 + vpermb m17, m20, m13 ; pNtr + vpermb m18, m20, m14 ; pNbl + vpermb m19, m20, m15 ; pNbr + vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 + vpermb m13, m21, m13 ; pNtr + vpermb m14, m21, m14 ; pNbl + vpermb m15, m21, m15 ; pNbr +.sec_main: +%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants + vpcmpub k1, m4, %1, 6 + vpcmpub k2, m5, %2, 6 + vpcmpub k3, m6, %3, 6 + vpcmpub k4, m7, %4, 6 + psubb m20, %1, m4 + psubb m21, %2, m5 + psubb m22, %3, m6 + psubb m23, %4, m7 +%if %5 + vpbroadcastb m28, t1d + lzcnt t1d, t1d + vpbroadcastq m29, [r3+t1*8] +%endif + vpsubb m20{k1}, m4, %1 + vpsubb m21{k2}, m5, %2 + vpsubb m22{k3}, m6, %3 + vpsubb m23{k4}, m7, %4 + gf2p8affineqb m24, m20, m29, 0 + gf2p8affineqb m25, m21, m29, 0 + gf2p8affineqb m26, m22, m29, 0 + gf2p8affineqb m27, m23, m29, 0 +%if %5 + vpbroadcastd m30, [base+sec_tap] +%endif + psubusb m24, m28, m24 + psubusb m25, m28, m25 + psubusb m26, m28, m26 + psubusb m27, m28, m27 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + mova m24, m30 + mova m25, m30 + mova m26, m30 + mova m27, m30 + vpsubb m24{k1}, m31, m30 + vpsubb m25{k2}, m31, m30 + vpsubb m26{k3}, m31, m30 + vpsubb m27{k4}, m31, m30 + vpdpbusd m0, m20, m24 + vpdpbusd m1, m21, m25 + vpdpbusd m2, m22, m26 + vpdpbusd m3, m23, m27 +%endmacro + CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 + CDEF_FILTER_8x8_SEC m12, m13, m14, m15 + ret + +%endif ; HAVE_AVX512ICL && ARCH_X86_64 From d6ab5ec8b777704fed82580ca4e43efc55ced236 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sun, 10 May 2020 15:55:04 +0200 Subject: [PATCH 070/155] x86: Use 'test' instead of 'or' to compare 
with zero Allows for macro-op fusion. --- src/x86/cdef_avx2.asm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm index 0eafdb1eff..672fae7a51 100644 --- a/src/x86/cdef_avx2.asm +++ b/src/x86/cdef_avx2.asm @@ -460,14 +460,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, secdmpd jz .pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd @@ -1469,14 +1469,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .border_sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, secdmpd jz .border_pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd From 23d0e6701dbade59f932f57a367a7bee3aca393f Mon Sep 17 00:00:00 2001 From: Tristan Matthews Date: Wed, 3 Mar 2021 13:40:51 -0500 Subject: [PATCH 071/155] doc: fix typo in CONTRIBUTING.md (#2681) --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6fd3052367..081b29456d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ cargo bench --features=bench Install `cargo-fuzz` with `cargo install cargo-fuzz`. Running fuzz targets with stable Rust requires `--sanitizer=none` or the shorter `-s none`. * List the fuzz targets with `cargo fuzz list`. -* Run a fuzz target with `cargo fuzz run --santizer=none `. +* Run a fuzz target with `cargo fuzz run --sanitizer=none `. * Parallel fuzzing: `cargo fuzz run -s none --jobs -- -workers=`. * Bump the "slow unit" time limit: `cargo fuzz run -s none -- -report_slow_units=600`. * Make the fuzzer generate long inputs right away: `cargo fuzz run -s none -- -max_len=256 -len_control=0`. From a8ea7ad115ce2309f7f039419ce4b48cc8b4418d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 30 Nov 2020 11:35:53 +0200 Subject: [PATCH 072/155] arm64: looprestoration16: Don't keep precalculated squares in box3/5_h Instead of calculating squares of pixels once, and shifting and adding the precalculated squares, just do multiply-accumulate of the pixels that are shifted anyway for the non-squared sum. This results in more multiplications in total, but fewer instructions, and multiplications aren't that much more expensive than regular arithmetic operations anyway. On Cortex A53 and A72, this is a fairly substantial gain, on A73 it's a very marginal gain. The runtimes for the box3/5_h functions themselves are reduced by around 16-20%, and the overall runtime for SGR is reduced by around 2-8%. 
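To make the restructuring concrete, here is a minimal scalar sketch of what each box3_h output element computes (illustrative only, not part of the patch; the names box3_h_sketch/sumsq/sum/px are placeholders). The NEON change below simply forms the squared sum with multiply-accumulate (umull/umlal) on the same shifted pixel vectors that already feed the plain sum, instead of shifting and adding vectors of precalculated squares:

    #include <stdint.h>

    /* 3-tap horizontal box sums for one row: sum of pixels and sum of squares.
     * Pixel values are 10/12-bit here, so the sums fit int16_t/int32_t. */
    static void box3_h_sketch(int32_t *sumsq, int16_t *sum,
                              const uint16_t *px, int w)
    {
        for (int i = 0; i < w; i++) {
            const uint16_t a = px[i], b = px[i + 1], c = px[i + 2];
            sum[i]   = (int16_t)(a + b + c);
            sumsq[i] = (int32_t)a * a + (int32_t)b * b + (int32_t)c * c;
        }
    }

Either instruction schedule produces exactly these values; the difference is only in how many vector instructions it takes to get there.
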
Before: Cortex A53 A72 A73 selfguided_3x3_10bpc_neon: 513086.5 385767.7 348774.3 selfguided_5x5_10bpc_neon: 378108.6 291133.5 253251.4 selfguided_mix_10bpc_neon: 876833.1 662801.0 586387.4 After: Cortex A53 A72 A73 selfguided_3x3_10bpc_neon: 502734.0 363754.5 343199.8 selfguided_5x5_10bpc_neon: 361696.4 265848.2 249476.8 selfguided_mix_10bpc_neon: 852683.8 615848.6 577615.0 --- src/arm/64/looprestoration16.S | 170 ++++++++++++--------------------- 1 file changed, 63 insertions(+), 107 deletions(-) diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 437988cfac..18de59a7d3 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -770,13 +770,6 @@ function sgr_box3_h_16bpc_neon, export=1 ext v16.16b, v18.16b, v16.16b, #12 2: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with @@ -796,40 +789,32 @@ function sgr_box3_h_16bpc_neon, export=1 b 6f 4: // Loop horizontally -.macro ext_n dst1, dst2, src1, src2, src3, n, w - ext \dst1, \src1, \src2, \n +.macro add3 w, wd + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6\wd, v0\wd, v26\wd + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7\wd, v16\wd, v28\wd + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6\wd, v6\wd, v27\wd .if \w > 4 - ext \dst2, \src2, \src3, \n + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h .endif -.endm -.macro add_n dst1, dst2, src1, src2, src3, src4, w - add \dst1, \src1, \src3 + add v7\wd, v7\wd, v29\wd .if \w > 4 - add \dst2, \src2, \src4 + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h .endif -.endm - -.macro add3 w, wd - ext v24.16b, v0.16b, v1.16b, #2 - ext v25.16b, v0.16b, v1.16b, #4 - ext v26.16b, v16.16b, v17.16b, #2 - ext v27.16b, v16.16b, v17.16b, #4 - add v6\wd, v0\wd, v24\wd - add v7\wd, v16\wd, v26\wd - add v6\wd, v6\wd, v25\wd - add v7\wd, v7\wd, v27\wd - - ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w - ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w - - add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w - - ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w - ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w - - add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w .endm add3 8, .8h st1 {v6.8h}, [x1], #16 @@ -844,12 +829,6 @@ function sgr_box3_h_16bpc_neon, export=1 mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 - mov v2.16b, v4.16b - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - mov v18.16b, v20.16b - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
@@ -907,11 +886,6 @@ L(box3_variable_shift_tbl): .hword L(box3_variable_shift_tbl) - 55b 88: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - add3 4, .4h subs w5, w5, #4 st1 {v6.4h}, [x1], #8 @@ -921,10 +895,6 @@ L(box3_variable_shift_tbl): b.le 9f ext v0.16b, v0.16b, v0.16b, #8 ext v16.16b, v16.16b, v16.16b, #8 - mov v2.16b, v3.16b - mov v3.16b, v4.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b // Only one needed pixel left, but do a normal 4 pixel // addition anyway add3 4, .4h @@ -1036,13 +1006,6 @@ function sgr_box5_h_16bpc_neon, export=1 ext v16.16b, v18.16b, v16.16b, #10 2: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with @@ -1063,43 +1026,53 @@ function sgr_box5_h_16bpc_neon, export=1 4: // Loop horizontally .macro add5 w, wd - ext v24.16b, v0.16b, v1.16b, #2 - ext v25.16b, v0.16b, v1.16b, #4 - ext v26.16b, v0.16b, v1.16b, #6 - ext v27.16b, v0.16b, v1.16b, #8 - - add v6\wd, v0\wd, v24\wd - add v25\wd, v25\wd, v26\wd + ext v26.16b, v0.16b, v1.16b, #2 + ext v28.16b, v16.16b, v17.16b, #2 + ext v27.16b, v0.16b, v1.16b, #4 + ext v29.16b, v16.16b, v17.16b, #4 + + add v6\wd, v0\wd, v26\wd + umull v22.4s, v0.4h, v0.4h + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7\wd, v16\wd, v28\wd + umull v24.4s, v16.4h, v16.4h + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h add v6\wd, v6\wd, v27\wd +.if \w > 4 + umull2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h +.endif + add v7\wd, v7\wd, v29\wd +.if \w > 4 + umull2 v25.4s, v16.8h, v16.8h + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h +.endif - ext v26.16b, v16.16b, v17.16b, #2 - ext v27.16b, v16.16b, v17.16b, #4 + ext v26.16b, v0.16b, v1.16b, #6 ext v28.16b, v16.16b, v17.16b, #6 + ext v27.16b, v0.16b, v1.16b, #8 ext v29.16b, v16.16b, v17.16b, #8 - add v7\wd, v16\wd, v26\wd - add v27\wd, v27\wd, v28\wd + add v6\wd, v6\wd, v26\wd + umlal v22.4s, v26.4h, v26.4h + umlal v22.4s, v27.4h, v27.4h + add v7\wd, v7\wd, v28\wd + umlal v24.4s, v28.4h, v28.4h + umlal v24.4s, v29.4h, v29.4h + add v6\wd, v6\wd, v27\wd +.if \w > 4 + umlal2 v23.4s, v26.8h, v26.8h + umlal2 v23.4s, v27.8h, v27.8h +.endif add v7\wd, v7\wd, v29\wd - add v6\wd, v6\wd, v25\wd - add v7\wd, v7\wd, v27\wd - - ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w - ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w - ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w - - add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w - add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w - add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w - - ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w - ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w - ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w - - add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w - add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w - add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w +.if \w > 4 + umlal2 v25.4s, v28.8h, v28.8h + umlal2 v25.4s, v29.8h, v29.8h +.endif .endm add5 8, .8h st1 {v6.8h}, [x1], #16 @@ -1114,12 +1087,6 @@ function sgr_box5_h_16bpc_neon, 
export=1 mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 - mov v2.16b, v4.16b - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - mov v18.16b, v20.16b - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. @@ -1193,13 +1160,6 @@ L(box5_variable_shift_tbl): .hword L(box5_variable_shift_tbl) - 77b 88: - umull v2.4s, v0.4h, v0.4h - umull2 v3.4s, v0.8h, v0.8h - umull v4.4s, v1.4h, v1.4h - umull v18.4s, v16.4h, v16.4h - umull2 v19.4s, v16.8h, v16.8h - umull v20.4s, v17.4h, v17.4h - add5 4, .4h subs w5, w5, #4 st1 {v6.4h}, [x1], #8 @@ -1209,10 +1169,6 @@ L(box5_variable_shift_tbl): b.le 9f ext v0.16b, v0.16b, v1.16b, #8 ext v16.16b, v16.16b, v17.16b, #8 - mov v2.16b, v3.16b - mov v3.16b, v4.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b add5 4, .4h st1 {v6.4h}, [x1], #8 st1 {v7.4h}, [x11], #8 From ed15b5bcd3baeae505fa9f3f3a5a872a79876942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 27 Nov 2020 16:26:54 +0200 Subject: [PATCH 073/155] arm32: looprestoration: Specify alignment in loads/stores in SGR where possible --- src/arm/32/looprestoration.S | 94 ++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index 073525a5a5..6cfe0f874e 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -1675,12 +1675,12 @@ function sgr_finish_filter1_8bpc_neon, export=1 vmov.i16 q14, #3 vmov.i32 q15, #3 1: - vld1.16 {q0}, [r9]! - vld1.16 {q1}, [r4]! - vld1.16 {q2}, [r10]! - vld1.32 {q8, q9}, [r7]! - vld1.32 {q10, q11}, [r3]! - vld1.32 {q12, q13}, [r8]! + vld1.16 {q0}, [r9, :128]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r10, :128]! + vld1.32 {q8, q9}, [r7, :128]! + vld1.32 {q10, q11}, [r3, :128]! + vld1.32 {q12, q13}, [r8, :128]! 2: subs r5, r5, #4 @@ -1711,7 +1711,7 @@ function sgr_finish_filter1_8bpc_neon, export=1 vadd.i32 q3, q3, q5 vext.8 q7, q12, q13, #4 // +stride vext.8 q10, q12, q13, #8 // +1+stride - vld1.32 {d24[0]}, [r1]! // src + vld1.32 {d24[0]}, [r1, :32]! // src vadd.i32 q3, q3, q7 // +stride vadd.i32 q8, q8, q10 // +1+stride vshl.i32 q3, q3, #2 @@ -1728,12 +1728,12 @@ function sgr_finish_filter1_8bpc_neon, export=1 vmov q8, q9 vmov q10, q11 vmov q12, q13 - vld1.16 {d1}, [r9]! - vld1.16 {d3}, [r4]! - vld1.16 {d5}, [r10]! - vld1.32 {q9}, [r7]! - vld1.32 {q11}, [r3]! - vld1.32 {q13}, [r8]! + vld1.16 {d1}, [r9, :64]! + vld1.16 {d3}, [r4, :64]! + vld1.16 {d5}, [r10, :64]! + vld1.32 {q9}, [r7, :128]! + vld1.32 {q11}, [r3, :128]! + vld1.32 {q13}, [r8, :128]! b 2b 3: @@ -1779,12 +1779,12 @@ function sgr_finish_filter2_8bpc_neon, export=1 mov lr, r5 1: - vld1.16 {q0, q1}, [r4]! - vld1.16 {q2, q3}, [r8]! - vld1.32 {q8, q9}, [r3]! - vld1.32 {q11, q12}, [r7]! - vld1.32 {q10}, [r3]! - vld1.32 {q13}, [r7]! + vld1.16 {q0, q1}, [r4, :128]! + vld1.16 {q2, q3}, [r8, :128]! + vld1.32 {q8, q9}, [r3, :128]! + vld1.32 {q11, q12}, [r7, :128]! + vld1.32 {q10}, [r3, :128]! + vld1.32 {q13}, [r7, :128]! 2: vmov.i16 q14, #5 @@ -1816,7 +1816,7 @@ function sgr_finish_filter2_8bpc_neon, export=1 vext.8 q8, q11, q12, #4 // +stride vext.8 q11, q12, q13, #4 - vld1.8 {d4}, [r1]! + vld1.8 {d4}, [r1, :64]! vmov.i32 q14, #5 vmov.i32 q15, #6 @@ -1835,15 +1835,15 @@ function sgr_finish_filter2_8bpc_neon, export=1 vrshrn.i32 d8, q4, #9 vrshrn.i32 d9, q5, #9 vmov q2, q3 - vst1.16 {q4}, [r0]! + vst1.16 {q4}, [r0, :128]! 
ble 3f vmov q8, q10 vmov q11, q13 - vld1.16 {q1}, [r4]! - vld1.16 {q3}, [r8]! - vld1.32 {q9, q10}, [r3]! - vld1.32 {q12, q13}, [r7]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q3}, [r8, :128]! + vld1.32 {q9, q10}, [r3, :128]! + vld1.32 {q12, q13}, [r7, :128]! b 2b 3: @@ -1857,9 +1857,9 @@ function sgr_finish_filter2_8bpc_neon, export=1 add r4, r4, r12, lsl #1 add r8, r8, r12, lsl #1 - vld1.32 {q8, q9}, [r3]! - vld1.16 {q0, q1}, [r4]! - vld1.32 {q10}, [r3]! + vld1.32 {q8, q9}, [r3, :128]! + vld1.16 {q0, q1}, [r4, :128]! + vld1.32 {q10}, [r3, :128]! vmov.i16 q12, #5 vmov.i16 q13, #6 @@ -1876,7 +1876,7 @@ function sgr_finish_filter2_8bpc_neon, export=1 vext.8 q7, q9, q10, #8 vmul.i16 q2, q2, q13 // * 6 vmla.i16 q2, q0, q12 // * 5 -> a - vld1.8 {d22}, [r1]! + vld1.8 {d22}, [r1, :64]! vadd.i32 q8, q8, q6 // -1, +1 vadd.i32 q9, q9, q7 vmovl.u8 q11, d22 @@ -1891,11 +1891,11 @@ function sgr_finish_filter2_8bpc_neon, export=1 vrshrn.i32 d8, q4, #8 vrshrn.i32 d9, q5, #8 vmov q8, q10 - vst1.16 {q4}, [r0]! + vst1.16 {q4}, [r0, :128]! ble 5f - vld1.16 {q1}, [r4]! - vld1.32 {q9, q10}, [r3]! + vld1.16 {q1}, [r4, :128]! + vld1.32 {q9, q10}, [r3, :128]! b 4b 5: @@ -1939,10 +1939,10 @@ function sgr_weighted1_8bpc_neon, export=1 mov r8, r5 blt 2f 1: - vld1.8 {d0}, [r2]! - vld1.8 {d16}, [r12]! - vld1.16 {q1}, [r4]! - vld1.16 {q9}, [lr]! + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r12, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [lr, :128]! subs r5, r5, #8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u @@ -1980,8 +1980,8 @@ function sgr_weighted1_8bpc_neon, export=1 b 1b 2: - vld1.8 {d0}, [r2]! - vld1.16 {q1}, [r4]! + vld1.8 {d0}, [r2, :64]! + vld1.16 {q1}, [r4, :128]! subs r5, r5, #8 vshll.u8 q0, d0, #4 // u vsub.i16 q1, q1, q0 // t1 - u @@ -2025,12 +2025,12 @@ function sgr_weighted2_8bpc_neon, export=1 mov r9, r6 blt 2f 1: - vld1.8 {d0}, [r2]! - vld1.8 {d16}, [r11]! - vld1.16 {q1}, [r4]! - vld1.16 {q9}, [r12]! - vld1.16 {q2}, [r5]! - vld1.16 {q10}, [lr]! + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r11, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [r12, :128]! + vld1.16 {q2}, [r5, :128]! + vld1.16 {q10}, [lr, :128]! subs r6, r6, #8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u @@ -2076,9 +2076,9 @@ function sgr_weighted2_8bpc_neon, export=1 b 1b 2: - vld1.8 {d0}, [r2]! - vld1.16 {q1}, [r4]! - vld1.16 {q2}, [r5]! + vld1.8 {d0}, [r2, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r5, :128]! subs r6, r6, #8 vshll.u8 q0, d0, #4 // u vsub.i16 q1, q1, q0 // t1 - u From 9c3681cb23d0f4a41ce98f0c59190b2a8b43852a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 27 Nov 2020 22:47:04 +0200 Subject: [PATCH 074/155] arm32: looprestoration: Remove an unnecessary stack arg load in SGR For the existing 8 bpc support, there's no stack argument to load into r8. --- src/arm/32/looprestoration.S | 1 - 1 file changed, 1 deletion(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index 6cfe0f874e..dbde543b6a 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -1922,7 +1922,6 @@ function sgr_weighted1_8bpc_neon, export=1 push {r4-r9,lr} ldrd r4, r5, [sp, #28] ldrd r6, r7, [sp, #36] - ldr r8, [sp, #44] vdup.16 d31, r7 cmp r6, #2 add r9, r0, r1 From 9b5911f18e2f944c478d214f4b5bed5a5b5b22c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 27 Nov 2020 16:05:11 +0200 Subject: [PATCH 075/155] arm64: looprestoration: Add a missed parameter in a comment Make it consistent with the weighted1 function. 
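For context, a sketch of the full prototype that the corrected comment documents, pieced together from the function comments elsewhere in this series (the authoritative declaration lives in dav1d's C code, where pixel is uint16_t for the 16 bpc build, and parameter names may differ slightly):

    void dav1d_sgr_weighted2_16bpc_neon(uint16_t *dst, const ptrdiff_t dst_stride,
                                        const uint16_t *src, const ptrdiff_t src_stride,
                                        const int16_t *t1, const int16_t *t2,
                                        const int w, const int h,
                                        const int16_t wt[2], const int bitdepth_max);

The trailing bitdepth_max argument exists only in the 16 bpc variant, which matches the preceding patch's note that the 8 bpc weighted functions have no such extra stack argument to load.
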
--- src/arm/64/looprestoration_tmpl.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/64/looprestoration_tmpl.S b/src/arm/64/looprestoration_tmpl.S index 520365b41e..7cdfd6f3f7 100644 --- a/src/arm/64/looprestoration_tmpl.S +++ b/src/arm/64/looprestoration_tmpl.S @@ -454,7 +454,7 @@ endfunc // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, -// const int16_t wt[2]); +// const int16_t wt[2], const int bitdepth_max); function sgr_weighted2_\bpc\()bpc_neon, export=1 .if \bpc == 8 ldr x8, [sp] From 7be62d51457bb2c49e0935bbd386896d8adcab85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 28 Nov 2020 23:23:18 +0200 Subject: [PATCH 076/155] arm: looprestoration16: Fix comments referring to pixels as bytes A number of other similar comments were updated to say pixels when the 16 bpc code was written originally, but these were missed. --- src/arm/32/looprestoration16.S | 2 +- src/arm/64/looprestoration16.S | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index 39c248f8b5..3c0acaa56c 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -133,7 +133,7 @@ function wiener_filter_h_16bpc_neon, export=1 tst r7, #2 // LR_HAVE_RIGHT bne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub r9, r5, #14 lsl r9, r9, #1 diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 18de59a7d3..450413d857 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -126,7 +126,7 @@ function wiener_filter_h_16bpc_neon, export=1 tst w7, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w9, w5, #14 ldr h27, [x2, w9, sxtw #1] @@ -772,7 +772,7 @@ function sgr_box3_h_16bpc_neon, export=1 2: tst w7, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 2 + 1) ldr h30, [x3, w13, sxtw #1] @@ -996,7 +996,7 @@ function sgr_box5_h_16bpc_neon, export=1 // and shift v0/v1 to have 3x the first pixel at the front. dup v2.8h, v0.h[0] dup v18.8h, v16.h[0] - // Move x3 back to account for the last 6 bytes we loaded before, + // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 sub x12, x12, #6 @@ -1008,7 +1008,7 @@ function sgr_box5_h_16bpc_neon, export=1 2: tst w7, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with + // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. 
sub w13, w5, #(2 + 16 - 3 + 1) ldr h30, [x3, w13, sxtw #1] From f7ff74283dad7bb6b2dc8d64e5db2b0807a5f65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 9 Feb 2020 23:39:11 +0200 Subject: [PATCH 077/155] arm32: looprestoration: Prepare for 16 bpc by splitting code to separate files looprestoration_common.S contains functions that can be used as is with one single instantiation of the functions for both 8 and 16 bpc. This file will be built once, regardless of which bitdepths are enabled. looprestoration_tmpl.S contains functions where the source can be shared and templated between 8 and 16 bpc. This will be included by the separate 8/16bpc implementaton files. --- src/arm/32/looprestoration.S | 861 +--------------------------- src/arm/32/looprestoration_common.S | 441 ++++++++++++++ src/arm/32/looprestoration_tmpl.S | 477 +++++++++++++++ 3 files changed, 920 insertions(+), 859 deletions(-) create mode 100644 src/arm/32/looprestoration_common.S create mode 100644 src/arm/32/looprestoration_tmpl.S diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index dbde543b6a..baa3d469e5 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -676,6 +676,8 @@ endfunc #define SUM_STRIDE (384+16) +#include "looprestoration_tmpl.S" + // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, @@ -1237,862 +1239,3 @@ L(box5_variable_shift_tbl): pop {r4-r11,pc} .purgem add5 endfunc - -// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_v_neon, export=1 - push {r4-r9,lr} - ldr r4, [sp, #28] - add r12, r3, #2 // Number of output rows to move back - mov lr, r3 // Number of input rows to move back - add r2, r2, #2 // Actual summed width - mov r7, #(4*SUM_STRIDE) // sumsq stride - mov r8, #(2*SUM_STRIDE) // sum stride - sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride - sub r1, r1, #(2*SUM_STRIDE) // sum -= stride - - tst r4, #4 // LR_HAVE_TOP - beq 0f - // If have top, read from row -2. - sub r5, r0, #(4*SUM_STRIDE) - sub r6, r1, #(2*SUM_STRIDE) - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add r5, r0, #(4*SUM_STRIDE) - add r6, r1, #(2*SUM_STRIDE) -1: - - tst r4, #8 // LR_HAVE_BOTTOM - beq 1f - // LR_HAVE_BOTTOM - add r3, r3, #2 // Sum all h+2 lines with the main loop - add lr, lr, #2 -1: - mov r9, r3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into q8-q13 and q0-q2 taking top - // padding into consideration. 
- tst r4, #4 // LR_HAVE_TOP - vld1.32 {q8, q9}, [r5, :128], r7 - vld1.16 {q0}, [r6, :128], r8 - beq 2f - // LR_HAVE_TOP - vld1.32 {q10, q11}, [r5, :128], r7 - vld1.16 {q1}, [r6, :128], r8 - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - b 3f -2: // !LR_HAVE_TOP - vmov q10, q8 - vmov q11, q9 - vmov q1, q0 - vmov q12, q8 - vmov q13, q9 - vmov q2, q0 - -3: - subs r3, r3, #1 -.macro add3 - vadd.i32 q8, q8, q10 - vadd.i32 q9, q9, q11 - vadd.i16 q0, q0, q1 - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i16 q0, q0, q2 - vst1.32 {q8, q9}, [r0, :128], r7 - vst1.16 {q0}, [r1, :128], r8 -.endm - add3 - vmov q8, q10 - vmov q9, q11 - vmov q0, q1 - vmov q10, q12 - vmov q11, q13 - vmov q1, q2 - ble 4f - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - b 3b - -4: - tst r4, #8 // LR_HAVE_BOTTOM - bne 5f - // !LR_HAVE_BOTTOM - // Produce two more rows, extending the already loaded rows. - add3 - vmov q8, q10 - vmov q9, q11 - vmov q0, q1 - add3 - -5: // End of one vertical slice. - subs r2, r2, #8 - ble 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - mls r5, r7, lr, r5 - mls r6, r8, lr, r6 - // Output pointers - mls r0, r7, r12, r0 - mls r1, r8, r12, r1 - add r0, r0, #32 - add r1, r1, #16 - add r5, r5, #32 - add r6, r6, #16 - mov r3, r9 - b 1b - -0: - pop {r4-r9,pc} -.purgem add3 -endfunc - -// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_v_neon, export=1 - push {r4-r9,lr} - vpush {q5-q7} - ldr r4, [sp, #76] - add r12, r3, #2 // Number of output rows to move back - mov lr, r3 // Number of input rows to move back - add r2, r2, #8 // Actual summed width - mov r7, #(4*SUM_STRIDE) // sumsq stride - mov r8, #(2*SUM_STRIDE) // sum stride - sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride - sub r1, r1, #(2*SUM_STRIDE) // sum -= stride - - tst r4, #4 // LR_HAVE_TOP - beq 0f - // If have top, read from row -2. - sub r5, r0, #(4*SUM_STRIDE) - sub r6, r1, #(2*SUM_STRIDE) - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add r5, r0, #(4*SUM_STRIDE) - add r6, r1, #(2*SUM_STRIDE) -1: - - tst r4, #8 // LR_HAVE_BOTTOM - beq 0f - // LR_HAVE_BOTTOM - add r3, r3, #2 // Handle h+2 lines with the main loop - add lr, lr, #2 - b 1f -0: - // !LR_HAVE_BOTTOM - sub r3, r3, #1 // Handle h-1 lines with the main loop -1: - mov r9, r3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into q6-q15 and q0-q3,q5 taking top - // padding into consideration. 
- tst r4, #4 // LR_HAVE_TOP - vld1.32 {q6, q7}, [r5, :128], r7 - vld1.16 {q0}, [r6, :128], r8 - beq 2f - // LR_HAVE_TOP - vld1.32 {q10, q11}, [r5, :128], r7 - vld1.16 {q2}, [r6, :128], r8 - vmov q8, q6 - vmov q9, q7 - vmov q1, q0 - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - b 3f -2: // !LR_HAVE_TOP - vmov q8, q6 - vmov q9, q7 - vmov q1, q0 - vmov q10, q6 - vmov q11, q7 - vmov q2, q0 - vmov q12, q6 - vmov q13, q7 - vmov q3, q0 - -3: - cmp r3, #0 - beq 4f - vld1.32 {q14, q15}, [r5, :128], r7 - vld1.16 {q5}, [r6, :128], r8 - -3: - // Start of vertical loop - subs r3, r3, #2 -.macro add5 - vadd.i32 q6, q6, q8 - vadd.i32 q7, q7, q9 - vadd.i16 q0, q0, q1 - vadd.i32 q6, q6, q10 - vadd.i32 q7, q7, q11 - vadd.i16 q0, q0, q2 - vadd.i32 q6, q6, q12 - vadd.i32 q7, q7, q13 - vadd.i16 q0, q0, q3 - vadd.i32 q6, q6, q14 - vadd.i32 q7, q7, q15 - vadd.i16 q0, q0, q5 - vst1.32 {q6, q7}, [r0, :128], r7 - vst1.16 {q0}, [r1, :128], r8 -.endm - add5 -.macro shift2 - vmov q6, q10 - vmov q7, q11 - vmov q0, q2 - vmov q8, q12 - vmov q9, q13 - vmov q1, q3 - vmov q10, q14 - vmov q11, q15 - vmov q2, q5 -.endm - shift2 - add r0, r0, r7 - add r1, r1, r8 - ble 5f - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - vld1.32 {q14, q15}, [r5, :128], r7 - vld1.16 {q5}, [r6, :128], r8 - b 3b - -4: - // h == 1, !LR_HAVE_BOTTOM. - // Pad the last row with the only content row, and add. - vmov q14, q12 - vmov q15, q13 - vmov q5, q3 - add5 - shift2 - add r0, r0, r7 - add r1, r1, r8 - add5 - b 6f - -5: - tst r4, #8 // LR_HAVE_BOTTOM - bne 6f - // !LR_HAVE_BOTTOM - cmp r3, #0 - bne 5f - // The intended three edge rows left; output the one at h-2 and - // the past edge one at h. - vld1.32 {q12, q13}, [r5, :128], r7 - vld1.16 {q3}, [r6, :128], r8 - // Pad the past-edge row from the last content row. - vmov q14, q12 - vmov q15, q13 - vmov q5, q3 - add5 - shift2 - add r0, r0, r7 - add r1, r1, r8 - // The last two rows are already padded properly here. - add5 - b 6f - -5: - // r3 == -1, two rows left, output one. - // Pad the last two rows from the mid one. - vmov q12, q10 - vmov q13, q11 - vmov q3, q2 - vmov q14, q10 - vmov q15, q11 - vmov q5, q2 - add5 - add r0, r0, r7 - add r1, r1, r8 - b 6f - -6: // End of one vertical slice. - subs r2, r2, #8 - ble 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - mls r5, r7, lr, r5 - mls r6, r8, lr, r6 - // Output pointers - mls r0, r7, r12, r0 - mls r1, r8, r12, r1 - add r0, r0, #32 - add r1, r1, #16 - add r5, r5, #32 - add r6, r6, #16 - mov r3, r9 - b 1b - -0: - vpop {q5-q7} - pop {r4-r9,pc} -.purgem add5 -endfunc - -// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); -function sgr_calc_ab1_neon, export=1 - push {r4-r5,lr} - vpush {q4-q7} - ldr r4, [sp, #76] - add r3, r3, #2 // h += 2 - vmov.i32 q15, #9 // n - movw r5, #455 - mov lr, #SUM_STRIDE - b sgr_calc_ab_neon -endfunc - -function sgr_calc_ab2_neon, export=1 - push {r4-r5,lr} - vpush {q4-q7} - ldr r4, [sp, #76] - add r3, r3, #3 // h += 3 - asr r3, r3, #1 // h /= 2 - vmov.i32 q15, #25 // n - mov r5, #164 - mov lr, #(2*SUM_STRIDE) -endfunc - -function sgr_calc_ab_neon - movrel r12, X(sgr_x_by_x) - vld1.8 {q8, q9}, [r12, :128]! 
- vmov.i8 q11, #5 - vmov.i8 d10, #55 // idx of last 5 - vld1.8 {q10}, [r12, :128] - vmov.i8 d11, #72 // idx of last 4 - vmov.i8 d12, #101 // idx of last 3 - vmov.i8 d13, #169 // idx of last 2 - vmov.i8 d14, #254 // idx of last 1 - vmov.i8 d15, #32 // elements consumed in first vtbl - add r2, r2, #2 // w += 2 - add r12, r2, #7 - bic r12, r12, #7 // aligned w - sub r12, lr, r12 // increment between rows - vmov.i16 q13, #256 - vdup.32 q12, r4 - vdup.32 q14, r5 // one_by_x - sub r0, r0, #(4*(SUM_STRIDE)) - sub r1, r1, #(2*(SUM_STRIDE)) - mov r4, r2 // backup of w - vsub.i8 q8, q8, q11 - vsub.i8 q9, q9, q11 - vsub.i8 q10, q10, q11 -1: - subs r2, r2, #8 - vld1.32 {q0, q1}, [r0, :128] // a - vld1.16 {q2}, [r1, :128] // b - vmul.i32 q0, q0, q15 // a * n - vmul.i32 q1, q1, q15 // a * n - vmull.u16 q3, d4, d4 // b * b - vmull.u16 q4, d5, d5 // b * b - vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) - vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) - vmul.i32 q0, q0, q12 // p * s - vmul.i32 q1, q1, q12 // p * s - vqshrn.u32 d0, q0, #16 - vqshrn.u32 d1, q1, #16 - vqrshrn.u16 d0, q0, #4 // imin(z, 255) - - vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 - vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 - vtbl.8 d1, {q8, q9}, d0 - vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 - vsub.i8 d9, d0, d15 // indices for vtbx - vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 - vadd.i8 d2, d2, d3 - vtbx.8 d1, {q10}, d9 - vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 - vadd.i8 d6, d6, d7 - vadd.i8 d8, d8, d22 - vadd.i8 d2, d2, d6 - vadd.i8 d1, d1, d8 - vadd.i8 d1, d1, d2 - vmovl.u8 q0, d1 // x - - vmull.u16 q1, d0, d4 // x * BB[i] - vmull.u16 q2, d1, d5 // x * BB[i] - vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x - vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x - vrshr.s32 q1, q1, #12 // AA[i] - vrshr.s32 q2, q2, #12 // AA[i] - vsub.i16 q0, q13, q0 // 256 - x - - vst1.32 {q1, q2}, [r0, :128]! - vst1.16 {q0}, [r1, :128]! - bgt 1b - - subs r3, r3, #1 - ble 0f - add r0, r0, r12, lsl #2 - add r1, r1, r12, lsl #1 - mov r2, r4 - b 1b -0: - vpop {q4-q7} - pop {r4-r5,pc} -endfunc - -#define FILTER_OUT_STRIDE 384 - -// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter1_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] - ldr r6, [sp, #108] - sub r7, r3, #(4*SUM_STRIDE) - add r8, r3, #(4*SUM_STRIDE) - sub r9, r4, #(2*SUM_STRIDE) - add r10, r4, #(2*SUM_STRIDE) - mov r11, #SUM_STRIDE - mov r12, #FILTER_OUT_STRIDE - add lr, r5, #3 - bic lr, lr, #3 // Aligned width - sub r2, r2, lr - sub r12, r12, lr - sub r11, r11, lr - sub r11, r11, #4 // We read 4 extra elements from both a and b - mov lr, r5 - vmov.i16 q14, #3 - vmov.i32 q15, #3 -1: - vld1.16 {q0}, [r9, :128]! - vld1.16 {q1}, [r4, :128]! - vld1.16 {q2}, [r10, :128]! - vld1.32 {q8, q9}, [r7, :128]! - vld1.32 {q10, q11}, [r3, :128]! - vld1.32 {q12, q13}, [r8, :128]! 
- -2: - subs r5, r5, #4 - vext.8 d6, d0, d1, #2 // -stride - vext.8 d7, d2, d3, #2 // 0 - vext.8 d8, d4, d5, #2 // +stride - vext.8 d9, d0, d1, #4 // +1-stride - vext.8 d10, d2, d3, #4 // +1 - vext.8 d11, d4, d5, #4 // +1+stride - vadd.i16 d2, d2, d6 // -1, -stride - vadd.i16 d7, d7, d8 // 0, +stride - vadd.i16 d0, d0, d9 // -1-stride, +1-stride - vadd.i16 d2, d2, d7 - vadd.i16 d4, d4, d11 // -1+stride, +1+stride - vadd.i16 d2, d2, d10 // +1 - vadd.i16 d0, d0, d4 - - vext.8 q3, q8, q9, #4 // -stride - vshl.i16 d2, d2, #2 - vext.8 q4, q8, q9, #8 // +1-stride - vext.8 q5, q10, q11, #4 // 0 - vext.8 q6, q10, q11, #8 // +1 - vmla.i16 d2, d0, d28 // * 3 -> a - vadd.i32 q3, q3, q10 // -stride, -1 - vadd.i32 q8, q8, q4 // -1-stride, +1-stride - vadd.i32 q5, q5, q6 // 0, +1 - vadd.i32 q8, q8, q12 // -1+stride - vadd.i32 q3, q3, q5 - vext.8 q7, q12, q13, #4 // +stride - vext.8 q10, q12, q13, #8 // +1+stride - vld1.32 {d24[0]}, [r1, :32]! // src - vadd.i32 q3, q3, q7 // +stride - vadd.i32 q8, q8, q10 // +1+stride - vshl.i32 q3, q3, #2 - vmla.i32 q3, q8, q15 // * 3 -> b - vmovl.u8 q12, d24 // src - vmov d0, d1 - vmlal.u16 q3, d2, d24 // b + a * src - vmov d2, d3 - vrshrn.i32 d6, q3, #9 - vmov d4, d5 - vst1.16 {d6}, [r0]! - - ble 3f - vmov q8, q9 - vmov q10, q11 - vmov q12, q13 - vld1.16 {d1}, [r9, :64]! - vld1.16 {d3}, [r4, :64]! - vld1.16 {d5}, [r10, :64]! - vld1.32 {q9}, [r7, :128]! - vld1.32 {q11}, [r3, :128]! - vld1.32 {q13}, [r8, :128]! - b 2b - -3: - subs r6, r6, #1 - ble 0f - mov r5, lr - add r0, r0, r12, lsl #1 - add r1, r1, r2 - add r3, r3, r11, lsl #2 - add r7, r7, r11, lsl #2 - add r8, r8, r11, lsl #2 - add r4, r4, r11, lsl #1 - add r9, r9, r11, lsl #1 - add r10, r10, r11, lsl #1 - b 1b -0: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc - -// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter2_8bpc_neon, export=1 - push {r4-r11,lr} - vpush {q4-q7} - ldrd r4, r5, [sp, #100] - ldr r6, [sp, #108] - add r7, r3, #(4*(SUM_STRIDE)) - sub r3, r3, #(4*(SUM_STRIDE)) - add r8, r4, #(2*(SUM_STRIDE)) - sub r4, r4, #(2*(SUM_STRIDE)) - mov r9, #(2*SUM_STRIDE) - mov r10, #FILTER_OUT_STRIDE - add r11, r5, #7 - bic r11, r11, #7 // Aligned width - sub r2, r2, r11 - sub r10, r10, r11 - sub r9, r9, r11 - sub r9, r9, #4 // We read 4 extra elements from a - sub r12, r9, #4 // We read 8 extra elements from b - mov lr, r5 - -1: - vld1.16 {q0, q1}, [r4, :128]! - vld1.16 {q2, q3}, [r8, :128]! - vld1.32 {q8, q9}, [r3, :128]! - vld1.32 {q11, q12}, [r7, :128]! - vld1.32 {q10}, [r3, :128]! - vld1.32 {q13}, [r7, :128]! 
- -2: - vmov.i16 q14, #5 - vmov.i16 q15, #6 - subs r5, r5, #8 - vext.8 q4, q0, q1, #4 // +1-stride - vext.8 q5, q2, q3, #4 // +1+stride - vext.8 q6, q0, q1, #2 // -stride - vext.8 q7, q2, q3, #2 // +stride - vadd.i16 q0, q0, q4 // -1-stride, +1-stride - vadd.i16 q5, q2, q5 // -1+stride, +1+stride - vadd.i16 q2, q6, q7 // -stride, +stride - vadd.i16 q0, q0, q5 - - vext.8 q4, q8, q9, #8 // +1-stride - vext.8 q5, q9, q10, #8 - vext.8 q6, q11, q12, #8 // +1+stride - vext.8 q7, q12, q13, #8 - vmul.i16 q0, q0, q14 // * 5 - vmla.i16 q0, q2, q15 // * 6 - vadd.i32 q4, q4, q8 // -1-stride, +1-stride - vadd.i32 q5, q5, q9 - vadd.i32 q6, q6, q11 // -1+stride, +1+stride - vadd.i32 q7, q7, q12 - vadd.i32 q4, q4, q6 - vadd.i32 q5, q5, q7 - vext.8 q6, q8, q9, #4 // -stride - vext.8 q7, q9, q10, #4 - vext.8 q8, q11, q12, #4 // +stride - vext.8 q11, q12, q13, #4 - - vld1.8 {d4}, [r1, :64]! - - vmov.i32 q14, #5 - vmov.i32 q15, #6 - - vadd.i32 q6, q6, q8 // -stride, +stride - vadd.i32 q7, q7, q11 - vmul.i32 q4, q4, q14 // * 5 - vmla.i32 q4, q6, q15 // * 6 - vmul.i32 q5, q5, q14 // * 5 - vmla.i32 q5, q7, q15 // * 6 - - vmovl.u8 q2, d4 - vmlal.u16 q4, d0, d4 // b + a * src - vmlal.u16 q5, d1, d5 // b + a * src - vmov q0, q1 - vrshrn.i32 d8, q4, #9 - vrshrn.i32 d9, q5, #9 - vmov q2, q3 - vst1.16 {q4}, [r0, :128]! - - ble 3f - vmov q8, q10 - vmov q11, q13 - vld1.16 {q1}, [r4, :128]! - vld1.16 {q3}, [r8, :128]! - vld1.32 {q9, q10}, [r3, :128]! - vld1.32 {q12, q13}, [r7, :128]! - b 2b - -3: - subs r6, r6, #1 - ble 0f - mov r5, lr - add r0, r0, r10, lsl #1 - add r1, r1, r2 - add r3, r3, r9, lsl #2 - add r7, r7, r9, lsl #2 - add r4, r4, r12, lsl #1 - add r8, r8, r12, lsl #1 - - vld1.32 {q8, q9}, [r3, :128]! - vld1.16 {q0, q1}, [r4, :128]! - vld1.32 {q10}, [r3, :128]! - - vmov.i16 q12, #5 - vmov.i16 q13, #6 - -4: - subs r5, r5, #8 - vext.8 q3, q0, q1, #4 // +1 - vext.8 q2, q0, q1, #2 // 0 - vadd.i16 q0, q0, q3 // -1, +1 - - vext.8 q4, q8, q9, #4 // 0 - vext.8 q5, q9, q10, #4 - vext.8 q6, q8, q9, #8 // +1 - vext.8 q7, q9, q10, #8 - vmul.i16 q2, q2, q13 // * 6 - vmla.i16 q2, q0, q12 // * 5 -> a - vld1.8 {d22}, [r1, :64]! - vadd.i32 q8, q8, q6 // -1, +1 - vadd.i32 q9, q9, q7 - vmovl.u8 q11, d22 - vmul.i32 q4, q4, q15 // * 6 - vmla.i32 q4, q8, q14 // * 5 -> b - vmul.i32 q5, q5, q15 // * 6 - vmla.i32 q5, q9, q14 // * 5 -> b - - vmlal.u16 q4, d4, d22 // b + a * src - vmlal.u16 q5, d5, d23 - vmov q0, q1 - vrshrn.i32 d8, q4, #8 - vrshrn.i32 d9, q5, #8 - vmov q8, q10 - vst1.16 {q4}, [r0, :128]! - - ble 5f - vld1.16 {q1}, [r4, :128]! - vld1.32 {q9, q10}, [r3, :128]! - b 4b - -5: - subs r6, r6, #1 - ble 0f - mov r5, lr - sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started - sub r4, r4, r11, lsl #1 - add r0, r0, r10, lsl #1 - add r1, r1, r2 - sub r3, r3, #16 - sub r4, r4, #16 - b 1b -0: - vpop {q4-q7} - pop {r4-r11,pc} -endfunc - -// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int w, const int h, -// const int wt); -function sgr_weighted1_8bpc_neon, export=1 - push {r4-r9,lr} - ldrd r4, r5, [sp, #28] - ldrd r6, r7, [sp, #36] - vdup.16 d31, r7 - cmp r6, #2 - add r9, r0, r1 - add r12, r2, r3 - add lr, r4, #2*FILTER_OUT_STRIDE - mov r7, #(4*FILTER_OUT_STRIDE) - lsl r1, r1, #1 - lsl r3, r3, #1 - add r8, r5, #7 - bic r8, r8, #7 // Aligned width - sub r1, r1, r8 - sub r3, r3, r8 - sub r7, r7, r8, lsl #1 - mov r8, r5 - blt 2f -1: - vld1.8 {d0}, [r2, :64]! - vld1.8 {d16}, [r12, :64]! - vld1.16 {q1}, [r4, :128]! 
- vld1.16 {q9}, [lr, :128]! - subs r5, r5, #8 - vshll.u8 q0, d0, #4 // u - vshll.u8 q8, d16, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q9, q9, q8 // t1 - u - vshll.u16 q2, d0, #7 // u << 7 - vshll.u16 q3, d1, #7 // u << 7 - vshll.u16 q10, d16, #7 // u << 7 - vshll.u16 q11, d17, #7 // u << 7 - vmlal.s16 q2, d2, d31 // v - vmlal.s16 q3, d3, d31 // v - vmlal.s16 q10, d18, d31 // v - vmlal.s16 q11, d19, d31 // v - vrshrn.i32 d4, q2, #11 - vrshrn.i32 d5, q3, #11 - vrshrn.i32 d20, q10, #11 - vrshrn.i32 d21, q11, #11 - vqmovun.s16 d4, q2 - vqmovun.s16 d20, q10 - vst1.8 {d4}, [r0]! - vst1.8 {d20}, [r9]! - bgt 1b - - sub r6, r6, #2 - cmp r6, #1 - blt 0f - mov r5, r8 - add r0, r0, r1 - add r9, r9, r1 - add r2, r2, r3 - add r12, r12, r3 - add r4, r4, r7 - add lr, lr, r7 - beq 2f - b 1b - -2: - vld1.8 {d0}, [r2, :64]! - vld1.16 {q1}, [r4, :128]! - subs r5, r5, #8 - vshll.u8 q0, d0, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vshll.u16 q2, d0, #7 // u << 7 - vshll.u16 q3, d1, #7 // u << 7 - vmlal.s16 q2, d2, d31 // v - vmlal.s16 q3, d3, d31 // v - vrshrn.i32 d4, q2, #11 - vrshrn.i32 d5, q3, #11 - vqmovun.s16 d2, q2 - vst1.8 {d2}, [r0]! - bgt 2b -0: - pop {r4-r9,pc} -endfunc - -// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int16_t *t2, -// const int w, const int h, -// const int16_t wt[2]); -function sgr_weighted2_8bpc_neon, export=1 - push {r4-r11,lr} - ldrd r4, r5, [sp, #36] - ldrd r6, r7, [sp, #44] - ldr r8, [sp, #52] - cmp r7, #2 - add r10, r0, r1 - add r11, r2, r3 - add r12, r4, #2*FILTER_OUT_STRIDE - add lr, r5, #2*FILTER_OUT_STRIDE - vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] - mov r8, #4*FILTER_OUT_STRIDE - lsl r1, r1, #1 - lsl r3, r3, #1 - add r9, r6, #7 - bic r9, r9, #7 // Aligned width - sub r1, r1, r9 - sub r3, r3, r9 - sub r8, r8, r9, lsl #1 - mov r9, r6 - blt 2f -1: - vld1.8 {d0}, [r2, :64]! - vld1.8 {d16}, [r11, :64]! - vld1.16 {q1}, [r4, :128]! - vld1.16 {q9}, [r12, :128]! - vld1.16 {q2}, [r5, :128]! - vld1.16 {q10}, [lr, :128]! - subs r6, r6, #8 - vshll.u8 q0, d0, #4 // u - vshll.u8 q8, d16, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q2, q2, q0 // t2 - u - vsub.i16 q9, q9, q8 // t1 - u - vsub.i16 q10, q10, q8 // t2 - u - vshll.u16 q3, d0, #7 // u << 7 - vshll.u16 q0, d1, #7 // u << 7 - vshll.u16 q11, d16, #7 // u << 7 - vshll.u16 q8, d17, #7 // u << 7 - vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) - vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) - vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) - vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) - vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) - vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) - vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) - vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) - vrshrn.i32 d6, q3, #11 - vrshrn.i32 d7, q0, #11 - vrshrn.i32 d22, q11, #11 - vrshrn.i32 d23, q8, #11 - vqmovun.s16 d6, q3 - vqmovun.s16 d22, q11 - vst1.8 {d6}, [r0]! - vst1.8 {d22}, [r10]! - bgt 1b - - subs r7, r7, #2 - cmp r7, #1 - blt 0f - mov r6, r9 - add r0, r0, r1 - add r10, r10, r1 - add r2, r2, r3 - add r11, r11, r3 - add r4, r4, r8 - add r12, r12, r8 - add r5, r5, r8 - add lr, lr, r8 - beq 2f - b 1b - -2: - vld1.8 {d0}, [r2, :64]! - vld1.16 {q1}, [r4, :128]! - vld1.16 {q2}, [r5, :128]! 
- subs r6, r6, #8 - vshll.u8 q0, d0, #4 // u - vsub.i16 q1, q1, q0 // t1 - u - vsub.i16 q2, q2, q0 // t2 - u - vshll.u16 q3, d0, #7 // u << 7 - vshll.u16 q0, d1, #7 // u << 7 - vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) - vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) - vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) - vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) - vrshrn.i32 d6, q3, #11 - vrshrn.i32 d7, q0, #11 - vqmovun.s16 d6, q3 - vst1.8 {d6}, [r0]! - bgt 1b -0: - pop {r4-r11,pc} -endfunc diff --git a/src/arm/32/looprestoration_common.S b/src/arm/32/looprestoration_common.S new file mode 100644 index 0000000000..f8fbbbe960 --- /dev/null +++ b/src/arm/32/looprestoration_common.S @@ -0,0 +1,441 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define SUM_STRIDE (384+16) + +// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_v_neon, export=1 + push {r4-r9,lr} + ldr r4, [sp, #28] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #2 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 1f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Sum all h+2 lines with the main loop + add lr, lr, #2 +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q8-q13 and q0-q2 taking top + // padding into consideration. 
+ tst r4, #4 // LR_HAVE_TOP + vld1.32 {q8, q9}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q1}, [r6, :128], r8 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q10, q8 + vmov q11, q9 + vmov q1, q0 + vmov q12, q8 + vmov q13, q9 + vmov q2, q0 + +3: + subs r3, r3, #1 +.macro add3 + vadd.i32 q8, q8, q10 + vadd.i32 q9, q9, q11 + vadd.i16 q0, q0, q1 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i16 q0, q0, q2 + vst1.32 {q8, q9}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + vmov q10, q12 + vmov q11, q13 + vmov q1, q2 + ble 4f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + b 3b + +4: + tst r4, #8 // LR_HAVE_BOTTOM + bne 5f + // !LR_HAVE_BOTTOM + // Produce two more rows, extending the already loaded rows. + add3 + vmov q8, q10 + vmov q9, q11 + vmov q0, q1 + add3 + +5: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + pop {r4-r9,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_v_neon, export=1 + push {r4-r9,lr} + vpush {q5-q7} + ldr r4, [sp, #76] + add r12, r3, #2 // Number of output rows to move back + mov lr, r3 // Number of input rows to move back + add r2, r2, #8 // Actual summed width + mov r7, #(4*SUM_STRIDE) // sumsq stride + mov r8, #(2*SUM_STRIDE) // sum stride + sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride + sub r1, r1, #(2*SUM_STRIDE) // sum -= stride + + tst r4, #4 // LR_HAVE_TOP + beq 0f + // If have top, read from row -2. + sub r5, r0, #(4*SUM_STRIDE) + sub r6, r1, #(2*SUM_STRIDE) + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_TOP + // If we don't have top, read from row 0 even if + // we start writing to row -1. + add r5, r0, #(4*SUM_STRIDE) + add r6, r1, #(2*SUM_STRIDE) +1: + + tst r4, #8 // LR_HAVE_BOTTOM + beq 0f + // LR_HAVE_BOTTOM + add r3, r3, #2 // Handle h+2 lines with the main loop + add lr, lr, #2 + b 1f +0: + // !LR_HAVE_BOTTOM + sub r3, r3, #1 // Handle h-1 lines with the main loop +1: + mov r9, r3 // Backup of h for next loops + +1: + // Start of horizontal loop; start one vertical filter slice. + // Start loading rows into q6-q15 and q0-q3,q5 taking top + // padding into consideration. 
+ tst r4, #4 // LR_HAVE_TOP + vld1.32 {q6, q7}, [r5, :128], r7 + vld1.16 {q0}, [r6, :128], r8 + beq 2f + // LR_HAVE_TOP + vld1.32 {q10, q11}, [r5, :128], r7 + vld1.16 {q2}, [r6, :128], r8 + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + b 3f +2: // !LR_HAVE_TOP + vmov q8, q6 + vmov q9, q7 + vmov q1, q0 + vmov q10, q6 + vmov q11, q7 + vmov q2, q0 + vmov q12, q6 + vmov q13, q7 + vmov q3, q0 + +3: + cmp r3, #0 + beq 4f + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + +3: + // Start of vertical loop + subs r3, r3, #2 +.macro add5 + vadd.i32 q6, q6, q8 + vadd.i32 q7, q7, q9 + vadd.i16 q0, q0, q1 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i16 q0, q0, q2 + vadd.i32 q6, q6, q12 + vadd.i32 q7, q7, q13 + vadd.i16 q0, q0, q3 + vadd.i32 q6, q6, q14 + vadd.i32 q7, q7, q15 + vadd.i16 q0, q0, q5 + vst1.32 {q6, q7}, [r0, :128], r7 + vst1.16 {q0}, [r1, :128], r8 +.endm + add5 +.macro shift2 + vmov q6, q10 + vmov q7, q11 + vmov q0, q2 + vmov q8, q12 + vmov q9, q13 + vmov q1, q3 + vmov q10, q14 + vmov q11, q15 + vmov q2, q5 +.endm + shift2 + add r0, r0, r7 + add r1, r1, r8 + ble 5f + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + vld1.32 {q14, q15}, [r5, :128], r7 + vld1.16 {q5}, [r6, :128], r8 + b 3b + +4: + // h == 1, !LR_HAVE_BOTTOM. + // Pad the last row with the only content row, and add. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + add5 + b 6f + +5: + tst r4, #8 // LR_HAVE_BOTTOM + bne 6f + // !LR_HAVE_BOTTOM + cmp r3, #0 + bne 5f + // The intended three edge rows left; output the one at h-2 and + // the past edge one at h. + vld1.32 {q12, q13}, [r5, :128], r7 + vld1.16 {q3}, [r6, :128], r8 + // Pad the past-edge row from the last content row. + vmov q14, q12 + vmov q15, q13 + vmov q5, q3 + add5 + shift2 + add r0, r0, r7 + add r1, r1, r8 + // The last two rows are already padded properly here. + add5 + b 6f + +5: + // r3 == -1, two rows left, output one. + // Pad the last two rows from the mid one. + vmov q12, q10 + vmov q13, q11 + vmov q3, q2 + vmov q14, q10 + vmov q15, q11 + vmov q5, q2 + add5 + add r0, r0, r7 + add r1, r1, r8 + b 6f + +6: // End of one vertical slice. + subs r2, r2, #8 + ble 0f + // Move pointers back up to the top and loop horizontally. + // Input pointers + mls r5, r7, lr, r5 + mls r6, r8, lr, r6 + // Output pointers + mls r0, r7, r12, r0 + mls r1, r8, r12, r1 + add r0, r0, #32 + add r1, r1, #16 + add r5, r5, #32 + add r6, r6, #16 + mov r3, r9 + b 1b + +0: + vpop {q5-q7} + pop {r4-r9,pc} +.purgem add5 +endfunc + +// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, +// const int w, const int h, const int strength); +function sgr_calc_ab1_neon, export=1 + push {r4-r5,lr} + vpush {q4-q7} + ldr r4, [sp, #76] + add r3, r3, #2 // h += 2 + vmov.i32 q15, #9 // n + movw r5, #455 + mov lr, #SUM_STRIDE + b sgr_calc_ab_neon +endfunc + +function sgr_calc_ab2_neon, export=1 + push {r4-r5,lr} + vpush {q4-q7} + ldr r4, [sp, #76] + add r3, r3, #3 // h += 3 + asr r3, r3, #1 // h /= 2 + vmov.i32 q15, #25 // n + mov r5, #164 + mov lr, #(2*SUM_STRIDE) +endfunc + +function sgr_calc_ab_neon + movrel r12, X(sgr_x_by_x) + vld1.8 {q8, q9}, [r12, :128]! 
+ vmov.i8 q11, #5 + vmov.i8 d10, #55 // idx of last 5 + vld1.8 {q10}, [r12, :128] + vmov.i8 d11, #72 // idx of last 4 + vmov.i8 d12, #101 // idx of last 3 + vmov.i8 d13, #169 // idx of last 2 + vmov.i8 d14, #254 // idx of last 1 + vmov.i8 d15, #32 // elements consumed in first vtbl + add r2, r2, #2 // w += 2 + add r12, r2, #7 + bic r12, r12, #7 // aligned w + sub r12, lr, r12 // increment between rows + vmov.i16 q13, #256 + vdup.32 q12, r4 + vdup.32 q14, r5 // one_by_x + sub r0, r0, #(4*(SUM_STRIDE)) + sub r1, r1, #(2*(SUM_STRIDE)) + mov r4, r2 // backup of w + vsub.i8 q8, q8, q11 + vsub.i8 q9, q9, q11 + vsub.i8 q10, q10, q11 +1: + subs r2, r2, #8 + vld1.32 {q0, q1}, [r0, :128] // a + vld1.16 {q2}, [r1, :128] // b + vmul.i32 q0, q0, q15 // a * n + vmul.i32 q1, q1, q15 // a * n + vmull.u16 q3, d4, d4 // b * b + vmull.u16 q4, d5, d5 // b * b + vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) + vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) + vmul.i32 q0, q0, q12 // p * s + vmul.i32 q1, q1, q12 // p * s + vqshrn.u32 d0, q0, #16 + vqshrn.u32 d1, q1, #16 + vqrshrn.u16 d0, q0, #4 // imin(z, 255) + + vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 + vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 + vtbl.8 d1, {q8, q9}, d0 + vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 + vsub.i8 d9, d0, d15 // indices for vtbx + vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 + vadd.i8 d2, d2, d3 + vtbx.8 d1, {q10}, d9 + vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 + vadd.i8 d6, d6, d7 + vadd.i8 d8, d8, d22 + vadd.i8 d2, d2, d6 + vadd.i8 d1, d1, d8 + vadd.i8 d1, d1, d2 + vmovl.u8 q0, d1 // x + + vmull.u16 q1, d0, d4 // x * BB[i] + vmull.u16 q2, d1, d5 // x * BB[i] + vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x + vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x + vrshr.s32 q1, q1, #12 // AA[i] + vrshr.s32 q2, q2, #12 // AA[i] + vsub.i16 q0, q13, q0 // 256 - x + + vst1.32 {q1, q2}, [r0, :128]! + vst1.16 {q0}, [r1, :128]! + bgt 1b + + subs r3, r3, #1 + ble 0f + add r0, r0, r12, lsl #2 + add r1, r1, r12, lsl #1 + mov r2, r4 + b 1b +0: + vpop {q4-q7} + pop {r4-r5,pc} +endfunc diff --git a/src/arm/32/looprestoration_tmpl.S b/src/arm/32/looprestoration_tmpl.S new file mode 100644 index 0000000000..0686820b04 --- /dev/null +++ b/src/arm/32/looprestoration_tmpl.S @@ -0,0 +1,477 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" + +#define FILTER_OUT_STRIDE 384 + +// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter1_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + sub r7, r3, #(4*SUM_STRIDE) + add r8, r3, #(4*SUM_STRIDE) + sub r9, r4, #(2*SUM_STRIDE) + add r10, r4, #(2*SUM_STRIDE) + mov r11, #SUM_STRIDE + mov r12, #FILTER_OUT_STRIDE + add lr, r5, #3 + bic lr, lr, #3 // Aligned width + sub r2, r2, lr + sub r12, r12, lr + sub r11, r11, lr + sub r11, r11, #4 // We read 4 extra elements from both a and b + mov lr, r5 + vmov.i16 q14, #3 + vmov.i32 q15, #3 +1: + vld1.16 {q0}, [r9, :128]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r10, :128]! + vld1.32 {q8, q9}, [r7, :128]! + vld1.32 {q10, q11}, [r3, :128]! + vld1.32 {q12, q13}, [r8, :128]! + +2: + subs r5, r5, #4 + vext.8 d6, d0, d1, #2 // -stride + vext.8 d7, d2, d3, #2 // 0 + vext.8 d8, d4, d5, #2 // +stride + vext.8 d9, d0, d1, #4 // +1-stride + vext.8 d10, d2, d3, #4 // +1 + vext.8 d11, d4, d5, #4 // +1+stride + vadd.i16 d2, d2, d6 // -1, -stride + vadd.i16 d7, d7, d8 // 0, +stride + vadd.i16 d0, d0, d9 // -1-stride, +1-stride + vadd.i16 d2, d2, d7 + vadd.i16 d4, d4, d11 // -1+stride, +1+stride + vadd.i16 d2, d2, d10 // +1 + vadd.i16 d0, d0, d4 + + vext.8 q3, q8, q9, #4 // -stride + vshl.i16 d2, d2, #2 + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q10, q11, #4 // 0 + vext.8 q6, q10, q11, #8 // +1 + vmla.i16 d2, d0, d28 // * 3 -> a + vadd.i32 q3, q3, q10 // -stride, -1 + vadd.i32 q8, q8, q4 // -1-stride, +1-stride + vadd.i32 q5, q5, q6 // 0, +1 + vadd.i32 q8, q8, q12 // -1+stride + vadd.i32 q3, q3, q5 + vext.8 q7, q12, q13, #4 // +stride + vext.8 q10, q12, q13, #8 // +1+stride + vld1.32 {d24[0]}, [r1, :32]! // src + vadd.i32 q3, q3, q7 // +stride + vadd.i32 q8, q8, q10 // +1+stride + vshl.i32 q3, q3, #2 + vmla.i32 q3, q8, q15 // * 3 -> b + vmovl.u8 q12, d24 // src + vmov d0, d1 + vmlal.u16 q3, d2, d24 // b + a * src + vmov d2, d3 + vrshrn.i32 d6, q3, #9 + vmov d4, d5 + vst1.16 {d6}, [r0]! + + ble 3f + vmov q8, q9 + vmov q10, q11 + vmov q12, q13 + vld1.16 {d1}, [r9, :64]! + vld1.16 {d3}, [r4, :64]! + vld1.16 {d5}, [r10, :64]! + vld1.32 {q9}, [r7, :128]! + vld1.32 {q11}, [r3, :128]! + vld1.32 {q13}, [r8, :128]! 
+ b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r12, lsl #1 + add r1, r1, r2 + add r3, r3, r11, lsl #2 + add r7, r7, r11, lsl #2 + add r8, r8, r11, lsl #2 + add r4, r4, r11, lsl #1 + add r9, r9, r11, lsl #1 + add r10, r10, r11, lsl #1 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, +// const pixel *src, const ptrdiff_t stride, +// const int32_t *a, const int16_t *b, +// const int w, const int h); +function sgr_finish_filter2_8bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r6, [sp, #108] + add r7, r3, #(4*(SUM_STRIDE)) + sub r3, r3, #(4*(SUM_STRIDE)) + add r8, r4, #(2*(SUM_STRIDE)) + sub r4, r4, #(2*(SUM_STRIDE)) + mov r9, #(2*SUM_STRIDE) + mov r10, #FILTER_OUT_STRIDE + add r11, r5, #7 + bic r11, r11, #7 // Aligned width + sub r2, r2, r11 + sub r10, r10, r11 + sub r9, r9, r11 + sub r9, r9, #4 // We read 4 extra elements from a + sub r12, r9, #4 // We read 8 extra elements from b + mov lr, r5 + +1: + vld1.16 {q0, q1}, [r4, :128]! + vld1.16 {q2, q3}, [r8, :128]! + vld1.32 {q8, q9}, [r3, :128]! + vld1.32 {q11, q12}, [r7, :128]! + vld1.32 {q10}, [r3, :128]! + vld1.32 {q13}, [r7, :128]! + +2: + vmov.i16 q14, #5 + vmov.i16 q15, #6 + subs r5, r5, #8 + vext.8 q4, q0, q1, #4 // +1-stride + vext.8 q5, q2, q3, #4 // +1+stride + vext.8 q6, q0, q1, #2 // -stride + vext.8 q7, q2, q3, #2 // +stride + vadd.i16 q0, q0, q4 // -1-stride, +1-stride + vadd.i16 q5, q2, q5 // -1+stride, +1+stride + vadd.i16 q2, q6, q7 // -stride, +stride + vadd.i16 q0, q0, q5 + + vext.8 q4, q8, q9, #8 // +1-stride + vext.8 q5, q9, q10, #8 + vext.8 q6, q11, q12, #8 // +1+stride + vext.8 q7, q12, q13, #8 + vmul.i16 q0, q0, q14 // * 5 + vmla.i16 q0, q2, q15 // * 6 + vadd.i32 q4, q4, q8 // -1-stride, +1-stride + vadd.i32 q5, q5, q9 + vadd.i32 q6, q6, q11 // -1+stride, +1+stride + vadd.i32 q7, q7, q12 + vadd.i32 q4, q4, q6 + vadd.i32 q5, q5, q7 + vext.8 q6, q8, q9, #4 // -stride + vext.8 q7, q9, q10, #4 + vext.8 q8, q11, q12, #4 // +stride + vext.8 q11, q12, q13, #4 + + vld1.8 {d4}, [r1, :64]! + + vmov.i32 q14, #5 + vmov.i32 q15, #6 + + vadd.i32 q6, q6, q8 // -stride, +stride + vadd.i32 q7, q7, q11 + vmul.i32 q4, q4, q14 // * 5 + vmla.i32 q4, q6, q15 // * 6 + vmul.i32 q5, q5, q14 // * 5 + vmla.i32 q5, q7, q15 // * 6 + + vmovl.u8 q2, d4 + vmlal.u16 q4, d0, d4 // b + a * src + vmlal.u16 q5, d1, d5 // b + a * src + vmov q0, q1 + vrshrn.i32 d8, q4, #9 + vrshrn.i32 d9, q5, #9 + vmov q2, q3 + vst1.16 {q4}, [r0, :128]! + + ble 3f + vmov q8, q10 + vmov q11, q13 + vld1.16 {q1}, [r4, :128]! + vld1.16 {q3}, [r8, :128]! + vld1.32 {q9, q10}, [r3, :128]! + vld1.32 {q12, q13}, [r7, :128]! + b 2b + +3: + subs r6, r6, #1 + ble 0f + mov r5, lr + add r0, r0, r10, lsl #1 + add r1, r1, r2 + add r3, r3, r9, lsl #2 + add r7, r7, r9, lsl #2 + add r4, r4, r12, lsl #1 + add r8, r8, r12, lsl #1 + + vld1.32 {q8, q9}, [r3, :128]! + vld1.16 {q0, q1}, [r4, :128]! + vld1.32 {q10}, [r3, :128]! + + vmov.i16 q12, #5 + vmov.i16 q13, #6 + +4: + subs r5, r5, #8 + vext.8 q3, q0, q1, #4 // +1 + vext.8 q2, q0, q1, #2 // 0 + vadd.i16 q0, q0, q3 // -1, +1 + + vext.8 q4, q8, q9, #4 // 0 + vext.8 q5, q9, q10, #4 + vext.8 q6, q8, q9, #8 // +1 + vext.8 q7, q9, q10, #8 + vmul.i16 q2, q2, q13 // * 6 + vmla.i16 q2, q0, q12 // * 5 -> a + vld1.8 {d22}, [r1, :64]! 
+ vadd.i32 q8, q8, q6 // -1, +1 + vadd.i32 q9, q9, q7 + vmovl.u8 q11, d22 + vmul.i32 q4, q4, q15 // * 6 + vmla.i32 q4, q8, q14 // * 5 -> b + vmul.i32 q5, q5, q15 // * 6 + vmla.i32 q5, q9, q14 // * 5 -> b + + vmlal.u16 q4, d4, d22 // b + a * src + vmlal.u16 q5, d5, d23 + vmov q0, q1 + vrshrn.i32 d8, q4, #8 + vrshrn.i32 d9, q5, #8 + vmov q8, q10 + vst1.16 {q4}, [r0, :128]! + + ble 5f + vld1.16 {q1}, [r4, :128]! + vld1.32 {q9, q10}, [r3, :128]! + b 4b + +5: + subs r6, r6, #1 + ble 0f + mov r5, lr + sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started + sub r4, r4, r11, lsl #1 + add r0, r0, r10, lsl #1 + add r1, r1, r2 + sub r3, r3, #16 + sub r4, r4, #16 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int w, const int h, +// const int wt); +function sgr_weighted1_8bpc_neon, export=1 + push {r4-r9,lr} + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] + vdup.16 d31, r7 + cmp r6, #2 + add r9, r0, r1 + add r12, r2, r3 + add lr, r4, #2*FILTER_OUT_STRIDE + mov r7, #(4*FILTER_OUT_STRIDE) + lsl r1, r1, #1 + lsl r3, r3, #1 + add r8, r5, #7 + bic r8, r8, #7 // Aligned width + sub r1, r1, r8 + sub r3, r3, r8 + sub r7, r7, r8, lsl #1 + mov r8, r5 + blt 2f +1: + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r12, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [lr, :128]! + subs r5, r5, #8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q9, q9, q8 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vshll.u16 q10, d16, #7 // u << 7 + vshll.u16 q11, d17, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vmlal.s16 q10, d18, d31 // v + vmlal.s16 q11, d19, d31 // v + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vrshrn.i32 d20, q10, #11 + vrshrn.i32 d21, q11, #11 + vqmovun.s16 d4, q2 + vqmovun.s16 d20, q10 + vst1.8 {d4}, [r0]! + vst1.8 {d20}, [r9]! + bgt 1b + + sub r6, r6, #2 + cmp r6, #1 + blt 0f + mov r5, r8 + add r0, r0, r1 + add r9, r9, r1 + add r2, r2, r3 + add r12, r12, r3 + add r4, r4, r7 + add lr, lr, r7 + beq 2f + b 1b + +2: + vld1.8 {d0}, [r2, :64]! + vld1.16 {q1}, [r4, :128]! + subs r5, r5, #8 + vshll.u8 q0, d0, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vshll.u16 q2, d0, #7 // u << 7 + vshll.u16 q3, d1, #7 // u << 7 + vmlal.s16 q2, d2, d31 // v + vmlal.s16 q3, d3, d31 // v + vrshrn.i32 d4, q2, #11 + vrshrn.i32 d5, q3, #11 + vqmovun.s16 d2, q2 + vst1.8 {d2}, [r0]! + bgt 2b +0: + pop {r4-r9,pc} +endfunc + +// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *src, const ptrdiff_t src_stride, +// const int16_t *t1, const int16_t *t2, +// const int w, const int h, +// const int16_t wt[2]); +function sgr_weighted2_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldr r8, [sp, #52] + cmp r7, #2 + add r10, r0, r1 + add r11, r2, r3 + add r12, r4, #2*FILTER_OUT_STRIDE + add lr, r5, #2*FILTER_OUT_STRIDE + vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] + mov r8, #4*FILTER_OUT_STRIDE + lsl r1, r1, #1 + lsl r3, r3, #1 + add r9, r6, #7 + bic r9, r9, #7 // Aligned width + sub r1, r1, r9 + sub r3, r3, r9 + sub r8, r8, r9, lsl #1 + mov r9, r6 + blt 2f +1: + vld1.8 {d0}, [r2, :64]! + vld1.8 {d16}, [r11, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q9}, [r12, :128]! + vld1.16 {q2}, [r5, :128]! + vld1.16 {q10}, [lr, :128]! 
+ subs r6, r6, #8 + vshll.u8 q0, d0, #4 // u + vshll.u8 q8, d16, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vsub.i16 q9, q9, q8 // t1 - u + vsub.i16 q10, q10, q8 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vshll.u16 q11, d16, #7 // u << 7 + vshll.u16 q8, d17, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) + vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) + vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) + vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vrshrn.i32 d22, q11, #11 + vrshrn.i32 d23, q8, #11 + vqmovun.s16 d6, q3 + vqmovun.s16 d22, q11 + vst1.8 {d6}, [r0]! + vst1.8 {d22}, [r10]! + bgt 1b + + subs r7, r7, #2 + cmp r7, #1 + blt 0f + mov r6, r9 + add r0, r0, r1 + add r10, r10, r1 + add r2, r2, r3 + add r11, r11, r3 + add r4, r4, r8 + add r12, r12, r8 + add r5, r5, r8 + add lr, lr, r8 + beq 2f + b 1b + +2: + vld1.8 {d0}, [r2, :64]! + vld1.16 {q1}, [r4, :128]! + vld1.16 {q2}, [r5, :128]! + subs r6, r6, #8 + vshll.u8 q0, d0, #4 // u + vsub.i16 q1, q1, q0 // t1 - u + vsub.i16 q2, q2, q0 // t2 - u + vshll.u16 q3, d0, #7 // u << 7 + vshll.u16 q0, d1, #7 // u << 7 + vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) + vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) + vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) + vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) + vrshrn.i32 d6, q3, #11 + vrshrn.i32 d7, q0, #11 + vqmovun.s16 d6, q3 + vst1.8 {d6}, [r0]! + bgt 1b +0: + pop {r4-r11,pc} +endfunc From f1e344282a93157200ff782752f11bc82684dd9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 26 Nov 2020 22:48:26 +0200 Subject: [PATCH 078/155] arm32: looprestoration: NEON implementation of SGR for 10 bpc Checkasm numbers: Cortex A7 A8 A53 A72 A73 selfguided_3x3_10bpc_neon: 919127.6 717942.8 565717.8 404748.0 372179.8 selfguided_5x5_10bpc_neon: 640310.8 511873.4 370653.3 273593.7 256403.2 selfguided_mix_10bpc_neon: 1533887.0 1252389.5 922111.1 659033.4 613410.6 Corresponding numbers for arm64, for comparison: Cortex A53 A72 A73 selfguided_3x3_10bpc_neon: 500706.0 367199.2 345261.2 selfguided_5x5_10bpc_neon: 361403.3 270550.0 249955.3 selfguided_mix_10bpc_neon: 846172.4 623590.3 578404.8 --- src/arm/32/looprestoration.S | 2 + src/arm/32/looprestoration16.S | 565 ++++++++++++++++++++++++++++ src/arm/32/looprestoration_common.S | 36 +- src/arm/32/looprestoration_tmpl.S | 143 ++++++- 4 files changed, 724 insertions(+), 22 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index baa3d469e5..d2be230aaf 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -1239,3 +1239,5 @@ L(box5_variable_shift_tbl): pop {r4-r11,pc} .purgem add5 endfunc + +sgr_funcs 8 diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index 3c0acaa56c..324c2ac173 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -718,3 +718,568 @@ L(copy_narrow_tbl): bgt 70b pop {r4,pc} endfunc + +#define SUM_STRIDE (384+16) + +#include "looprestoration_tmpl.S" + +// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box3_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd 
r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + tst r7, #2 // LR_HAVE_RIGHT + bne 0f + // !LR_HAVE_RIGHT + add lr, r5, #3 + bic lr, lr, #3 + b 1f +0: + add lr, r5, #7 + bic lr, lr, #7 +1: + sub r9, r9, lr, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Subtract the number of pixels read from the input from the stride + add lr, r5, #14 + bic lr, lr, #7 + sub r4, r4, lr, lsl #1 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #4 + sub r12, r12, #4 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 2 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #4 + + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #4 + sub r12, r12, #4 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 2x the first byte at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub r3, r3, #4 + sub r12, r12, #4 + vext.8 q1, q0, q1, #12 + vext.8 q0, q2, q0, #12 + vext.8 q5, q4, q5, #12 + vext.8 q4, q6, q4, #12 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 2 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. + cmp r5, #10 + bge 4f // If w >= 10, all used input pixels are valid + cmp r5, #6 + bge 5f // If w >= 6, we can filter 4 pixels + b 6f + +4: // Loop horizontally +.macro add3 w +.if \w > 4 + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 +.else + vext.8 d16, d0, d1, #2 + vext.8 d20, d8, d9, #2 + vext.8 d18, d0, d1, #4 + vext.8 d22, d8, d9, #4 + vadd.i16 d4, d0, d16 + vadd.i16 d6, d8, d20 + vadd.i16 d4, d4, d18 + vadd.i16 d6, d6, d22 +.endif + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 +.if \w > 4 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 +.endif +.endm + add3 8 + vst1.16 {q2}, [r1, :128]! 
+ vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 6 <= w < 10 + add3 4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + + subs r5, r5, #4 // 2 <= w < 6 + vext.8 q0, q0, q1, #8 + vext.8 q4, q4, q5, #8 + +6: // Pad the right edge and produce the last few pixels. + // 2 <= w < 6, 2-5 pixels valid in q0 + sub lr, r5, #2 + // lr = (pixels valid - 2) + adr r11, L(box3_variable_shift_tbl) + ldr lr, [r11, lr, lsl #2] + add r11, r11, lr + bx r11 + + .align 2 +L(box3_variable_shift_tbl): + .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB + + // Shift q0 right, shifting out invalid pixels, + // shift q0 left to the original offset, shifting in padding pixels. +22: // 2 pixels valid + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + vext.8 q0, q0, q14, #12 + vext.8 q4, q4, q15, #12 + b 88f +33: // 3 pixels valid + vext.8 q0, q0, q0, #6 + vext.8 q4, q4, q4, #6 + vext.8 q0, q0, q14, #10 + vext.8 q4, q4, q15, #10 + b 88f +44: // 4 pixels valid + vmov d1, d28 + vmov d9, d30 + b 88f +55: // 5 pixels valid + vext.8 q0, q0, q0, #10 + vext.8 q4, q4, q4, #10 + vext.8 q0, q0, q14, #6 + vext.8 q4, q4, q15, #6 + +88: + // Restore r11 after using it for a temporary value above + add r11, r1, #(2*SUM_STRIDE) + + add3 4 + subs r5, r5, #4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + ble 9f + vext.8 q0, q0, q0, #8 + vext.8 q4, q4, q4, #8 + // Only one needed pixel left, but do a normal 4 pixel + // addition anyway + add3 4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem add3 +endfunc + +// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const ptrdiff_t stride, +// const int w, const int h, +// const enum LrEdgeFlags edges); +function sgr_box5_h_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] + add r5, r5, #2 // w += 2 + + // Set up pointers for reading/writing alternate rows + add r10, r0, #(4*SUM_STRIDE) // sumsq + add r11, r1, #(2*SUM_STRIDE) // sum + add r12, r3, r4 // src + lsl r4, r4, #1 + mov r9, #(2*2*SUM_STRIDE) // double sum stride + + // Subtract the aligned width from the output stride. + // With LR_HAVE_RIGHT, align to 8, without it, align to 4. + // Subtract the number of pixels read from the input from the stride. 
+ tst r7, #2 // LR_HAVE_RIGHT + bne 0f + // !LR_HAVE_RIGHT + add lr, r5, #3 + bic lr, lr, #3 + add r8, r5, #13 + b 1f +0: + add lr, r5, #7 + bic lr, lr, #7 + add r8, r5, #15 +1: + sub r9, r9, lr, lsl #1 + bic r8, r8, #7 + sub r4, r4, r8, lsl #1 + + // Store the width for the vertical loop + mov r8, r5 + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst r7, #1 // LR_HAVE_LEFT + beq 2f + // LR_HAVE_LEFT + cmp r2, #0 + bne 0f + // left == NULL + sub r3, r3, #6 + sub r12, r12, #6 + b 1f +0: // LR_HAVE_LEFT, left != NULL +2: // !LR_HAVE_LEFT, increase the stride. + // For this case we don't read the left 3 pixels from the src pointer, + // but shift it as if we had done that. + add r4, r4, #6 + +1: // Loop vertically + vld1.16 {q0, q1}, [r3]! + vld1.16 {q4, q5}, [r12]! + + tst r7, #1 // LR_HAVE_LEFT + beq 0f + cmp r2, #0 + beq 2f + // LR_HAVE_LEFT, left != NULL + vld1.16 {d5}, [r2]! + // Move r3/r12 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub r3, r3, #6 + sub r12, r12, #6 + vld1.16 {d13}, [r2]! + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + b 2f +0: + // !LR_HAVE_LEFT, fill q2 with the leftmost pixel + // and shift q0 to have 3x the first pixel at the front. + vdup.16 q2, d0[0] + vdup.16 q6, d8[0] + // Move r3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub r3, r3, #6 + sub r12, r12, #6 + vext.8 q1, q0, q1, #10 + vext.8 q0, q2, q0, #10 + vext.8 q5, q4, q5, #10 + vext.8 q4, q6, q4, #10 + +2: + tst r7, #2 // LR_HAVE_RIGHT + bne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub lr, r5, #(2 + 16 - 3 + 1) + lsl lr, lr, #1 + ldrh r11, [r3, lr] + ldrh lr, [r12, lr] + // Fill q14/q15 with the right padding pixel + vdup.16 q14, r11 + vdup.16 q15, lr + // Restore r11 after using it for a temporary value + add r11, r1, #(2*SUM_STRIDE) +3: // !LR_HAVE_RIGHT + // If we'll have to pad the right edge we need to quit early here. 
+ cmp r5, #11 + bge 4f // If w >= 11, all used input pixels are valid + cmp r5, #7 + bge 5f // If w >= 7, we can produce 4 pixels + b 6f + +4: // Loop horizontally +.macro add5 w +.if \w > 4 + vext.8 q8, q0, q1, #2 + vext.8 q10, q4, q5, #2 + vext.8 q9, q0, q1, #4 + vext.8 q11, q4, q5, #4 + vadd.i16 q2, q0, q8 + vadd.i16 q3, q4, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 +.else + vext.8 d16, d0, d1, #2 + vext.8 d20, d8, d9, #2 + vext.8 d18, d0, d1, #4 + vext.8 d22, d8, d9, #4 + vadd.i16 d4, d0, d16 + vadd.i16 d6, d8, d20 + vadd.i16 d4, d4, d18 + vadd.i16 d6, d6, d22 +.endif + + vmull.u16 q6, d0, d0 + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d18, d18 + vmull.u16 q12, d8, d8 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d22, d22 +.if \w > 4 + vmull.u16 q7, d1, d1 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmull.u16 q13, d9, d9 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 +.endif + +.if \w > 4 + vext.8 q8, q0, q1, #6 + vext.8 q10, q4, q5, #6 + vext.8 q9, q0, q1, #8 + vext.8 q11, q4, q5, #8 + vadd.i16 q2, q2, q8 + vadd.i16 q3, q3, q10 + vadd.i16 q2, q2, q9 + vadd.i16 q3, q3, q11 +.else + vext.8 d16, d0, d1, #6 + // d18 would be equal to d1; using d1 instead + vext.8 d20, d8, d9, #6 + // d22 would be equal to d9; using d9 instead + vadd.i16 d4, d4, d16 + vadd.i16 d6, d6, d20 + vadd.i16 d4, d4, d1 + vadd.i16 d6, d6, d9 +.endif + + vmlal.u16 q6, d16, d16 + vmlal.u16 q6, d1, d1 + vmlal.u16 q12, d20, d20 + vmlal.u16 q12, d9, d9 +.if \w > 4 + vmlal.u16 q7, d17, d17 + vmlal.u16 q7, d19, d19 + vmlal.u16 q13, d21, d21 + vmlal.u16 q13, d23, d23 +.endif +.endm + add5 8 + vst1.16 {q2}, [r1, :128]! + vst1.16 {q3}, [r11, :128]! + vst1.32 {q6, q7}, [r0, :128]! + vst1.32 {q12, q13}, [r10, :128]! + + subs r5, r5, #8 + ble 9f + tst r7, #2 // LR_HAVE_RIGHT + vmov q0, q1 + vmov q4, q5 + vld1.16 {q1}, [r3]! + vld1.16 {q5}, [r12]! + bne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. + +5: // Produce 4 pixels, 7 <= w < 11 + add5 4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + + subs r5, r5, #4 // 3 <= w < 7 + vext.8 q0, q0, q1, #8 + vext.8 q4, q4, q5, #8 + +6: // Pad the right edge and produce the last few pixels. + // w < 7, w+1 pixels valid in q0/q4 + sub lr, r5, #1 + // lr = pixels valid - 2 + adr r11, L(box5_variable_shift_tbl) + ldr lr, [r11, lr, lsl #2] + vmov q1, q14 + vmov q5, q15 + add r11, r11, lr + bx r11 + + .align 2 +L(box5_variable_shift_tbl): + .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB + .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB + + // Shift q0 right, shifting out invalid pixels, + // shift q0 left to the original offset, shifting in padding pixels. 
+22: // 2 pixels valid + vext.8 q0, q0, q0, #4 + vext.8 q4, q4, q4, #4 + vext.8 q0, q0, q14, #12 + vext.8 q4, q4, q15, #12 + b 88f +33: // 3 pixels valid + vext.8 q0, q0, q0, #6 + vext.8 q4, q4, q4, #6 + vext.8 q0, q0, q14, #10 + vext.8 q4, q4, q15, #10 + b 88f +44: // 4 pixels valid + vmov d1, d28 + vmov d9, d30 + b 88f +55: // 5 pixels valid + vext.8 q0, q0, q0, #10 + vext.8 q4, q4, q4, #10 + vext.8 q0, q0, q14, #6 + vext.8 q4, q4, q15, #6 + b 88f +66: // 6 pixels valid + vext.8 q0, q0, q0, #12 + vext.8 q4, q4, q4, #12 + vext.8 q0, q0, q14, #4 + vext.8 q4, q4, q15, #4 + b 88f +77: // 7 pixels valid + vext.8 q0, q0, q0, #14 + vext.8 q4, q4, q4, #14 + vext.8 q0, q0, q14, #2 + vext.8 q4, q4, q15, #2 + +88: + // Restore r11 after using it for a temporary value above + add r11, r1, #(2*SUM_STRIDE) + + add5 4 + subs r5, r5, #4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + ble 9f + vext.8 q0, q0, q1, #8 + vext.8 q4, q4, q5, #8 + add5 4 + vst1.16 {d4}, [r1, :64]! + vst1.16 {d6}, [r11, :64]! + vst1.32 {q6}, [r0, :128]! + vst1.32 {q12}, [r10, :128]! + +9: + subs r6, r6, #2 + ble 0f + // Jump to the next row and loop horizontally + add r0, r0, r9, lsl #1 + add r10, r10, r9, lsl #1 + add r1, r1, r9 + add r11, r11, r9 + add r3, r3, r4 + add r12, r12, r4 + mov r5, r8 + b 1b +0: + vpop {q4-q7} + pop {r4-r11,pc} +.purgem add5 +endfunc + +sgr_funcs 16 diff --git a/src/arm/32/looprestoration_common.S b/src/arm/32/looprestoration_common.S index f8fbbbe960..b080bb5115 100644 --- a/src/arm/32/looprestoration_common.S +++ b/src/arm/32/looprestoration_common.S @@ -336,14 +336,17 @@ function sgr_box5_v_neon, export=1 endfunc // void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); +// const int w, const int h, const int strength, +// const int bitdepth_max); // void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength); +// const int w, const int h, const int strength, +// const int bitdepth_max); function sgr_calc_ab1_neon, export=1 - push {r4-r5,lr} + push {r4-r7,lr} vpush {q4-q7} - ldr r4, [sp, #76] + ldrd r4, r5, [sp, #84] add r3, r3, #2 // h += 2 + clz r6, r5 vmov.i32 q15, #9 // n movw r5, #455 mov lr, #SUM_STRIDE @@ -351,10 +354,11 @@ function sgr_calc_ab1_neon, export=1 endfunc function sgr_calc_ab2_neon, export=1 - push {r4-r5,lr} + push {r4-r7,lr} vpush {q4-q7} - ldr r4, [sp, #76] + ldrd r4, r5, [sp, #84] add r3, r3, #3 // h += 3 + clz r6, r5 asr r3, r3, #1 // h /= 2 vmov.i32 q15, #25 // n mov r5, #164 @@ -363,7 +367,9 @@ endfunc function sgr_calc_ab_neon movrel r12, X(sgr_x_by_x) + sub r6, r6, #24 // -bitdepth_min_8 vld1.8 {q8, q9}, [r12, :128]! 
+ add r7, r6, r6 // -2*bitdepth_min_8 vmov.i8 q11, #5 vmov.i8 d10, #55 // idx of last 5 vld1.8 {q10}, [r12, :128] @@ -376,9 +382,7 @@ function sgr_calc_ab_neon add r12, r2, #7 bic r12, r12, #7 // aligned w sub r12, lr, r12 // increment between rows - vmov.i16 q13, #256 vdup.32 q12, r4 - vdup.32 q14, r5 // one_by_x sub r0, r0, #(4*(SUM_STRIDE)) sub r1, r1, #(2*(SUM_STRIDE)) mov r4, r2 // backup of w @@ -386,13 +390,18 @@ function sgr_calc_ab_neon vsub.i8 q9, q9, q11 vsub.i8 q10, q10, q11 1: - subs r2, r2, #8 vld1.32 {q0, q1}, [r0, :128] // a vld1.16 {q2}, [r1, :128] // b + vdup.32 q13, r7 // -2*bitdepth_min_8 + vdup.16 q14, r6 // -bitdepth_min_8 + subs r2, r2, #8 + vrshl.s32 q0, q0, q13 + vrshl.s32 q1, q1, q13 + vrshl.s16 q4, q2, q14 vmul.i32 q0, q0, q15 // a * n vmul.i32 q1, q1, q15 // a * n - vmull.u16 q3, d4, d4 // b * b - vmull.u16 q4, d5, d5 // b * b + vmull.u16 q3, d8, d8 // b * b + vmull.u16 q4, d9, d9 // b * b vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) vmul.i32 q0, q0, q12 // p * s @@ -417,6 +426,9 @@ function sgr_calc_ab_neon vadd.i8 d1, d1, d2 vmovl.u8 q0, d1 // x + vmov.i16 q13, #256 + vdup.32 q14, r5 // one_by_x + vmull.u16 q1, d0, d4 // x * BB[i] vmull.u16 q2, d1, d5 // x * BB[i] vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x @@ -437,5 +449,5 @@ function sgr_calc_ab_neon b 1b 0: vpop {q4-q7} - pop {r4-r5,pc} + pop {r4-r7,pc} endfunc diff --git a/src/arm/32/looprestoration_tmpl.S b/src/arm/32/looprestoration_tmpl.S index 0686820b04..324aa6251d 100644 --- a/src/arm/32/looprestoration_tmpl.S +++ b/src/arm/32/looprestoration_tmpl.S @@ -29,11 +29,12 @@ #define FILTER_OUT_STRIDE 384 -// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp, +.macro sgr_funcs bpc +// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); -function sgr_finish_filter1_8bpc_neon, export=1 +function sgr_finish_filter1_\bpc\()bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] @@ -46,7 +47,11 @@ function sgr_finish_filter1_8bpc_neon, export=1 mov r12, #FILTER_OUT_STRIDE add lr, r5, #3 bic lr, lr, #3 // Aligned width +.if \bpc == 8 sub r2, r2, lr +.else + sub r2, r2, lr, lsl #1 +.endif sub r12, r12, lr sub r11, r11, lr sub r11, r11, #4 // We read 4 extra elements from both a and b @@ -90,12 +95,18 @@ function sgr_finish_filter1_8bpc_neon, export=1 vadd.i32 q3, q3, q5 vext.8 q7, q12, q13, #4 // +stride vext.8 q10, q12, q13, #8 // +1+stride +.if \bpc == 8 vld1.32 {d24[0]}, [r1, :32]! // src +.else + vld1.16 {d24}, [r1, :64]! 
// src +.endif vadd.i32 q3, q3, q7 // +stride vadd.i32 q8, q8, q10 // +1+stride vshl.i32 q3, q3, #2 vmla.i32 q3, q8, q15 // * 3 -> b +.if \bpc == 8 vmovl.u8 q12, d24 // src +.endif vmov d0, d1 vmlal.u16 q3, d2, d24 // b + a * src vmov d2, d3 @@ -133,11 +144,11 @@ function sgr_finish_filter1_8bpc_neon, export=1 pop {r4-r11,pc} endfunc -// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp, +// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); -function sgr_finish_filter2_8bpc_neon, export=1 +function sgr_finish_filter2_\bpc\()bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] @@ -150,7 +161,11 @@ function sgr_finish_filter2_8bpc_neon, export=1 mov r10, #FILTER_OUT_STRIDE add r11, r5, #7 bic r11, r11, #7 // Aligned width +.if \bpc == 8 sub r2, r2, r11 +.else + sub r2, r2, r11, lsl #1 +.endif sub r10, r10, r11 sub r9, r9, r11 sub r9, r9, #4 // We read 4 extra elements from a @@ -195,7 +210,11 @@ function sgr_finish_filter2_8bpc_neon, export=1 vext.8 q8, q11, q12, #4 // +stride vext.8 q11, q12, q13, #4 +.if \bpc == 8 vld1.8 {d4}, [r1, :64]! +.else + vld1.8 {q2}, [r1, :128]! +.endif vmov.i32 q14, #5 vmov.i32 q15, #6 @@ -207,7 +226,9 @@ function sgr_finish_filter2_8bpc_neon, export=1 vmul.i32 q5, q5, q14 // * 5 vmla.i32 q5, q7, q15 // * 6 +.if \bpc == 8 vmovl.u8 q2, d4 +.endif vmlal.u16 q4, d0, d4 // b + a * src vmlal.u16 q5, d1, d5 // b + a * src vmov q0, q1 @@ -255,10 +276,16 @@ function sgr_finish_filter2_8bpc_neon, export=1 vext.8 q7, q9, q10, #8 vmul.i16 q2, q2, q13 // * 6 vmla.i16 q2, q0, q12 // * 5 -> a +.if \bpc == 8 vld1.8 {d22}, [r1, :64]! +.else + vld1.16 {q11}, [r1, :128]! +.endif vadd.i32 q8, q8, q6 // -1, +1 vadd.i32 q9, q9, q7 +.if \bpc == 8 vmovl.u8 q11, d22 +.endif vmul.i32 q4, q4, q15 // * 6 vmla.i32 q4, q8, q14 // * 5 -> b vmul.i32 q5, q5, q15 // * 6 @@ -293,16 +320,22 @@ function sgr_finish_filter2_8bpc_neon, export=1 pop {r4-r11,pc} endfunc -// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride, +// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int w, const int h, -// const int wt); -function sgr_weighted1_8bpc_neon, export=1 +// const int wt, const int bitdepth_max); +function sgr_weighted1_\bpc\()bpc_neon, export=1 push {r4-r9,lr} ldrd r4, r5, [sp, #28] ldrd r6, r7, [sp, #36] +.if \bpc == 16 + ldr r8, [sp, #44] +.endif vdup.16 d31, r7 cmp r6, #2 +.if \bpc == 16 + vdup.16 q14, r8 +.endif add r9, r0, r1 add r12, r2, r3 add lr, r4, #2*FILTER_OUT_STRIDE @@ -311,19 +344,34 @@ function sgr_weighted1_8bpc_neon, export=1 lsl r3, r3, #1 add r8, r5, #7 bic r8, r8, #7 // Aligned width +.if \bpc == 8 sub r1, r1, r8 sub r3, r3, r8 +.else + sub r1, r1, r8, lsl #1 + sub r3, r3, r8, lsl #1 +.endif sub r7, r7, r8, lsl #1 mov r8, r5 blt 2f 1: +.if \bpc == 8 vld1.8 {d0}, [r2, :64]! vld1.8 {d16}, [r12, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r12, :128]! +.endif vld1.16 {q1}, [r4, :128]! vld1.16 {q9}, [lr, :128]! 
subs r5, r5, #8 +.if \bpc == 8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q9, q9, q8 // t1 - u vshll.u16 q2, d0, #7 // u << 7 @@ -334,6 +382,7 @@ function sgr_weighted1_8bpc_neon, export=1 vmlal.s16 q3, d3, d31 // v vmlal.s16 q10, d18, d31 // v vmlal.s16 q11, d19, d31 // v +.if \bpc == 8 vrshrn.i32 d4, q2, #11 vrshrn.i32 d5, q3, #11 vrshrn.i32 d20, q10, #11 @@ -342,6 +391,16 @@ function sgr_weighted1_8bpc_neon, export=1 vqmovun.s16 d20, q10 vst1.8 {d4}, [r0]! vst1.8 {d20}, [r9]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vqrshrun.s32 d20, q10, #11 + vqrshrun.s32 d21, q11, #11 + vmin.u16 q2, q2, q14 + vmin.u16 q10, q10, q14 + vst1.16 {q2}, [r0]! + vst1.16 {q10}, [r9]! +.endif bgt 1b sub r6, r6, #2 @@ -358,60 +417,97 @@ function sgr_weighted1_8bpc_neon, export=1 b 1b 2: +.if \bpc == 8 vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif vld1.16 {q1}, [r4, :128]! subs r5, r5, #8 +.if \bpc == 8 vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif vsub.i16 q1, q1, q0 // t1 - u vshll.u16 q2, d0, #7 // u << 7 vshll.u16 q3, d1, #7 // u << 7 vmlal.s16 q2, d2, d31 // v vmlal.s16 q3, d3, d31 // v +.if \bpc == 8 vrshrn.i32 d4, q2, #11 vrshrn.i32 d5, q3, #11 vqmovun.s16 d2, q2 vst1.8 {d2}, [r0]! +.else + vqrshrun.s32 d4, q2, #11 + vqrshrun.s32 d5, q3, #11 + vmin.u16 q2, q2, q14 + vst1.16 {q2}, [r0]! +.endif bgt 2b 0: pop {r4-r9,pc} endfunc -// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, -// const int16_t wt[2]); -function sgr_weighted2_8bpc_neon, export=1 +// const int16_t wt[2], const int bitdepth_max); +function sgr_weighted2_\bpc\()bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] +.if \bpc == 8 ldr r8, [sp, #52] +.else + ldrd r8, r9, [sp, #52] +.endif cmp r7, #2 add r10, r0, r1 add r11, r2, r3 add r12, r4, #2*FILTER_OUT_STRIDE add lr, r5, #2*FILTER_OUT_STRIDE vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] +.if \bpc == 16 + vdup.16 q14, r9 +.endif mov r8, #4*FILTER_OUT_STRIDE lsl r1, r1, #1 lsl r3, r3, #1 add r9, r6, #7 bic r9, r9, #7 // Aligned width +.if \bpc == 8 sub r1, r1, r9 sub r3, r3, r9 +.else + sub r1, r1, r9, lsl #1 + sub r3, r3, r9, lsl #1 +.endif sub r8, r8, r9, lsl #1 mov r9, r6 blt 2f 1: +.if \bpc == 8 vld1.8 {d0}, [r2, :64]! vld1.8 {d16}, [r11, :64]! +.else + vld1.16 {q0}, [r2, :128]! + vld1.16 {q8}, [r11, :128]! +.endif vld1.16 {q1}, [r4, :128]! vld1.16 {q9}, [r12, :128]! vld1.16 {q2}, [r5, :128]! vld1.16 {q10}, [lr, :128]! subs r6, r6, #8 +.if \bpc == 8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u +.else + vshl.i16 q0, q0, #4 // u + vshl.i16 q8, q8, #4 // u +.endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q2, q2, q0 // t2 - u vsub.i16 q9, q9, q8 // t1 - u @@ -428,6 +524,7 @@ function sgr_weighted2_8bpc_neon, export=1 vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) +.if \bpc == 8 vrshrn.i32 d6, q3, #11 vrshrn.i32 d7, q0, #11 vrshrn.i32 d22, q11, #11 @@ -436,6 +533,16 @@ function sgr_weighted2_8bpc_neon, export=1 vqmovun.s16 d22, q11 vst1.8 {d6}, [r0]! vst1.8 {d22}, [r10]! 
+.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vqrshrun.s32 d22, q11, #11 + vqrshrun.s32 d23, q8, #11 + vmin.u16 q3, q3, q14 + vmin.u16 q11, q11, q14 + vst1.16 {q3}, [r0]! + vst1.16 {q11}, [r10]! +.endif bgt 1b subs r7, r7, #2 @@ -454,11 +561,19 @@ function sgr_weighted2_8bpc_neon, export=1 b 1b 2: +.if \bpc == 8 vld1.8 {d0}, [r2, :64]! +.else + vld1.16 {q0}, [r2, :128]! +.endif vld1.16 {q1}, [r4, :128]! vld1.16 {q2}, [r5, :128]! subs r6, r6, #8 +.if \bpc == 8 vshll.u8 q0, d0, #4 // u +.else + vshl.i16 q0, q0, #4 // u +.endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q2, q2, q0 // t2 - u vshll.u16 q3, d0, #7 // u << 7 @@ -467,11 +582,19 @@ function sgr_weighted2_8bpc_neon, export=1 vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) +.if \bpc == 8 vrshrn.i32 d6, q3, #11 vrshrn.i32 d7, q0, #11 vqmovun.s16 d6, q3 vst1.8 {d6}, [r0]! +.else + vqrshrun.s32 d6, q3, #11 + vqrshrun.s32 d7, q0, #11 + vmin.u16 q3, q3, q14 + vst1.16 {q3}, [r0]! +.endif bgt 1b 0: pop {r4-r11,pc} endfunc +.endm From 05a570c3abcea2f12472034c298b80aebafe0a4d Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 2 Dec 2020 14:10:52 +0100 Subject: [PATCH 079/155] Add miscellaneous minor wiener optimizations Combine horizontal and vertical filter pointers into a single parameter when calling the wiener DSP function. Eliminate the +128 filter coefficient handling where possible. --- src/arm/32/looprestoration.S | 12 ++++------- src/arm/32/looprestoration16.S | 35 +++++++++---------------------- src/arm/64/looprestoration.S | 10 ++++----- src/arm/64/looprestoration16.S | 30 +++++++------------------- src/x86/looprestoration.asm | 21 ++++++++----------- src/x86/looprestoration_ssse3.asm | 21 ++++++++----------- 6 files changed, 44 insertions(+), 85 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index d2be230aaf..79afdc394c 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -30,7 +30,7 @@ // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, +// const int16_t fh[8], intptr_t w, // int h, enum LrEdgeFlags edges); function wiener_filter_h_8bpc_neon, export=1 push {r4-r11,lr} @@ -38,7 +38,7 @@ function wiener_filter_h_8bpc_neon, export=1 ldrd r4, r5, [sp, #52] ldrd r6, r7, [sp, #60] mov r8, r5 - vld1.16 {q0}, [r4] + vld1.16 {q0}, [r4, :128] movw r9, #(1 << 14) - (1 << 2) vdup.16 q14, r9 vmov.s16 q15, #2048 @@ -358,18 +358,14 @@ endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, +// const int16_t fv[8], enum LrEdgeFlags edges, // ptrdiff_t mid_stride); function wiener_filter_v_8bpc_neon, export=1 push {r4-r7,lr} ldrd r4, r5, [sp, #20] ldrd r6, r7, [sp, #28] mov lr, r4 - vmov.s16 q1, #0 - mov r12, #128 - vld1.16 {q0}, [r5] - vmov.s16 d2[3], r12 - vadd.s16 q0, q0, q1 + vld1.16 {q0}, [r5, :128] // Calculate the number of rows to move back when looping vertically mov r12, r4 diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index 324c2ac173..a2ebbeff3a 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -39,7 +39,7 @@ function wiener_filter_h_16bpc_neon, export=1 ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] ldr r8, [sp, #116] // bitdepth_max - vld1.16 {q0}, [r4] + vld1.16 {q0}, [r4, :128] clz r8, r8 
vmov.i32 q14, #1 sub r9, r8, #38 // -(bitdepth + 6) @@ -151,16 +151,14 @@ function wiener_filter_h_16bpc_neon, export=1 b 6f 4: // Loop horizontally - vext.8 q10, q2, q3, #6 vext.8 q8, q2, q3, #2 vext.8 q9, q2, q3, #4 - vshll.u16 q6, d20, #7 - vshll.u16 q7, d21, #7 - vmlal.s16 q6, d4, d0[0] + vext.8 q10, q2, q3, #6 + vmull.s16 q6, d4, d0[0] vmlal.s16 q6, d16, d0[1] vmlal.s16 q6, d18, d0[2] vmlal.s16 q6, d20, d0[3] - vmlal.s16 q7, d5, d0[0] + vmull.s16 q7, d5, d0[0] vmlal.s16 q7, d17, d0[1] vmlal.s16 q7, d19, d0[2] vmlal.s16 q7, d21, d0[3] @@ -173,14 +171,12 @@ function wiener_filter_h_16bpc_neon, export=1 vmlal.s16 q7, d17, d1[0] vmlal.s16 q7, d19, d1[1] vmlal.s16 q7, d21, d1[2] - vext.8 q10, q4, q5, #6 vext.8 q2, q4, q5, #2 - vshll.u16 q8, d20, #7 - vshll.u16 q9, d21, #7 - vmlal.s16 q8, d8, d0[0] + vext.8 q10, q4, q5, #6 + vmull.s16 q8, d8, d0[0] vmlal.s16 q8, d4, d0[1] vmlal.s16 q8, d20, d0[3] - vmlal.s16 q9, d9, d0[0] + vmull.s16 q9, d9, d0[0] vmlal.s16 q9, d5, d0[1] vmlal.s16 q9, d21, d0[3] vext.8 q2, q4, q5, #4 @@ -233,8 +229,7 @@ function wiener_filter_h_16bpc_neon, export=1 vext.8 d17, d4, d5, #4 vext.8 d19, d5, d6, #2 vext.8 d20, d5, d6, #4 - vshll.u16 q6, d18, #7 - vmlal.s16 q6, d4, d0[0] + vmull.s16 q6, d4, d0[0] vmlal.s16 q6, d16, d0[1] vmlal.s16 q6, d17, d0[2] vmlal.s16 q6, d18, d0[3] @@ -247,8 +242,7 @@ function wiener_filter_h_16bpc_neon, export=1 vext.8 d17, d8, d9, #4 vext.8 d19, d9, d10, #2 vext.8 d20, d9, d10, #4 - vshll.u16 q7, d18, #7 - vmlal.s16 q7, d8, d0[0] + vmull.s16 q7, d8, d0[0] vmlal.s16 q7, d16, d0[1] vmlal.s16 q7, d17, d0[2] vmlal.s16 q7, d18, d0[3] @@ -356,14 +350,9 @@ L(variable_shift_tbl): vadd.i32 q8, q9 vpadd.i32 d12, d12, d13 vpadd.i32 d13, d16, d17 - vdup.16 d14, d4[3] - vdup.16 d15, d8[3] vpadd.i32 d12, d12, d13 - vtrn.16 d14, d15 vadd.i32 d12, d12, d28 - vshll.u16 q7, d14, #7 vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1 - vadd.i32 d12, d12, d14 vrshl.s32 d12, d12, d26 vqmovun.s32 d12, q6 vmin.u16 d12, d12, d20 @@ -401,14 +390,10 @@ function wiener_filter_v_16bpc_neon, export=1 ldrd r4, r5, [sp, #52] ldrd r6, r7, [sp, #60] ldr lr, [sp, #68] // bitdepth_max - vmov.i16 q1, #0 - mov r12, #128 - vld1.16 {q0}, [r5] + vld1.16 {q0}, [r5, :128] vdup.16 q5, lr clz lr, lr - vmov.i16 d2[3], r12 sub lr, lr, #11 // round_bits_v - vadd.i16 q0, q0, q1 vdup.32 q4, lr mov lr, r4 vneg.s32 q4, q4 // -round_bits_v diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index 1e864c29ac..c3b7918f2e 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -30,7 +30,7 @@ // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, +// const int16_t fh[8], intptr_t w, // int h, enum LrEdgeFlags edges); function wiener_filter_h_8bpc_neon, export=1 mov w8, w5 @@ -308,13 +308,11 @@ endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, +// const int16_t fv[8], enum LrEdgeFlags edges, // ptrdiff_t mid_stride); function wiener_filter_v_8bpc_neon, export=1 mov w8, w4 ld1 {v0.8h}, [x5] - movi v1.8h, #128 - add v1.8h, v1.8h, v0.8h // Calculate the number of rows to move back when looping vertically mov w11, w4 @@ -359,14 +357,14 @@ function wiener_filter_v_8bpc_neon, export=1 smull v2.4s, v16.4h, v0.h[0] smlal v2.4s, v17.4h, v0.h[1] smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v19.4h, v0.h[3] smlal v2.4s, 
v20.4h, v0.h[4] smlal v2.4s, v21.4h, v0.h[5] smlal v2.4s, v22.4h, v0.h[6] smull2 v3.4s, v16.8h, v0.h[0] smlal2 v3.4s, v17.8h, v0.h[1] smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v1.h[3] + smlal2 v3.4s, v19.8h, v0.h[3] smlal2 v3.4s, v20.8h, v0.h[4] smlal2 v3.4s, v21.8h, v0.h[5] smlal2 v3.4s, v22.8h, v0.h[6] diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 450413d857..669d993132 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -143,12 +143,6 @@ function wiener_filter_h_16bpc_neon, export=1 b 6f 4: // Loop horizontally -.macro ushll_sz d0, d1, src, shift, wd - ushll \d0\().4s, \src\().4h, \shift -.ifc \wd, .8h - ushll2 \d1\().4s, \src\().8h, \shift -.endif -.endm .macro add_sz d0, d1, s0, s1, c, wd add \d0\().4s, \s0\().4s, \c\().4s .ifc \wd, .8h @@ -172,14 +166,13 @@ function wiener_filter_h_16bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - ext v18.16b, v2.16b, v3.16b, #6 ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 + ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v20.16b, v2.16b, v3.16b, #10 - ushll_sz v6, v7, v18, #7, \wd ext v21.16b, v2.16b, v3.16b, #12 - smlal v6.4s, v2.4h, v0.h[0] + smull v6.4s, v2.4h, v0.h[0] smlal v6.4s, v16.4h, v0.h[1] smlal v6.4s, v17.4h, v0.h[2] smlal v6.4s, v18.4h, v0.h[3] @@ -187,7 +180,7 @@ function wiener_filter_h_16bpc_neon, export=1 smlal v6.4s, v20.4h, v0.h[5] smlal v6.4s, v21.4h, v0.h[6] .ifc \wd, .8h - smlal2 v7.4s, v2.8h, v0.h[0] + smull2 v7.4s, v2.8h, v0.h[0] smlal2 v7.4s, v16.8h, v0.h[1] smlal2 v7.4s, v17.8h, v0.h[2] smlal2 v7.4s, v18.8h, v0.h[3] @@ -195,14 +188,13 @@ function wiener_filter_h_16bpc_neon, export=1 smlal2 v7.4s, v20.8h, v0.h[5] smlal2 v7.4s, v21.8h, v0.h[6] .endif - ext v21.16b, v4.16b, v5.16b, #6 ext v19.16b, v4.16b, v5.16b, #2 ext v20.16b, v4.16b, v5.16b, #4 + ext v21.16b, v4.16b, v5.16b, #6 ext v22.16b, v4.16b, v5.16b, #8 ext v23.16b, v4.16b, v5.16b, #10 - ushll_sz v16, v17, v21, #7, \wd ext v24.16b, v4.16b, v5.16b, #12 - smlal v16.4s, v4.4h, v0.h[0] + smull v16.4s, v4.4h, v0.h[0] smlal v16.4s, v19.4h, v0.h[1] smlal v16.4s, v20.4h, v0.h[2] smlal v16.4s, v21.4h, v0.h[3] @@ -210,7 +202,7 @@ function wiener_filter_h_16bpc_neon, export=1 smlal v16.4s, v23.4h, v0.h[5] smlal v16.4s, v24.4h, v0.h[6] .ifc \wd, .8h - smlal2 v17.4s, v4.8h, v0.h[0] + smull2 v17.4s, v4.8h, v0.h[0] smlal2 v17.4s, v19.8h, v0.h[1] smlal2 v17.4s, v20.8h, v0.h[2] smlal2 v17.4s, v21.8h, v0.h[3] @@ -329,13 +321,9 @@ L(variable_shift_tbl): add v16.4s, v16.4s, v17.4s addv s6, v6.4s addv s7, v16.4s - dup v16.4h, v2.h[3] - ins v16.h[1], v4.h[3] ins v6.s[1], v7.s[0] mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 - ushll v16.4s, v16.4h, #7 add v6.2s, v6.2s, v30.2s - add v6.2s, v6.2s, v16.2s srshl v6.2s, v6.2s, v29.2s sqxtun v6.4h, v6.4s umin v6.4h, v6.4h, v24.4h @@ -371,9 +359,7 @@ function wiener_filter_v_16bpc_neon, export=1 ld1 {v0.8h}, [x5] dup v31.8h, w8 clz w8, w8 - movi v1.8h, #128 sub w8, w8, #11 // round_bits_v - add v1.8h, v1.8h, v0.8h dup v30.4s, w8 mov w8, w4 neg v30.4s, v30.4s // -round_bits_v @@ -421,14 +407,14 @@ function wiener_filter_v_16bpc_neon, export=1 smull v2.4s, v16.4h, v0.h[0] smlal v2.4s, v17.4h, v0.h[1] smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v1.h[3] + smlal v2.4s, v19.4h, v0.h[3] smlal v2.4s, v20.4h, v0.h[4] smlal v2.4s, v21.4h, v0.h[5] smlal v2.4s, v22.4h, v0.h[6] smull2 v3.4s, v16.8h, v0.h[0] 
smlal2 v3.4s, v17.8h, v0.h[1] smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v1.h[3] + smlal2 v3.4s, v19.8h, v0.h[3] smlal2 v3.4s, v20.8h, v0.h[4] smlal2 v3.4s, v21.8h, v0.h[5] smlal2 v3.4s, v22.8h, v0.h[6] diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm index fc6e9f124e..7e1e83f7f1 100644 --- a/src/x86/looprestoration.asm +++ b/src/x86/looprestoration.asm @@ -40,7 +40,6 @@ pw_16: times 2 dw 16 pw_256: times 2 dw 256 pw_2048: times 2 dw 2048 pw_16380: times 2 dw 16380 -pw_0_128: dw 0, 128 pw_5_6: dw 5, 6 pd_6: dd 6 pd_1024: dd 1024 @@ -52,14 +51,14 @@ cextern sgr_x_by_x SECTION .text INIT_YMM avx2 -cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge +cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, flt, w, h, edge mov edged, edgem - vpbroadcastb m15, [fhq+0] + vpbroadcastb m15, [fltq+0] movifnidn wd, wm - vpbroadcastb m14, [fhq+2] + vpbroadcastb m14, [fltq+2] mov hd, hm - vpbroadcastb m13, [fhq+4] - vpbroadcastw m12, [fhq+6] + vpbroadcastb m13, [fltq+4] + vpbroadcastw m12, [fltq+6] vpbroadcastd m11, [pw_2048] vpbroadcastd m10, [pw_16380] lea r11, [pb_right_ext_mask] @@ -207,18 +206,16 @@ cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge jg .loop RET -cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge - movifnidn fvq, fvmp +cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, flt, edge + movifnidn fltq, fltmp mov edged, edgem movifnidn hd, hm - vpbroadcastd m10, [fvq] - vpbroadcastd m11, [fvq+4] - vpbroadcastd m0, [pw_0_128] + vpbroadcastd m10, [fltq+16] + vpbroadcastd m11, [fltq+20] vpbroadcastd m12, [pd_1024] DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr rorx ylimd, edged, 2 - paddw m11, m0 and ylimd, 2 ; have_bottom sub ylimd, 3 diff --git a/src/x86/looprestoration_ssse3.asm b/src/x86/looprestoration_ssse3.asm index d11f68e736..b7d7f0ffaa 100644 --- a/src/x86/looprestoration_ssse3.asm +++ b/src/x86/looprestoration_ssse3.asm @@ -52,7 +52,6 @@ pw_256: times 8 dw 256 pw_2048: times 8 dw 2048 pw_16380: times 8 dw 16380 pw_5_6: times 4 dw 5, 6 -pw_0_128: times 4 dw 0, 128 pd_1024: times 4 dd 1024 %if ARCH_X86_32 pd_256: times 4 dd 256 @@ -129,12 +128,12 @@ SECTION .text %macro WIENER_H 0 %if ARCH_X86_64 -cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge +cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge mov edged, edgem movifnidn wd, wm mov hd, hm %else -cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge +cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge mov r5, edgem mov [esp+12], r5 mov wd, wm @@ -146,7 +145,7 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %define m12 m3 %endif - movq m15, [fhq] + movq m15, [fltq] %if cpuflag(ssse3) pshufb m12, m15, [PIC_sym(pb_6_7)] pshufb m13, m15, [PIC_sym(pb_4)] @@ -438,14 +437,13 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %macro WIENER_V 0 %if ARCH_X86_64 -cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge +cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge mov edged, edgem - movifnidn fvq, fvmp + movifnidn fltq, fltmp movifnidn hd, hm - movq m15, [fvq] + movq m15, [fltq+16] pshufd m14, m15, q1111 pshufd m15, m15, q0000 - paddw m14, [pw_0_128] mova m12, [pd_1024] DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr @@ -455,7 +453,7 @@ cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, 
edge shr ylimd, 2 sub ylimd, 3 %else -cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge +cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge %define ylimd [esp+12] mov r5d, edgem @@ -463,15 +461,14 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge shr r5d, 2 sub r5d, 3 mov ylimd, r5d - mov fvq, fvmp + mov fltq, fltmp mov edged, edgem SETUP_PIC edged - movq m0, [fvq] + movq m0, [fltq+16] pshufd m1, m0, q1111 pshufd m0, m0, q0000 - paddw m1, [PIC_sym(pw_0_128)] mova [esp+0x50], m0 mova [esp+0x40], m1 From a84f68e7c1e567b6b03bf61f462ae22e312a9202 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 4 Dec 2020 22:00:11 +0100 Subject: [PATCH 080/155] x86: Rename looprestoration_ssse3.asm to looprestoration_sse.asm It contains both SSE2 and SSSE3 code. --- src/x86/{looprestoration_ssse3.asm => looprestoration_sse.asm} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/x86/{looprestoration_ssse3.asm => looprestoration_sse.asm} (100%) diff --git a/src/x86/looprestoration_ssse3.asm b/src/x86/looprestoration_sse.asm similarity index 100% rename from src/x86/looprestoration_ssse3.asm rename to src/x86/looprestoration_sse.asm From 2bde1d83724391a346eece4ab1a33070123af0c9 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 2 Dec 2020 14:10:57 +0100 Subject: [PATCH 081/155] x86: Rewrite wiener SSE2/SSSE3/AVX2 asm The previous implementation did two separate passes in the horizontal and vertical directions, with the intermediate values being stored in a buffer on the stack. This caused bad cache thrashing. By interleaving the horizontal and vertical passes in combination with a ring buffer for storing only a few rows at a time the performance is improved by a significant amount. Also split the function into 7-tap and 5-tap versions. The latter is faster and fairly common (always for chroma, sometimes for luma). 
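A minimal C sketch of the ring-buffer scheme described above, assuming 8-bit pixels and simple edge clamping; the function name, the scratch-buffer layout and the shift amounts are illustrative only, not dav1d's actual C interface or the exact rounding performed by the assembly, and the LR_HAVE_* edge flags and the 5-tap variant are omitted:

/* Hypothetical sketch only: the horizontal pass for one new source row is
 * written into a 7-row circular buffer, and the 7-tap vertical pass then
 * emits one finished output row, so only a handful of intermediate rows are
 * ever resident instead of a full-frame temporary. */
#include <stddef.h>
#include <stdint.h>

#define TAPS   7
#define RADIUS (TAPS / 2)

/* Horizontally filter one pixel with crude edge clamping (an assumption,
 * not the real left/right edge handling). */
static int16_t hfilter_px(const uint8_t *row, int x, int w,
                          const int16_t fh[TAPS])
{
    int sum = 0;
    for (int k = 0; k < TAPS; k++) {
        int xk = x + k - RADIUS;
        if (xk < 0) xk = 0;
        if (xk >= w) xk = w - 1;
        sum += row[xk] * fh[k];
    }
    return (int16_t)(sum >> 3); /* placeholder intermediate rounding */
}

/* "ring" is caller-provided scratch for TAPS rows of w int16_t values,
 * analogous to the stack area the assembly reserves next to its t0..t6
 * row pointers. */
void wiener_ring_sketch(uint8_t *dst, ptrdiff_t dst_stride,
                        const uint8_t *src, ptrdiff_t src_stride,
                        int w, int h,
                        const int16_t fh[TAPS], const int16_t fv[TAPS],
                        int16_t *ring)
{
    /* Prime the buffer with the intermediate rows needed above output row 0,
     * clamping the source row index at the top edge. */
    for (int v = 0; v < TAPS - 1; v++) {
        int yi = v - RADIUS;
        if (yi < 0) yi = 0;
        for (int x = 0; x < w; x++)
            ring[v * w + x] = hfilter_px(&src[yi * src_stride], x, w, fh);
    }

    for (int y = 0; y < h; y++) {
        /* Horizontal pass for the newest row required by output row y,
         * overwriting the slot of the oldest (no longer needed) row. */
        int yi = y + RADIUS;
        if (yi >= h) yi = h - 1;
        const int slot = (y + TAPS - 1) % TAPS;
        for (int x = 0; x < w; x++)
            ring[slot * w + x] = hfilter_px(&src[yi * src_stride], x, w, fh);

        /* Vertical pass over the TAPS buffered rows -> one output row. */
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < TAPS; k++)
                sum += ring[((y + k) % TAPS) * w + x] * fv[k];
            int px = sum >> 11; /* placeholder final rounding */
            dst[y * dst_stride + x] =
                (uint8_t)(px < 0 ? 0 : px > 255 ? 255 : px);
        }
    }
}

In the assembly below the same rotation appears as the pointer shuffles at the end of the .hv and .v blocks (mov t6, t5 / mov t5, t4 / ...), which advance the ring of row pointers without any modulo arithmetic.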
--- src/x86/looprestoration.asm | 942 +++++++++++++------ src/x86/looprestoration_sse.asm | 1542 ++++++++++++++++++++----------- 2 files changed, 1686 insertions(+), 798 deletions(-) diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm index 7e1e83f7f1..cde5889ad3 100644 --- a/src/x86/looprestoration.asm +++ b/src/x86/looprestoration.asm @@ -29,20 +29,25 @@ %if ARCH_X86_64 SECTION_RODATA 32 + +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 +wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 pb_right_ext_mask: times 32 db 0xff times 32 db 0 -pb_14x0_1_2: times 14 db 0 - db 1, 2 -pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 -pb_15: times 16 db 15 -pw_16: times 2 dw 16 -pw_256: times 2 dw 256 -pw_2048: times 2 dw 2048 -pw_16380: times 2 dw 16380 -pw_5_6: dw 5, 6 -pd_6: dd 6 -pd_1024: dd 1024 + +pb_3: times 4 db 3 +pb_m5: times 4 db -5 +pw_16: times 2 dw 16 +pw_256: times 2 dw 256 +pw_2056: times 2 dw 2056 +pw_m16380: times 2 dw -16380 +pw_5_6: dw 5, 6 +pd_1024: dd 1024 pd_0xf0080029: dd 0xf0080029 pd_0xf00801c7: dd 0xf00801c7 @@ -50,277 +55,662 @@ cextern sgr_x_by_x SECTION .text -INIT_YMM avx2 -cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, flt, w, h, edge - mov edged, edgem - vpbroadcastb m15, [fltq+0] - movifnidn wd, wm - vpbroadcastb m14, [fltq+2] - mov hd, hm - vpbroadcastb m13, [fltq+4] - vpbroadcastw m12, [fltq+6] - vpbroadcastd m11, [pw_2048] - vpbroadcastd m10, [pw_16380] - lea r11, [pb_right_ext_mask] - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim - - ; if (edge & has_right) align_w_to_32 - ; else w -= 32, and use that as limit in x loop - test edgeb, 2 ; has_right - jnz .align - mov xlimq, -3 - jmp .loop -.align: - add wd, 31 - and wd, ~31 - xor xlimd, xlimd - - ; main y loop for vertical filter -.loop: - mov srcptrq, srcq - mov dstptrq, dstq - lea xq, [wq+xlimq] +%macro REPX 2-* + %xdefine %%f(x) %1 +%rep %0 - 1 + %rotate 1 + %%f(%1) +%endrep +%endmacro - ; load left edge pixels - test edgeb, 1 ; have_left - jz .emu_left - test leftq, leftq ; left == NULL for the edge-extended bottom/top - jz .load_left_combined - movd xm0, [leftq] - add leftq, 4 - pinsrd xm0, [srcq], 1 - pslldq xm0, 9 - jmp .left_load_done -.load_left_combined: - movq xm0, [srcq-3] - pslldq xm0, 10 - jmp .left_load_done -.emu_left: - movd xm0, [srcq] - pshufb xm0, [pb_14x0_1_2] +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers - ; load right edge pixels -.left_load_done: - cmp xd, 32 - jg .main_load - test xd, xd - jg .load_and_splat - je .splat_right - - ; for very small images (w=[1-2]), edge-extend the original cache, - ; ugly, but only runs in very odd cases - add wd, wd - pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] - shr wd, 1 - - ; main x loop, mostly this starts in .main_load -.splat_right: - ; no need to load new pixels, just extend them from the (possibly previously - ; extended) previous load into m0 - pshufb xm1, xm0, [pb_15] - jmp .main_loop -.load_and_splat: - ; load new pixels and extend edge for right-most - movu m1, [srcptrq+3] - sub 
r11, xq - movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32] - add r11, xq - vpbroadcastb m3, [srcptrq+2+xq] - pand m1, m2 - pandn m3, m2, m3 - por m1, m3 - jmp .main_loop -.main_load: - ; load subsequent line - movu m1, [srcptrq+3] +INIT_YMM avx2 +cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastb m11, [fltq+ 0] ; x0 x0 + vbroadcasti128 m7, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m8, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m9, [wiener_shufD] + add lpfq, wq + vpbroadcastd m10, [pw_m16380] + lea t1, [rsp+wq*2+16] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] .main_loop: - vinserti128 m0, xm1, 1 - - palignr m2, m1, m0, 10 - palignr m3, m1, m0, 11 - palignr m4, m1, m0, 12 - palignr m5, m1, m0, 13 - palignr m6, m1, m0, 14 - palignr m7, m1, m0, 15 - - punpcklbw m0, m2, m1 - punpckhbw m2, m1 - punpcklbw m8, m3, m7 - punpckhbw m3, m7 - punpcklbw m7, m4, m6 - punpckhbw m4, m6 - pxor m9, m9 - punpcklbw m6, m5, m9 - punpckhbw m5, m9 - - pmaddubsw m0, m15 - pmaddubsw m2, m15 - pmaddubsw m8, m14 - pmaddubsw m3, m14 - pmaddubsw m7, m13 - pmaddubsw m4, m13 - paddw m0, m8 - paddw m2, m3 - psllw m8, m6, 7 - psllw m3, m5, 7 - psubw m8, m10 - psubw m3, m10 - pmullw m6, m12 - pmullw m5, m12 - paddw m0, m7 - paddw m2, m4 - paddw m0, m6 - paddw m2, m5 - ; for a signed overflow to happen we need filter and pixels as follow: - ; filter => -5,-23,-17,90,-17,-23,-5 - ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0 - ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6] - ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84] - ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A] - ; => signed 16-bit overflow occurs - paddsw m0, m8 ; paddsw clips this range to [-8000;+7FFF] - paddsw m2, m3 - psraw m0, 3 ; shift changes the range to [-1000;+FFF] - psraw m2, 3 - paddw m0, m11 ; adding back 800 (removed in m8) changes the - paddw m2, m11 ; range to [-800;+17FF] as defined in the spec - mova [dstptrq], xm0 ; (note that adding another 800 would give us - mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF]) - vextracti128 [dstptrq+32], m0, 1 - vextracti128 [dstptrq+48], m2, 1 - vextracti128 xm0, m1, 1 - add srcptrq, 32 - add dstptrq, 64 - sub xq, 32 - cmp xd, 32 - jg .main_load - test xd, xd - jg .load_and_splat - cmp xd, xlimd - jg .splat_right - - add srcq, strideq - add dstq, 384*2 - dec hd - jg .loop + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.v1: + call .v RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, 
t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_3] + vpbroadcastd m1, [pb_m5] + vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, [wiener_l_shuf] + jmp .h_main +.h_top: + mov r10, wq + movu m4, [lpfq+r10-4] + test edgeb, 1 ; LR_HAVE_LEFT + jnz .h_main + pshufb m4, [wiener_l_shuf] + jmp .h_main +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + vpbroadcastd m2, [pw_2056] + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m2 + paddw m1, m2 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, [wiener_l_shuf] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + paddw m1, m3 + mova m2, [t4+r10*2] + paddw m2, [t2+r10*2] + mova m3, [t3+r10*2] + paddsw m0, m4 + vpbroadcastd m4, [pw_2056] + paddsw m1, m5 + mova m5, [t5+r10*2] + paddw m5, [t1+r10*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m4 + paddw m1, m4 + paddw m4, m0, [t6+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t4+r10*2+32] + paddw m2, [t2+r10*2+32] + mova m3, [t3+r10*2+32] + mova m5, [t5+r10*2+32] + paddw m5, [t1+r10*2+32] + psrad m0, 11 + psrad m4, 11 + 
packssdw m0, m4 + paddw m4, m1, [t6+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, dst_strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10*2+ 0] + paddw m2, [t2+r10*2+ 0] + mova m4, [t3+r10*2+ 0] + mova m6, [t1+r10*2+ 0] + paddw m8, m6, [t6+r10*2+ 0] + paddw m6, [t5+r10*2+ 0] + mova m3, [t4+r10*2+32] + paddw m3, [t2+r10*2+32] + mova m5, [t3+r10*2+32] + mova m7, [t1+r10*2+32] + paddw m9, m7, [t6+r10*2+32] + paddw m7, [t5+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m4, m8, m6 + pmaddwd m4, m14 + punpckhwd m6, m8, m6 + pmaddwd m6, m14 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m9, m7 + pmaddwd m5, m14 + punpckhwd m7, m9, m7 + pmaddwd m7, m14 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + REPX {psrad x, 11}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq + ret -cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, flt, edge - movifnidn fltq, fltmp - mov edged, edgem - movifnidn hd, hm - vpbroadcastd m10, [fltq+16] - vpbroadcastd m11, [fltq+20] - vpbroadcastd m12, [pd_1024] - - DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr - rorx ylimd, edged, 2 - and ylimd, 2 ; have_bottom - sub ylimd, 3 - - ; main x loop for vertical filter, does one column of 16 pixels -.loop_x: - mova m3, [midq] ; middle line - - ; load top pixels - test edgeb, 4 ; have_top - jz .emu_top - mova m0, [midq-384*4] - mova m2, [midq-384*2] - mova m1, m0 - jmp .load_bottom_pixels -.emu_top: - mova m0, m3 - mova m1, m3 - mova m2, m3 - - ; load bottom pixels -.load_bottom_pixels: - mov yd, hd - mov mptrq, midq - mov dstptrq, dstq - add yd, ylimd - jg .load_threelines - - ; the remainder here is somewhat messy but only runs in very weird - ; circumstances at the bottom of the image in very small blocks (h=[1-3]), - ; so performance is not terribly important here... - je .load_twolines - cmp yd, -1 - je .load_oneline - ; h == 1 case - mova m5, m3 - mova m4, m3 - mova m6, m3 - jmp .loop -.load_oneline: - ; h == 2 case - mova m4, [midq+384*2] - mova m5, m4 - mova m6, m4 - jmp .loop -.load_twolines: - ; h == 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - mova m6, m5 - jmp .loop -.load_threelines: - ; h > 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - ; third line loaded in main loop below - - ; main y loop for vertical filter -.loop_load: - ; load one line into m6. if that pixel is no longer available, do - ; nothing, since m6 still has the data from the previous line in it. 
We - ; try to structure the loop so that the common case is evaluated fastest - mova m6, [mptrq+384*6] -.loop: - paddw m0, m6 - paddw m7, m1, m5 - paddw m8, m2, m4 - punpcklwd m9, m0, m7 - punpckhwd m0, m7 - punpcklwd m7, m8, m3 - punpckhwd m8, m3 - pmaddwd m9, m10 - pmaddwd m0, m10 - pmaddwd m7, m11 - pmaddwd m8, m11 - add mptrq, 384*2 - paddd m7, m9 - paddd m0, m8 - paddd m7, m12 - paddd m0, m12 - psrad m7, 11 - psrad m0, 11 - packssdw m7, m0 - vextracti128 xm0, m7, 1 - packuswb xm7, xm0 - mova [dstptrq], xm7 - ; shift pixels one position - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - mova m4, m5 - mova m5, m6 - add dstptrq, strideq - dec yd - jg .loop_load - ; for the bottom pixels, continue using m6 (as extended edge) - cmp yd, ylimd - jg .loop - add midq, 32 - add dstq, 16 - sub wd, 16 - jg .loop_x +cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + vbroadcasti128 m6, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m7, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m8, [wiener_shufD] + add lpfq, wq + vpbroadcastd m9, [pw_m16380] + vpbroadcastd m10, [pw_2056] + lea t1, [rsp+wq*2+16] + mova m11, [wiener_l_shuf] + vpbroadcastd m14, [fltq+16] ; __ y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + mov [rsp+8*1], lpf_strideq + add r7, lpf_strideq + mov [rsp+8*0], r7 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+8*0] + call .hv_bottom + add lpfq, [rsp+8*1] + call .hv_bottom +.end: RET +.no_top: + lea r7, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r7, [r7+lpf_strideq*2] + mov [rsp+8*0], r7 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, dst_strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, m11 + jmp .h_main +.h_top: + mov r10, wq + movu m4, [lpfq+r10-4] + test edgeb, 1 ; LR_HAVE_LEFT + jnz .h_main + pshufb m4, m11 + jmp .h_main +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + paddsw 
m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, m11 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + mova m2, [t3+r10*2] + paddw m2, [t1+r10*2] + mova m3, [t2+r10*2] + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + paddw m4, m0, [t4+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+r10*2+32] + paddw m2, [t1+r10*2+32] + mova m3, [t2+r10*2+32] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 + paddw m4, m1, [t4+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, dst_strideq + ret +.v: + mov r10, wq + psrld m13, m14, 16 ; y1 __ +.v_loop: + mova m6, [t1+r10*2+ 0] + paddw m2, m6, [t3+r10*2+ 0] + mova m4, [t2+r10*2+ 0] + mova m7, [t1+r10*2+32] + paddw m3, m7, [t3+r10*2+32] + mova m5, [t2+r10*2+32] + paddw m6, [t4+r10*2+ 0] + paddw m7, [t4+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m7, m6 + pmaddwd m4, m5, m14 + punpckhwd m7, m6 + pmaddwd m6, m7, m14 + pmaddwd m5, m13 + pmaddwd m7, m13 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + REPX {psrad x, 11}, m0, m2, m1, m3 + packssdw m0, m2 + packssdw m1, m3 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret -INIT_YMM avx2 cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim mov xlimd, edgem movifnidn wd, wm diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm index b7d7f0ffaa..5d3ca49211 100644 --- a/src/x86/looprestoration_sse.asm +++ b/src/x86/looprestoration_sse.asm @@ -29,34 +29,33 @@ SECTION_RODATA 16 -pb_right_ext_mask: times 16 db 0xff - times 16 db 0 -pb_14x0_1_2: times 14 db 0 - db 1, 2 -pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 - db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 -pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 -pb_0: times 16 db 0 -pb_2: times 16 db 2 -pb_3: times 16 db 3 
-pb_4: times 16 db 4 -pb_15: times 16 db 15 -pb_0_1: times 8 db 0, 1 -pb_6_7: times 8 db 6, 7 -pb_14_15: times 8 db 14, 15 -pw_1: times 8 dw 1 -pw_16: times 8 dw 16 -pw_128: times 8 dw 128 -pw_255: times 8 dw 255 -pw_256: times 8 dw 256 -pw_2048: times 8 dw 2048 -pw_16380: times 8 dw 16380 -pw_5_6: times 4 dw 5, 6 -pd_1024: times 4 dd 1024 +wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 +wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +pb_unpcklwdw: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 + +pb_right_ext_mask: times 24 db 0xff + times 8 db 0 +pb_0: times 16 db 0 +pb_3: times 16 db 3 +pb_15: times 16 db 15 +pb_0_1: times 8 db 0, 1 +pb_14_15: times 8 db 14, 15 +pw_1: times 8 dw 1 +pw_16: times 8 dw 16 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_2056: times 8 dw 2056 +pw_m16380: times 8 dw -16380 +pw_5_6: times 4 dw 5, 6 +pd_1024: times 4 dd 1024 %if ARCH_X86_32 -pd_256: times 4 dd 256 -pd_512: times 4 dd 512 -pd_2048: times 4 dd 2048 +pd_512: times 4 dd 512 +pd_2048: times 4 dd 2048 %endif pd_0xF0080029: times 4 dd 0xF0080029 pd_0xF00801C7: times 4 dd 0XF00801C7 @@ -95,539 +94,1037 @@ SECTION .text %define PIC_sym(sym) (sym) %endif -%macro PALIGNR 4 ; dst, src1, src2, shift - %if cpuflag(ssse3) - palignr %1, %2, %3, %4 - %else - %assign %%i regnumof%+%1 + 1 - %define %%tmp m %+ %%i - psrldq %1, %3, %4 - pslldq %%tmp, %2, 16-%4 - por %1, %%tmp - %endif -%endmacro - -%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero - %if cpuflag(ssse3) - pmaddubsw %1, %2 - %else - %if %5 == 1 - pxor %3, %3 - %endif - punpckhbw %4, %1, %3 - punpcklbw %1, %3 - pmaddwd %4, %2 - pmaddwd %1, %2 - packssdw %1, %4 - %endif -%endmacro - -;;;;;;;;;;;;;;;;;;;;;; -;; wiener ;; -;;;;;;;;;;;;;;;;;;;;;; - -%macro WIENER_H 0 +%macro WIENER 0 %if ARCH_X86_64 -cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, flt, w, h, edge - mov edged, edgem - movifnidn wd, wm - mov hd, hm +DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers +cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x + %define base 0 + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + movq m14, [fltq] + add lpfq, wq + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + add dstq, wq + movq m7, [fltq+16] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m8, [wiener_shufA] + pshufd m12, m14, q2222 ; x0 x0 + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq m14, m14 ; x3 + mova m11, [wiener_shufD] %else -cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, flt, w, h, edge - mov r5, edgem - mov [esp+12], r5 - mov wd, wm - mov hd, hm - SETUP_PIC hd - %define m15 m0 - %define m14 m1 - %define m13 m2 - %define m12 m3 + mova m10, [pw_m16380] + punpcklwd m14, m14 + pshufd m11, m14, q0000 ; x0 + pshufd m12, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 %endif - - movq m15, [fltq] +%else +DECLARE_REG_TMP 4, 0, _, 5 %if cpuflag(ssse3) - pshufb m12, m15, [PIC_sym(pb_6_7)] - pshufb m13, m15, [PIC_sym(pb_4)] - pshufb m14, m15, [PIC_sym(pb_2)] - pshufb m15, m15, [PIC_sym(pb_0)] + %define m10 [base+wiener_shufC] + %define m11 
[base+wiener_shufD] + %define stk_off 96 %else - pshuflw m12, m15, q3333 - punpcklbw m15, m15 - pshufhw m13, m15, q0000 - pshuflw m14, m15, q2222 - pshuflw m15, m15, q0000 - punpcklqdq m12, m12 - punpckhqdq m13, m13 - punpcklqdq m14, m14 - punpcklqdq m15, m15 - psraw m13, 8 - psraw m14, 8 - psraw m15, 8 + %define m10 [base+pw_m16380] + %define m11 [stk+96] + %define stk_off 112 %endif - -%if ARCH_X86_64 - mova m11, [pw_2048] - mova m10, [pw_16380] - lea r11, [pb_right_ext_mask] - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim +cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride + %define base r6-pb_right_ext_mask-21 + %define stk esp + %define dstq leftq + %define edgeb byte edged + %define edged [stk+ 8] + %define dstmp [stk+12] + %define hd dword [stk+16] + %define wq [stk+20] + %define dst_strideq [stk+24] + %define leftmp [stk+28] + %define t2 [stk+32] + %define t4 [stk+36] + %define t5 [stk+40] + %define t6 [stk+44] + %define m8 [base+wiener_shufA] + %define m9 [base+wiener_shufB] + %define m12 [stk+48] + %define m13 [stk+64] + %define m14 [stk+80] + %define m15 [base+pw_2056] + mov r1, r7m ; flt + mov r0, r0m ; dst + mov r5, r5m ; w + mov lpfq, lpfm + mov r2, r8m ; edge + mov r4, r6m ; h + movq m3, [r1+ 0] + movq m7, [r1+16] + add r0, r5 + mov r1, r1m ; dst_stride + add lpfq, r5 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r5*2+stk_off] + mov hd, r4 + neg r5 + mov lpf_strideq, lpf_stridem + LEA r6, pb_right_ext_mask+21 + mov wq, r5 + mov dst_strideq, r1 + mov leftmp, r2 +%if cpuflag(ssse3) + pshufb m3, [base+wiener_init] + pshufd m1, m3, q2222 + pshufd m2, m3, q3333 + punpcklqdq m3, m3 %else - %define m10 [PIC_sym(pw_16380)] - %define m11 [PIC_sym(pw_2048)] - %define m12 [esp+0x14] - %define m13 [esp+0x24] - %define m14 [esp+0x34] - %define m15 [esp+0x44] - mova m12, m3 - mova m13, m2 - mova m14, m1 - mova m15, m0 - - DEFINE_ARGS dst, left, src, stride, x, w, h, edge - %define srcptrq srcq - %define dstptrq dstq - %define hd dword [esp+ 0] - %define edgeb byte [esp+12] - %define xlimd dword [esp+16] -%endif - - ; if (edge & has_right) align_w_to_16 - ; else w -= 3, and use that as limit in x loop - test edgeb, 2 ; has_right - jnz .align - mov xlimd, -3 - jmp .loop -.align: - add wd, 15 - and wd, ~15 + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m11, m0 +%endif + mova m12, m1 + mova m13, m2 + mova m14, m3 +%endif + pshufd m6, m7, q0000 ; y0 y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + add t3, lpf_strideq + mov [rsp+gprsize*0], t3 ; below + mov t4, t1 + add t1, 384*2 + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp+gprsize*0] + call .hv_bottom + add lpfq, [rsp+gprsize*1] + call .hv_bottom +.v1: + call mangle(private_prefix %+ _wiener_filter7_ssse3).v + RET +.no_top: + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + lea t3, [t3+lpf_strideq*2] + mov [rsp+gprsize*0], t3 + call .h + mov t6, t1 + mov t5, t1 + mov 
t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call mangle(private_prefix %+ _wiener_filter7_ssse3).v +.v2: + call mangle(private_prefix %+ _wiener_filter7_ssse3).v + jmp .v1 +.extend_right: + movd m2, [lpfq-4] %if ARCH_X86_64 - xor xlimd, xlimd + push r0 + lea r0, [pb_right_ext_mask+21] + movu m0, [r0+xq+0] + movu m1, [r0+xq+8] + pop r0 %else - mov xlimd, 0 + movu m0, [r6+xq+0] + movu m1, [r6+xq+8] %endif - - ; main y loop for vertical filter -.loop: -%if ARCH_X86_64 - mov srcptrq, srcq - mov dstptrq, dstq - lea xd, [wq+xlimq] +%if cpuflag(ssse3) + pshufb m2, [base+pb_3] %else - mov [esp+8], srcq - mov [esp+4], dstq - mov xd, xlimd - add xd, wd + punpcklbw m2, m2 + pshuflw m2, m2, q3333 + punpcklqdq m2, m2 %endif - - ; load left edge pixels - test edgeb, 1 ; have_left - jz .emu_left - test leftq, leftq ; left == NULL for the edge-extended bottom/top - jz .load_left_combined - movd m0, [leftq] - movd m1, [srcq] - punpckldq m0, m1 - pslldq m0, 9 - add leftq, 4 - jmp .left_load_done -.load_left_combined: - movq m0, [srcq-3] - pslldq m0, 10 - jmp .left_load_done -.emu_left: - movd m0, [srcq] + pand m4, m0 + pand m5, m1 + pandn m0, m2 + pandn m1, m2 + por m4, m0 + por m5, m1 + ret +.h: + %define stk esp+4 ; offset due to call + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: %if cpuflag(ssse3) - pshufb m0, [PIC_sym(pb_14x0_1_2)] + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] %else - pslldq m1, m0, 13 - punpcklbw m0, m0 - pshuflw m0, m0, q0000 - punpcklqdq m0, m0 - psrldq m0, 2 - por m0, m1 -%endif - - ; load right edge pixels -.left_load_done: - cmp xd, 16 - jg .main_load - test xd, xd - jg .load_and_splat - je .splat_right - - ; for very small images (w=[1-2]), edge-extend the original cache, - ; ugly, but only runs in very odd cases + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -18 + jl .h_have_right + call .extend_right +.h_have_right: +%macro %%h7 0 %if cpuflag(ssse3) - add wd, wd - %if ARCH_X86_64 - pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16] - %else - pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16] - %endif - shr wd, 1 + pshufb m0, m4, m8 + pmaddubsw m0, m12 + pshufb m1, m5, m8 + pmaddubsw m1, m12 + pshufb m2, m4, m9 + pmaddubsw m2, m13 + pshufb m3, m5, m9 + pmaddubsw m3, m13 + paddw m0, m2 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + paddw m1, m3 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m0, m2 + mova m2, [base+pw_m16380] + paddw m1, m3 + paddw m4, m2 + paddw m5, m2 + paddsw m0, m4 + paddsw m1, m5 %else - shl wd, 4 - pcmpeqd m2, m2 - movd m3, wd - psrldq m2, 2 - punpckhbw m1, m0, m0 - pshufhw m1, m1, q1122 - psllq m1, m3 - pand m0, m2 - pandn m2, m1 - por m0, m2 - shr wd, 4 + psrldq m0, m4, 1 + pslldq m1, m4, 1 + pxor m3, m3 + 
punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + psrldq m1, m4, 2 + pslldq m2, m4, 2 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m12 + paddw m0, m1 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m10 + paddsw m0, m2 + psrldq m1, m5, 1 + pslldq m2, m5, 1 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m11 + psrldq m2, m5, 2 + pslldq m4, m5, 2 + punpcklbw m2, m3 + punpckhbw m4, m3 + paddw m2, m4 + pmullw m2, m12 + paddw m1, m2 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m10 + paddsw m1, m4 %endif - - ; main x loop, mostly this starts in .main_load -.splat_right: - ; no need to load new pixels, just extend them from the (possibly previously - ; extended) previous load into m0 +%endmacro + %%h7 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: %if cpuflag(ssse3) - pshufb m1, m0, [PIC_sym(pb_15)] + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] %else - punpckhbw m1, m0, m0 - pshufhw m1, m1, q3333 - punpckhqdq m1, m1 -%endif - jmp .main_loop -.load_and_splat: - ; load new pixels and extend edge for right-most - movu m1, [srcptrq+3] + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + %%h7 %if ARCH_X86_64 - sub r11, xq - movu m2, [r11+16] - add r11, xq + mova m2, [t4+xq*2] + paddw m2, [t2+xq*2] %else - sub PIC_reg, xd - movu m2, [PIC_sym(pb_right_ext_mask)+16] - add PIC_reg, xd + mov r2, t4 + mova m2, [r2+xq*2] + mov r2, t2 + paddw m2, [r2+xq*2] + mov r2, t5 %endif - movd m3, [srcptrq+2+xq] -%if cpuflag(ssse3) - pshufb m3, [PIC_sym(pb_0)] + mova m3, [t3+xq*2] +%if ARCH_X86_64 + mova m5, [t5+xq*2] %else - punpcklbw m3, m3 - pshuflw m3, m3, q0000 - punpcklqdq m3, m3 -%endif - pand m1, m2 - pxor m2, [PIC_sym(pb_right_ext_mask)] - pand m3, m2 - pxor m2, [PIC_sym(pb_right_ext_mask)] - por m1, m3 - jmp .main_loop -.main_load: - ; load subsequent line - movu m1, [srcptrq+3] -.main_loop: + mova m5, [r2+xq*2] + mov r2, t6 +%endif + paddw m5, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 %if ARCH_X86_64 - PALIGNR m2, m1, m0, 10 - PALIGNR m3, m1, m0, 11 - PALIGNR m4, m1, m0, 12 - PALIGNR m5, m1, m0, 13 - PALIGNR m6, m1, m0, 14 - PALIGNR m7, m1, m0, 15 - - punpcklbw m0, m2, m1 - punpckhbw m2, m1 - punpcklbw m8, m3, m7 - punpckhbw m3, m7 - punpcklbw m7, m4, m6 - punpckhbw m4, m6 - PMADDUBSW m0, m15, m6, m9, 1 - PMADDUBSW m2, m15, m6, m9, 0 - PMADDUBSW m8, m14, m6, m9, 0 - PMADDUBSW m3, m14, m6, m9, 0 - PMADDUBSW m7, m13, m6, m9, 0 - PMADDUBSW m4, m13, m6, m9, 0 - paddw 
m0, m8 - paddw m2, m3 - %if cpuflag(ssse3) - pxor m6, m6 - %endif - punpcklbw m3, m5, m6 - punpckhbw m5, m6 - psllw m8, m3, 7 - psllw m6, m5, 7 - psubw m8, m10 - psubw m6, m10 - pmullw m3, m12 - pmullw m5, m12 - paddw m0, m7 - paddw m2, m4 - paddw m0, m3 - paddw m2, m5 - paddsw m0, m8 ; see the avx2 for an explanation - paddsw m2, m6 ; of how the clipping works here - psraw m0, 3 - psraw m2, 3 - paddw m0, m11 - paddw m2, m11 - mova [dstptrq+ 0], m0 - mova [dstptrq+16], m2 + paddw m4, m0, [t6+xq*2] %else - PALIGNR m2, m1, m0, 10 - punpcklbw m3, m2, m1 - punpckhbw m2, m1 - PMADDUBSW m3, m15, m4, m5, 1 - PMADDUBSW m2, m15, m4, m5, 0 - PALIGNR m4, m1, m0, 11 - PALIGNR m5, m1, m0, 15 - punpcklbw m6, m4, m5 - punpckhbw m4, m5 - PMADDUBSW m6, m14, m5, m7, 1 - PMADDUBSW m4, m14, m5, m7, 0 - paddw m3, m6 - paddw m2, m4 - PALIGNR m4, m1, m0, 12 - PALIGNR m5, m1, m0, 14 - punpcklbw m6, m4, m5 - punpckhbw m4, m5 - PMADDUBSW m6, m13, m5, m7, 1 - PMADDUBSW m4, m13, m5, m7, 0 - paddw m3, m6 - paddw m2, m4 - PALIGNR m6, m1, m0, 13 - %if cpuflag(ssse3) - pxor m5, m5 - %endif - punpcklbw m4, m6, m5 - punpckhbw m6, m5 - psllw m5, m4, 7 - psllw m7, m6, 7 - psubw m5, m10 - psubw m7, m10 - pmullw m4, m12 - pmullw m6, m12 - paddw m3, m4 - paddw m2, m6 - paddsw m3, m5 - paddsw m2, m7 - psraw m3, 3 - psraw m2, 3 - paddw m3, m11 - paddw m2, m11 - mova [dstptrq+ 0], m3 - mova [dstptrq+16], m2 -%endif - - mova m0, m1 - add srcptrq, 16 - add dstptrq, 32 - sub xd, 16 - cmp xd, 16 - jg .main_load - test xd, xd - jg .load_and_splat - cmp xd, xlimd - jg .splat_right + paddw m4, m0, [r2+xq*2] + mov r2, t4 +%endif + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m0, m3 + mova m3, [t3+xq*2+16] + paddd m4, m2 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] + mova m5, [t5+xq*2+16] +%else + mova m2, [r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t5 + mova m5, [r2+xq*2+16] + mov r2, t6 +%endif + paddw m5, [t1+xq*2+16] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 +%if ARCH_X86_64 + paddw m4, m1, [t6+xq*2+16] +%else + paddw m4, m1, [r2+xq*2+16] + mov dstq, dstmp +%endif + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, dst_strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, r1 +%endif + ret +%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code +.v: + mov xq, wq +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+xq*2] + paddw m1, [t2+xq*2] +%else + mov r2, t4 + mova m1, [r2+xq*2] + mov r2, t2 + paddw m1, [r2+xq*2] + mov r2, t6 +%endif + mova m2, [t3+xq*2] + mova m4, [t1+xq*2] +%if ARCH_X86_64 + paddw m3, m4, [t6+xq*2] + paddw m4, [t5+xq*2] +%else + paddw m3, m4, [r2+xq*2] + mov r2, t5 + paddw m4, [r2+xq*2] + mov r2, t4 +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3, m4 + pmaddwd m2, m6 + punpckhwd m3, m4 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] +%else + mova m2, 
[r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t6 +%endif + mova m3, [t3+xq*2+16] + mova m5, [t1+xq*2+16] +%if ARCH_X86_64 + paddw m4, m5, [t6+xq*2+16] + paddw m5, [t5+xq*2+16] +%else + paddw m4, m5, [r2+xq*2+16] + mov r2, t5 + paddw m5, [r2+xq*2+16] + movifnidn dstq, dstmp +%endif + psrad m0, 11 + psrad m1, 11 + packssdw m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + add dstq, dst_strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 +%endif + mov t4, t3 + mov t3, t2 + mov t2, t1 + ret +%endif -%if ARCH_X86_32 - mov srcq, [esp+8] - mov dstq, [esp+4] +%if ARCH_X86_64 +cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x + mov fltq, fltmp + mov edged, r8m + mov wd, wm + mov hd, r6m + movq m14, [fltq] + add lpfq, wq + mova m8, [pw_m16380] + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + add dstq, wq + movq m7, [fltq+16] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq m14, m14 ; x3 + mova m11, [wiener_shufD] + mova m12, [wiener_l_shuf] +%else + punpcklwd m14, m14 + pshufd m11, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 %endif - add srcq, strideq - add dstq, 384*2 - dec hd - jg .loop +%else +%if cpuflag(ssse3) + %define stk_off 80 +%else + %define m11 [stk+80] + %define stk_off 96 +%endif +cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride + %define stk esp + %define leftmp [stk+28] + %define m8 [base+pw_m16380] + %define m12 [base+wiener_l_shuf] + %define m14 [stk+48] + mov r1, r7m ; flt + mov r0, r0m ; dst + mov r5, r5m ; w + mov lpfq, lpfm + mov r2, r8m ; edge + mov r4, r6m ; h + movq m2, [r1+ 0] + movq m7, [r1+16] + add r0, r5 + mov r1, r1m ; dst_stride + add lpfq, r5 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r5*2+stk_off] + mov hd, r4 + neg r5 + mov lpf_strideq, lpf_stridem + LEA r6, pb_right_ext_mask+21 + mov wq, r5 + mov dst_strideq, r1 + mov leftmp, r2 +%if cpuflag(ssse3) + pshufb m2, [base+wiener_init] + pshufd m1, m2, q3333 + punpcklqdq m2, m2 +%else + punpcklwd m2, m2 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m11, m0 +%endif + mova m13, m1 + mova m14, m2 +%endif + pshufd m6, m7, q0000 ; __ y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea xq, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov t3, t1 + add t1, 384*2 + mov [rsp+gprsize*1], lpf_strideq + add xq, lpf_strideq + mov [rsp+gprsize*0], xq ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp+gprsize*0] + call .hv_bottom + add lpfq, [rsp+gprsize*1] + call .hv_bottom +.end: RET +.no_top: + lea t3, [lpfq+lpf_strideq*4] + mov lpfq, dstmp + mov [rsp+gprsize*1], lpf_strideq + lea t3, [t3+lpf_strideq*2] + mov [rsp+gprsize*0], t3 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, dst_strideq + 
add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call mangle(private_prefix %+ _wiener_filter5_ssse3).v + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + movifnidn dstmp, dstq +.v1: + call mangle(private_prefix %+ _wiener_filter5_ssse3).v + jmp .end +.h: + %define stk esp+4 + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -17 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right +.h_have_right: +%macro %%h5 0 +%if cpuflag(ssse3) + pshufb m0, m4, m9 + pmaddubsw m0, m13 + pshufb m1, m5, m9 + pmaddubsw m1, m13 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m4, m8 + paddw m5, m8 + paddw m0, m2 + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%else + psrldq m0, m4, 2 + pslldq m1, m4, 2 + pxor m3, m3 + punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m8 + paddsw m0, m2 + psrldq m1, m5, 2 + pslldq m4, m5, 2 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m11 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m8 + paddsw m1, m4 +%endif %endmacro - -%macro WIENER_V 0 + %%h5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, dst_strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -17 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right +.hv_have_right: + %%h5 + mova m2, [t3+xq*2] + paddw m2, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 %if ARCH_X86_64 -cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, flt, edge - mov edged, edgem - movifnidn fltq, fltmp - movifnidn hd, hm - movq m15, [fltq+16] - pshufd m14, m15, q1111 - pshufd 
m15, m15, q0000 - mova m12, [pd_1024] - - DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr - - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 - sub ylimd, 3 + mova m3, [t2+xq*2] + paddw m4, m0, [t4+xq*2] %else -cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, flt, edge - %define ylimd [esp+12] - - mov r5d, edgem - and r5d, 8 - shr r5d, 2 - sub r5d, 3 - mov ylimd, r5d - mov fltq, fltmp - mov edged, edgem - - SETUP_PIC edged - - movq m0, [fltq+16] - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - mova [esp+0x50], m0 - mova [esp+0x40], m1 - - DEFINE_ARGS dst, stride, mid, w, h, y, edge - %define mptrq midq - %define dstptrq dstq - %define edgeb byte [esp] + mov r2, t2 + mova m3, [r2+xq*2] + mov r2, t4 + paddw m4, m0, [r2+xq*2] %endif - - ; main x loop for vertical filter, does one column of 16 pixels -.loop_x: - mova m3, [midq] ; middle line - - ; load top pixels - test edgeb, 4 ; have_top - jz .emu_top - mova m0, [midq-384*4] - mova m2, [midq-384*2] - mova m1, m0 - jmp .load_bottom_pixels -.emu_top: - mova m0, m3 - mova m1, m3 - mova m2, m3 - - ; load bottom pixels -.load_bottom_pixels: - mov yd, hd + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+xq*2+16] + paddw m2, [t1+xq*2+16] + psrad m0, 11 + psrad m4, 11 + packssdw m0, m4 %if ARCH_X86_64 - mov mptrq, midq - mov dstptrq, dstq - add yd, ylimd + mova m3, [t2+xq*2+16] + paddw m4, m1, [t4+xq*2+16] %else - mov [esp+8], midq - mov [esp+4], dstq - add yd, ylimd + paddw m4, m1, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp %endif - jg .load_threelines - - ; the remainder here is somewhat messy but only runs in very weird - ; circumstances at the bottom of the image in very small blocks (h=[1-3]), - ; so performance is not terribly important here... - je .load_twolines - cmp yd, -1 - je .load_oneline - ; h == 1 case - mova m5, m3 - mova m4, m3 - mova m6, m3 - jmp .loop -.load_oneline: - ; h == 2 case - mova m4, [midq+384*2] - mova m5, m4 - mova m6, m4 - jmp .loop -.load_twolines: - ; h == 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - mova m6, m5 - jmp .loop -.load_threelines: - ; h > 3 case - mova m4, [midq+384*2] - mova m5, [midq+384*4] - ; third line loaded in main loop below - - ; main y loop for vertical filter -.loop_load: - ; load one line into m6. if that pixel is no longer available, do - ; nothing, since m6 still has the data from the previous line in it. 
We - ; try to structure the loop so that the common case is evaluated fastest - mova m6, [mptrq+384*6] -.loop: + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, dst_strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + movifnidn dstmp, dstq + ret +%if cpuflag(ssse3) +.v: + mov xq, wq +.v_loop: + mova m3, [t1+xq*2] + paddw m1, m3, [t3+xq*2] %if ARCH_X86_64 - paddw m7, m0, m6 - paddw m8, m1, m5 - paddw m9, m2, m4 - punpcklwd m10, m7, m8 - punpckhwd m7, m8 - punpcklwd m11, m9, m3 - punpckhwd m9, m3 - pmaddwd m10, m15 - pmaddwd m7, m15 - pmaddwd m11, m14 - pmaddwd m9, m14 - paddd m10, m12 - paddd m7, m12 - paddd m10, m11 - paddd m7, m9 - psrad m10, 11 - psrad m7, 11 - packssdw m10, m7 - packuswb m10, m10 - movq [dstptrq], m10 + mova m2, [t2+xq*2] + paddw m3, [t4+xq*2] %else - mova [esp+0x30], m1 - mova [esp+0x20], m2 - mova [esp+0x10], m3 - paddw m0, m6 - paddw m1, m5 - paddw m2, m4 - punpcklwd m7, m2, m3 - punpckhwd m2, m3 - punpcklwd m3, m0, m1 - punpckhwd m0, m1 - mova m1, [esp+0x50] - pmaddwd m3, m1 - pmaddwd m0, m1 - mova m1, [esp+0x40] - pmaddwd m7, m1 - pmaddwd m2, m1 - paddd m3, [PIC_sym(pd_1024)] - paddd m0, [PIC_sym(pd_1024)] - paddd m3, m7 - paddd m0, m2 - psrad m3, 11 - psrad m0, 11 - packssdw m3, m0 - packuswb m3, m3 - movq [dstq], m3 - mova m1, [esp+0x30] - mova m2, [esp+0x20] - mova m3, [esp+0x10] -%endif - ; shift pixels one position - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - mova m4, m5 - mova m5, m6 - add mptrq, 384*2 - add dstptrq, strideq - dec yd - jg .loop_load - ; for the bottom pixels, continue using m6 (as extended edge) - cmp yd, ylimd - jg .loop - -%if ARCH_X86_32 - mov midq, [esp+8] - mov dstq, [esp+4] + mov r2, t2 + mova m2, [r2+xq*2] + mov r2, t4 + paddw m3, [r2+xq*2] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3 + pmaddwd m2, m6 + punpckhwd m3, m3 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 + mova m4, [t1+xq*2+16] + paddw m2, m4, [t3+xq*2+16] +%if ARCH_X86_64 + mova m3, [t2+xq*2+16] + paddw m4, [t4+xq*2+16] +%else + paddw m4, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp +%endif + psrad m0, 11 + psrad m1, 11 + packssdw m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + psrad m1, 11 + psrad m2, 11 + packssdw m1, m2 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + ret %endif - add midq, 16 - add dstq, 8 - sub wd, 8 - jg .loop_x - RET %endmacro INIT_XMM sse2 -WIENER_H -WIENER_V +WIENER INIT_XMM ssse3 -WIENER_H -WIENER_V +WIENER ;;;;;;;;;;;;;;;;;;;;;;;;;; ;; self-guided ;; @@ -698,7 +1195,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim neg xq mov wq, xq %if ARCH_X86_64 - lea r10, [pb_right_ext_mask+16] + lea r10, [pb_right_ext_mask+24] %endif .loop_y: mov xq, wq @@ -734,7 +1231,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim %if ARCH_X86_64 movu m4, [r10+xq*2] %else - movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] + movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] %endif pand m2, m4 pandn m4, m3 @@ -1132,7 +1629,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, 
-144, t, src, stride, a, b, x, y psubw m1, m4 ; aa movq m0, [srcq] XCHG_PIC_REG - punpcklbw m0, [PIC_sym(pb_right_ext_mask)+16] + punpcklbw m0, [PIC_sym(pb_0)] punpcklwd m4, m1, [PIC_sym(pw_16)] punpckhwd m1, [PIC_sym(pw_16)] punpcklwd m2, m0, [PIC_sym(pw_16)] @@ -1266,7 +1763,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge lea sumsqq, [sumsqq+wq*4-4] neg wq %if ARCH_X86_64 - lea r10, [pb_right_ext_mask+16] + lea r10, [pb_right_ext_mask+24] %else mov wm, xd %define wq wm @@ -1313,7 +1810,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge %if ARCH_X86_64 movu m4, [r10+xq*2] %else - movu m4, [PIC_sym(pb_right_ext_mask+16)+xd*2] + movu m4, [PIC_sym(pb_right_ext_mask)+xd*2+24] XCHG_PIC_REG %endif pand m2, m4 @@ -1880,6 +2377,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y %endif RET +%undef t2 cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt movifnidn wd, wm movd m0, wtm From ed676b0a5db78cc710d91927df01ca9f19e10adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 9 Dec 2020 14:06:47 +0200 Subject: [PATCH 082/155] arm: loopfilter: Compare L != 0 before doing a splat --- src/arm/32/loopfilter.S | 2 +- src/arm/64/loopfilter.S | 2 +- src/arm/64/loopfilter16.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arm/32/loopfilter.S b/src/arm/32/loopfilter.S index 25f993d176..331bdbdbb1 100644 --- a/src/arm/32/loopfilter.S +++ b/src/arm/32/loopfilter.S @@ -783,11 +783,11 @@ function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 vld1.8 {d6[]}, [r5] // sharp[1] sub r5, r5, #8 vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] + vtst.32 d2, d1, d2 // L != 0 vmul.i32 d1, d1, d4 // L .ifc \type, y vdup.32 d15, r2 // vmask[2] .endif - vtst.32 d2, d1, d2 // L != 0 vdup.32 d14, r7 // vmask[1] vmov r10, r11, d2 orrs r10, r10, r11 diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S index d45f2085a3..b26d954555 100644 --- a/src/arm/64/loopfilter.S +++ b/src/arm/64/loopfilter.S @@ -1034,11 +1034,11 @@ function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 ld1r {v6.16b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] + cmtst v2.4s, v1.4s, v2.4s // L != 0 mul v1.4s, v1.4s, v4.4s // L .ifc \type, y dup v15.4s, w2 // vmask[2] .endif - cmtst v2.4s, v1.4s, v2.4s // L != 0 dup v14.4s, w7 // vmask[1] mov x16, v2.d[0] mov x17, v2.d[1] diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index a7319189d0..147a93d028 100644 --- a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -808,11 +808,11 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 ld1r {v6.8b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] + cmtst v2.2s, v1.2s, v2.2s // L != 0 mul v1.2s, v1.2s, v4.2s // L .ifc \type, y dup v15.2s, w2 // vmask[2] .endif - cmtst v2.2s, v1.2s, v2.2s // L != 0 dup v14.2s, w7 // vmask[1] mov x16, v2.d[0] cmp x16, #0 From feb209f00410a2ad5a6914667019250c7d7d0904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 10 Dec 2020 12:43:07 +0200 Subject: [PATCH 083/155] arm32: loopfilter: Fix a misindented/aligned operand --- src/arm/32/loopfilter.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/32/loopfilter.S b/src/arm/32/loopfilter.S index 331bdbdbb1..97b960534f 100644 --- a/src/arm/32/loopfilter.S +++ b/src/arm/32/loopfilter.S @@ -515,7 +515,7 @@ function lpf_v_8_8_neon lpf_8_wd8 sub r10, r0, r1, lsl #1 - sub r10, r10, r1 + sub 
r10, r10, r1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 From 54322ed08eec141c983aec441ea58dcfbc81f2f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 9 Dec 2020 11:44:19 +0200 Subject: [PATCH 084/155] arm64: loopfilter16: Fix conditions for skipping parts of the filtering As the arm64 16 bpc loopfilter operates on a 8 pixel region at a time, inspect 2 bits (corresponding to 4 pixels each) from these registers, as we also shift them down by 2 bits at the end of the loop. This should allow skipping the loopfilter altogether (or using a smaller filter) in more cases. --- src/arm/64/loopfilter16.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index 147a93d028..e703020555 100644 --- a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -785,7 +785,7 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 orr w6, w6, w7 // vmask[0] |= vmask[1] 1: - tst w6, #0x0f + tst w6, #0x03 .ifc \dir, v ld1 {v0.8b}, [x4], #8 ld1 {v1.8b}, [x3], #8 @@ -847,14 +847,14 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 ushl v10.8h, v10.8h, v31.8h .ifc \type, y - tst w2, #0x0f + tst w2, #0x03 b.eq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif - tst w7, #0x0f + tst w7, #0x03 b.eq 3f .ifc \type, y // wd8 From 530e304a20273ded0d4e5cbe00f0381b79882439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 2 Dec 2020 12:03:23 +0200 Subject: [PATCH 085/155] arm32: loopfilter: NEON implementation of loopfilter for 16 bpc This operates on 4 pixels as a time, while the arm64 version operated on 8 pixels at a time. As the registers only fit one single 4 pixel wide slice (with one single set of input parameters and mask bits), the high level logic for calculating those input parameters is done with GPRs and scalar instructions instead of SIMD as in the other implementations. --- src/arm/32/loopfilter16.S | 860 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 860 insertions(+) create mode 100644 src/arm/32/loopfilter16.S diff --git a/src/arm/32/loopfilter16.S b/src/arm/32/loopfilter16.S new file mode 100644 index 0000000000..e673075068 --- /dev/null +++ b/src/arm/32/loopfilter16.S @@ -0,0 +1,860 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +.macro loop_filter wd +function lpf_4_wd\wd\()_neon + vabd.u16 d0, d22, d23 // abs(p1 - p0) + vabd.u16 d1, d25, d24 // abs(q1 - q0) + vabd.u16 d2, d23, d24 // abs(p0 - q0) + vabd.u16 d3, d22, d25 // abs(p1 - q1) +.if \wd >= 6 + vabd.u16 d4, d21, d22 // abs(p2 - p1) + vabd.u16 d5, d26, d25 // abs(q2 - q1) +.endif +.if \wd >= 8 + vabd.u16 d6, d20, d21 // abs(p3 - p2) + vabd.u16 d7, d27, d26 // abs(q3 - q3) +.endif +.if \wd >= 6 + vmax.u16 d4, d4, d5 +.endif + vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vshr.u16 d3, d3, #1 +.if \wd >= 8 + vmax.u16 d4, d4, d6 +.endif + vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) + vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +.if \wd >= 6 + vmax.u16 d4, d0, d4 + vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I +.else + vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I +.endif + vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E + vand d1, d1, d2 // fm && wd >= 4 (implicit) +.if \wd >= 6 + vmov d14, d1 // fm && wd > 4 (implicit) +.endif +.if \wd >= 16 + vmov d15, d1 // fm && wd == 16 (implicit) +.endif + + vmov r10, r11, d1 + orrs r10, r10, r11 + beq 9f // if (!fm || wd < 4) return; + +.if \wd >= 6 + vmov.i16 d10, #1 + vabd.u16 d2, d21, d23 // abs(p2 - p0) + vabd.u16 d3, d22, d23 // abs(p1 - p0) + vabd.u16 d4, d25, d24 // abs(q1 - q0) + vabd.u16 d5, d26, d24 // abs(q2 - q0) + vdup.16 d9, r9 // bitdepth_min_8 +.if \wd >= 8 + vabd.u16 d6, d20, d23 // abs(p3 - p0) + vabd.u16 d7, d27, d24 // abs(q3 - q0) +.endif + vmax.u16 d2, d2, d3 + vmax.u16 d4, d4, d5 +.if \wd >= 8 + vmax.u16 d6, d6, d7 +.endif + vmax.u16 d2, d2, d4 + vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8 +.if \wd >= 8 + vmax.u16 d2, d2, d6 +.endif + +.if \wd == 16 + vabd.u16 d3, d17, d23 // abs(p6 - p0) + vabd.u16 d4, d18, d23 // abs(p5 - p0) + vabd.u16 d5, d19, d23 // abs(p4 - p0) +.endif + vcge.u16 d2, d10, d2 // flat8in +.if \wd == 16 + vabd.u16 d6, d28, d24 // abs(q4 - q0) + vabd.u16 d7, d29, d24 // abs(q5 - q0) + vabd.u16 d8, d30, d24 // abs(q6 - q0) +.endif + vand d14, d2, d14 // flat8in && fm && wd > 4 + vbic d1, d1, d14 // fm && wd >= 4 && !flat8in +.if \wd == 16 + vmax.u16 d3, d3, d4 + vmax.u16 d5, d5, d6 +.endif + vmov r10, r11, d1 +.if \wd == 16 + vmax.u16 d7, d7, d8 + vmax.u16 d3, d3, d5 + vmax.u16 d3, d3, d7 + vcge.u16 d3, d10, d3 // flat8out +.endif + orrs r10, r10, r11 +.if \wd == 16 + vand d15, d15, d3 // flat8out && fm && wd == 16 + vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 + vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out +.endif + beq 1f // skip wd == 4 case +.endif + + vdup.16 d3, r8 // bitdepth_max + vsub.u16 d2, d22, d25 // p1 - q1 + vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1 + vcgt.u16 d0, d0, d12 // hev + vmvn d9, d3 // - 128 * (1 << bitdepth_min_8) + vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1) + vmax.s16 d2, d2, d9 // iclip_diff(p1 - 
q1) + vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) + vsub.u16 d2, d24, d23 + vmov.i16 d6, #3 + vbic d0, d1, d0 // (fm && wd >= 4 && !hev) + vmul.i16 d2, d2, d6 + vmov.i16 d6, #4 + vadd.i16 d2, d2, d4 + vmin.s16 d2, d2, d3 // f = iclip_diff() + vmov.i16 d7, #3 + vmax.s16 d2, d2, d9 // f = iclip_diff() + vqadd.s16 d4, d6, d2 // f + 4 + vqadd.s16 d5, d7, d2 // f + 3 + vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) + vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) + vshr.s16 d4, d4, #3 // f1 + vshr.s16 d5, d5, #3 // f2 + vmov.i16 d9, #0 + vdup.16 d3, r8 // bitdepth_max + vqadd.s16 d2, d23, d5 // p0 + f2 + vqsub.s16 d6, d24, d4 // q0 - f1 + vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1 + vmin.s16 d2, d2, d3 // out p0 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q0 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p0 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q0 = iclip_pixel() + vbit d23, d2, d1 // if (fm && wd >= 4) + vbit d24, d6, d1 // if (fm && wd >= 4) + vqadd.s16 d2, d22, d4 // p1 + f + vqsub.s16 d6, d25, d4 // q1 - f + vmin.s16 d2, d2, d3 // out p1 = iclip_pixel() + vmin.s16 d6, d6, d3 // out q1 = iclip_pixel() + vmax.s16 d2, d2, d9 // out p1 = iclip_pixel() + vmax.s16 d6, d6, d9 // out q1 = iclip_pixel() + vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) + vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) +1: + +.if \wd == 6 + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 2f // skip if there's no flat8in + + vadd.i16 d0, d21, d21 // p2 * 2 + vadd.i16 d2, d21, d22 // p2 + p1 + vadd.i16 d4, d22, d23 // p1 + p0 + vadd.i16 d6, d23, d24 // p0 + q0 + vadd.i16 d8, d0, d2 + vadd.i16 d10, d4, d6 + vadd.i16 d12, d24, d25 // q0 + q1 + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d0 + vadd.i16 d10, d25, d26 // q1 + q2 + vrshr.u16 d0, d8, #3 // out p1 + + vadd.i16 d8, d8, d12 + vsub.i16 d10, d10, d2 + vadd.i16 d12, d26, d26 // q2 + q2 + vrshr.u16 d1, d8, #3 // out p0 + + vadd.i16 d8, d8, d10 + vsub.i16 d12, d12, d4 + vrshr.u16 d2, d8, #3 // out q0 + + vbit d22, d0, d14 // p1 if (flat8in) + vadd.i16 d8, d8, d12 + vbit d23, d1, d14 // p0 if (flat8in) + vrshr.u16 d3, d8, #3 // out q1 + vbit d24, d2, d14 // q0 if (flat8in) + vbit d25, d3, d14 // q1 if (flat8in) +.elseif \wd >= 8 + vmov r10, r11, d14 + orrs r10, r10, r11 +.if \wd == 8 + beq 8f // skip if there's no flat8in +.else + beq 2f // skip if there's no flat8in +.endif + + vadd.i16 d0, d20, d21 // p3 + p2 + vadd.i16 d2, d22, d25 // p1 + q1 + vadd.i16 d4, d20, d22 // p3 + p1 + vadd.i16 d6, d23, d26 // p0 + q2 + vadd.i16 d8, d0, d0 // 2 * (p3 + p2) + vadd.i16 d9, d23, d24 // p0 + q0 + vadd.i16 d8, d8, d4 // + p3 + p1 + vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2 + vadd.i16 d8, d8, d9 // + p0 + q0 + vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1 + vrshr.u16 d10, d8, #3 // out p2 + + vadd.i16 d8, d8, d2 + vadd.i16 d0, d20, d23 // p3 + p0 + vadd.i16 d2, d24, d27 // q0 + q3 + vrshr.u16 d11, d8, #3 // out p1 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0 + vadd.i16 d4, d21, d24 // p2 + q0 + vadd.i16 d6, d25, d27 // q1 + q3 + vrshr.u16 d12, d8, #3 // out p0 + + vadd.i16 d8, d8, d2 + vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0 + vadd.i16 d0, d22, d25 // p1 + q1 + vadd.i16 d2, d26, d27 // q2 + q3 + vrshr.u16 d13, d8, #3 // out q0 + + vadd.i16 d8, d8, d6 + vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1 + vrshr.u16 d0, d8, #3 // out q1 + + vadd.i16 d8, d8, d2 + + vbit d21, d10, d14 + vbit d22, d11, d14 + vbit d23, d12, d14 + vrshr.u16 d1, d8, #3 // out q2 + vbit d24, d13, d14 + vbit d25, d0, d14 + vbit d26, d1, d14 +.endif +2: +.if \wd == 16 + 
vmov r10, r11, d15 + orrs r10, r10, r11 + bne 1f // check if flat8out is needed + vmov r10, r11, d14 + orrs r10, r10, r11 + beq 8f // if there was no flat8in, just write the inner 4 pixels + b 7f // if flat8in was used, write the inner 6 pixels +1: + + vadd.i16 d2, d17, d17 // p6 + p6 + vadd.i16 d4, d17, d18 // p6 + p5 + vadd.i16 d6, d17, d19 // p6 + p4 + vadd.i16 d8, d17, d20 // p6 + p3 + vadd.i16 d12, d2, d4 + vadd.i16 d10, d6, d8 + vadd.i16 d6, d17, d21 // p6 + p2 + vadd.i16 d12, d12, d10 + vadd.i16 d8, d17, d22 // p6 + p1 + vadd.i16 d10, d18, d23 // p5 + p0 + vadd.i16 d6, d6, d8 + vadd.i16 d8, d19, d24 // p4 + q0 + vadd.i16 d12, d12, d6 + vadd.i16 d10, d10, d8 + vadd.i16 d6, d20, d25 // p3 + q1 + vadd.i16 d12, d12, d10 + vsub.i16 d6, d6, d2 + vadd.i16 d2, d21, d26 // p2 + q2 + vrshr.u16 d0, d12, #4 // out p5 + vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1) + vsub.i16 d2, d2, d4 + vadd.i16 d4, d22, d27 // p1 + q3 + vadd.i16 d6, d17, d19 // p6 + p4 + vrshr.u16 d1, d12, #4 // out p4 + vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 + q2) + vsub.i16 d4, d4, d6 + vadd.i16 d6, d23, d28 // p0 + q4 + vadd.i16 d8, d17, d20 // p6 + p3 + vrshr.u16 d2, d12, #4 // out p3 + vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3) + vsub.i16 d6, d6, d8 + vadd.i16 d8, d24, d29 // q0 + q5 + vadd.i16 d4, d17, d21 // p6 + p2 + vrshr.u16 d3, d12, #4 // out p2 + vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4) + vsub.i16 d8, d8, d4 + vadd.i16 d6, d25, d30 // q1 + q6 + vadd.i16 d10, d17, d22 // p6 + p1 + vrshr.u16 d4, d12, #4 // out p1 + vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5) + vsub.i16 d6, d6, d10 + vadd.i16 d8, d26, d30 // q2 + q6 + vbif d0, d18, d15 // out p5 + vadd.i16 d10, d18, d23 // p5 + p0 + vrshr.u16 d5, d12, #4 // out p0 + vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6) + vsub.i16 d8, d8, d10 + vadd.i16 d10, d27, d30 // q3 + q6 + vbif d1, d19, d15 // out p4 + vadd.i16 d18, d19, d24 // p4 + q0 + vrshr.u16 d6, d12, #4 // out q0 + vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6) + vsub.i16 d10, d10, d18 + vadd.i16 d8, d28, d30 // q4 + q6 + vbif d2, d20, d15 // out p3 + vadd.i16 d18, d20, d25 // p3 + q1 + vrshr.u16 d7, d12, #4 // out q1 + vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6) + vsub.i16 d18, d8, d18 + vadd.i16 d10, d29, d30 // q5 + q6 + vbif d3, d21, d15 // out p2 + vadd.i16 d20, d21, d26 // p2 + q2 + vrshr.u16 d8, d12, #4 // out q2 + vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6) + vsub.i16 d10, d10, d20 + vadd.i16 d18, d30, d30 // q6 + q6 + vbif d4, d22, d15 // out p1 + vadd.i16 d20, d22, d27 // p1 + q3 + vrshr.u16 d9, d12, #4 // out q3 + vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6) + vsub.i16 d18, d18, d20 + vbif d5, d23, d15 // out p0 + vrshr.u16 d10, d12, #4 // out q4 + vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6) + vrshr.u16 d11, d12, #4 // out q5 + vbif d6, d24, d15 // out q0 + vbif d7, d25, d15 // out q1 + vbif d8, d26, d15 // out q2 + vbif d9, d27, d15 // out q3 + vbif d10, d28, d15 // out q4 + vbif d11, d29, d15 // out q5 +.endif + + bx lr +.if \wd == 16 +7: + // Return to a shorter epilogue, writing only the inner 6 pixels + bx r6 +.endif +.if \wd >= 8 +8: + // Return to a shorter epilogue, writing only the inner 4 pixels + bx r7 +.endif +9: + // Return directly without writing back any pixels + bx r12 +endfunc +.endm + +loop_filter 16 +loop_filter 8 +loop_filter 6 +loop_filter 4 + +.macro lpf_4_wd16 + adr r6, 7f + CONFIG_THUMB + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd16_neon +.endm + +.macro lpf_4_wd8 + adr r7, 8f + CONFIG_THUMB + bl lpf_4_wd8_neon +.endm + +.macro 
lpf_4_wd6 + bl lpf_4_wd6_neon +.endm + +.macro lpf_4_wd4 + bl lpf_4_wd4_neon +.endm + +function lpf_v_4_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_4_4_neon + mov r12, lr + sub r10, r0, #4 + add r0, r10, r1, lsl #1 + vld1.16 {d22}, [r10], r1 + vld1.16 {d24}, [r0], r1 + vld1.16 {d23}, [r10], r1 + vld1.16 {d25}, [r0], r1 + add r0, r0, #4 + + transpose_4x4h q11, q12, d22, d23, d24, d25 + + lpf_4_wd4 + + sub r10, r0, r1, lsl #2 + sub r10, r10, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + add r0, r10, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_6_4_neon + mov r12, lr + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + + lpf_4_wd6 + + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_6_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, :64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd6 + + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_8_4_neon + mov r12, lr + sub r10, r0, r1, lsl #2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d27}, [r0, :64], r1 // q3 + sub r0, r0, r1, lsl #2 + + lpf_4_wd8 + + sub r10, r0, r1, lsl #1 + sub r10, r10, r1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_8_4_neon + mov r12, lr + sub r10, r0, #8 + vld1.16 {d20}, [r10, :64], r1 + vld1.16 {d24}, [r0, :64], r1 + vld1.16 {d21}, [r10, :64], r1 + vld1.16 {d25}, [r0, :64], r1 + vld1.16 {d22}, [r10, :64], r1 + vld1.16 {d26}, [r0, 
:64], r1 + vld1.16 {d23}, [r10, :64], r1 + vld1.16 {d27}, [r0, :64], r1 + + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + lpf_4_wd8 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 {d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +function lpf_v_16_4_neon + mov r12, lr + + sub r10, r0, r1, lsl #3 + add r10, r10, r1 + vld1.16 {d17}, [r10, :64], r1 // p6 + vld1.16 {d24}, [r0, :64], r1 // q0 + vld1.16 {d18}, [r10, :64], r1 // p5 + vld1.16 {d25}, [r0, :64], r1 // q1 + vld1.16 {d19}, [r10, :64], r1 // p4 + vld1.16 {d26}, [r0, :64], r1 // q2 + vld1.16 {d20}, [r10, :64], r1 // p3 + vld1.16 {d27}, [r0, :64], r1 // q3 + vld1.16 {d21}, [r10, :64], r1 // p2 + vld1.16 {d28}, [r0, :64], r1 // q4 + vld1.16 {d22}, [r10, :64], r1 // p1 + vld1.16 {d29}, [r0, :64], r1 // q5 + vld1.16 {d23}, [r10, :64], r1 // p0 + vld1.16 {d30}, [r0, :64], r1 // q6 + sub r0, r0, r1, lsl #3 + add r0, r0, r1 + + lpf_4_wd16 + + sub r10, r0, r1, lsl #2 + sub r10, r10, r1, lsl #1 + vst1.16 {d0}, [r10, :64], r1 // p5 + vst1.16 {d6}, [r0, :64], r1 // q0 + vst1.16 {d1}, [r10, :64], r1 // p4 + vst1.16 {d7}, [r0, :64], r1 // q1 + vst1.16 {d2}, [r10, :64], r1 // p3 + vst1.16 {d8}, [r0, :64], r1 // q2 + vst1.16 {d3}, [r10, :64], r1 // p2 + vst1.16 {d9}, [r0, :64], r1 // q3 + vst1.16 {d4}, [r10, :64], r1 // p1 + vst1.16 {d10}, [r0, :64], r1 // q4 + vst1.16 {d5}, [r10, :64], r1 // p0 + vst1.16 {d11}, [r0, :64], r1 // q5 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + bx r12 +7: + sub r10, r0, r1 + sub r10, r10, r1, lsl #1 + vst1.16 {d21}, [r10, :64], r1 // p2 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d25}, [r0, :64], r1 // q1 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d26}, [r0, :64], r1 // q2 + sub r0, r0, r1, lsl #1 + sub r0, r0, r1 + bx r12 + +8: + sub r10, r0, r1, lsl #1 + vst1.16 {d22}, [r10, :64], r1 // p1 + vst1.16 {d24}, [r0, :64], r1 // q0 + vst1.16 {d23}, [r10, :64], r1 // p0 + vst1.16 {d25}, [r0, :64], r1 // q1 + sub r0, r0, r1, lsl #1 + bx r12 +endfunc + +function lpf_h_16_4_neon + mov r12, lr + sub r10, r0, #16 + sub r0, r0, #8 + vld1.16 {d16}, [r10, :64], r1 + vld1.16 {d20}, [r0, :64], r1 + vld1.16 {d17}, [r10, :64], r1 + vld1.16 {d21}, [r0, :64], r1 + vld1.16 {d18}, [r10, :64], r1 + vld1.16 {d22}, [r0, :64], r1 + vld1.16 {d19}, [r10, :64], r1 + vld1.16 {d23}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vld1.16 {d24}, [r10, :64], r1 + vld1.16 {d28}, [r0, :64], r1 + vld1.16 {d25}, [r10, :64], r1 + vld1.16 {d29}, [r0, :64], r1 + vld1.16 {d26}, [r10, :64], r1 + vld1.16 {d30}, [r0, :64], r1 + vld1.16 {d27}, [r10, :64], r1 + vld1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + lpf_4_wd16 + + sub r0, r0, r1, lsl #2 + transpose_4x4h q8, 
q0, d16, d17, d0, d1 + transpose_4x4h q1, q2, d2, d3, d4, d5 + transpose_4x4h q3, q4, d6, d7, d8, d9 + transpose_4x4h q5, q15, d10, d11, d30, d31 + sub r10, r0, #16 + sub r0, r0, #8 + + vst1.16 {d16}, [r10, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d17}, [r10, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d0}, [r10, :64], r1 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d1}, [r10, :64], r1 + vst1.16 {d5}, [r0, :64], r1 + sub r10, r10, r1, lsl #2 + sub r0, r0, r1, lsl #2 + add r10, r10, #16 + add r0, r0, #16 + vst1.16 {d6}, [r10, :64], r1 + vst1.16 {d10}, [r0, :64], r1 + vst1.16 {d7}, [r10, :64], r1 + vst1.16 {d11}, [r0, :64], r1 + vst1.16 {d8}, [r10, :64], r1 + vst1.16 {d30}, [r0, :64], r1 + vst1.16 {d9}, [r10, :64], r1 + vst1.16 {d31}, [r0, :64], r1 + sub r0, r0, #8 + + bx r12 + +7: + sub r0, r0, r1, lsl #2 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + sub r10, r0, #8 + + vst1.16 {d20}, [r10, :64], r1 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d21}, [r10, :64], r1 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d22}, [r10, :64], r1 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d23}, [r10, :64], r1 + vst1.16 {d27}, [r0, :64], r1 + bx r12 +8: + sub r0, r0, #4 + transpose_4x4h q11, q12, d22, d23, d24, d25 + sub r10, r0, r1, lsl #2 + sub r0, r0, r1, lsl #1 + + vst1.16 {d22}, [r10], r1 + vst1.16 {d24}, [r0], r1 + vst1.16 {d23}, [r10], r1 + vst1.16 {d25}, [r0], r1 + add r0, r0, #4 + bx r12 +endfunc + +// void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint32_t *const vmask, +// const uint8_t (*l)[4], ptrdiff_t b4_stride, +// const Av1FilterLUT *lut, const int w, +// const int bitdepth_max) + +.macro lpf_func dir, type +function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded + sub sp, sp, #8 + clz r9, r8 + rsb r9, r9, #24 // bitdepth_min_8 + ldrd r6, r7, [r2] // vmask[0], vmask[1] +.ifc \type, y + ldr r2, [r2, #8] // vmask[2] +.endif + add r5, r5, #128 // Move to sharp part of lut +.ifc \type, y + orr r7, r7, r2 // vmask[1] |= vmask[2] +.endif +.ifc \dir, v + sub r4, r3, r4, lsl #2 +.else + sub r3, r3, #4 + lsl r4, r4, #2 +.endif + orr r6, r6, r7 // vmask[0] |= vmask[1] + +1: + tst r6, #0x01 + strd r6, r7, [sp] +.ifc \dir, v + ldrb r10, [r4], #4 + ldrb r11, [r3], #4 +.else + ldrb r10, [r3] + ldrb r11, [r3, #4] + add r3, r3, r4 +.endif + beq 7f // if (!(vm & bits)) continue; + + orrs r12, r10, r11 + vdup.16 d31, r9 // bitdepth_min_8 + beq 7f // if (!(l[0][0] | l[offset][0])) continue; + cmp r11, #0 // Check for nonzero values in l[0][0] + ldrb r6, [r5], #8 // sharp[0] + it eq + moveq r11, r10 // if (!l[0][0]) L = l[offset][0] + ldrb r12, [r5] // sharp[1] + lsr r6, r11, r6 // L >> sharp[0] + sub r5, r5, #8 + cmp r12, r6 + lsr r10, r11, #4 // H + add r11, r11, #2 // L + 2 + it lt + movlt r6, r12 // imin(L >> sharp[0], sharp[1]) + add r11, r11, r11 // 2*(L + 2) + cmp r6, #1 + lsl r10, r10, r9 // H << bitdepth_min_8 + it lt + movlt r6, #1 // imax(imin(), 1) = limit = I + vdup.16 d12, r10 // H << bitdepth_min_8 + add r11, r11, r6 // 2*(L + 2) + limit = E + lsl r6, r6, r9 // I << bitdepth_min_8 + lsl r11, r11, r9 // E << bitdepth_min_8 + vdup.16 d11, r6 // I << bitdepth_min_8 + vdup.16 d10, r11 // E << bitdepth_min_8 + +.ifc \type, y + tst r2, #0x01 + beq 2f + // wd16 + bl lpf_\dir\()_16_4_neon + b 8f +2: +.endif + tst r7, #0x01 + beq 3f +.ifc \type, y + // wd8 + bl lpf_\dir\()_8_4_neon +.else + // wd6 
+ bl lpf_\dir\()_6_4_neon +.endif + b 8f +3: + // wd4 + bl lpf_\dir\()_4_4_neon +.ifc \dir, h + b 8f +7: + // For dir h, the functions above increment r0. + // If the whole function is skipped, increment it here instead. + add r0, r0, r1, lsl #2 +.else +7: +.endif +8: + ldrd r6, r7, [sp] +.ifc \type, y + lsr r2, r2, #1 // vmask[2] >>= 1 +.endif +.ifc \dir, v + add r0, r0, #8 +.else + // For dir h, r0 is returned incremented +.endif + lsrs r6, r6, #1 // vmask[0] >>= 1 + lsr r7, r7, #1 // vmask[1] >>= 1 + bne 1b + + add sp, sp, #8 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +lpf_func v, y +lpf_func h, y +lpf_func v, uv +lpf_func h, uv From 00e24945bd0fd7ec6bd78b3ae1bd1e611bb90bf1 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Tue, 15 Dec 2020 12:37:22 +0100 Subject: [PATCH 086/155] x86: Fix out-of-bounds read in AVX2 wiener_filter --- src/x86/looprestoration.asm | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm index cde5889ad3..8ebe230db9 100644 --- a/src/x86/looprestoration.asm +++ b/src/x86/looprestoration.asm @@ -200,11 +200,8 @@ cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ jmp .h_main .h_top: mov r10, wq - movu m4, [lpfq+r10-4] test edgeb, 1 ; LR_HAVE_LEFT - jnz .h_main - pshufb m4, [wiener_l_shuf] - jmp .h_main + jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: @@ -525,11 +522,8 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ jmp .h_main .h_top: mov r10, wq - movu m4, [lpfq+r10-4] test edgeb, 1 ; LR_HAVE_LEFT - jnz .h_main - pshufb m4, m11 - jmp .h_main + jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: From 98925ed63ee87a7637e5b8e3a95e240d8fb057ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 24 Nov 2020 12:46:24 +0200 Subject: [PATCH 087/155] arm32: Use ldrd for loading two parameters from the stack --- src/arm/32/ipred.S | 27 +++++++++------------------ src/arm/32/mc.S | 12 ++++-------- src/arm/32/mc16.S | 3 +-- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index 788c0625d5..e895696c7b 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -1422,8 +1422,7 @@ endfunc function ipred_filter_8bpc_neon, export=1 push {r4-r8, lr} movw r12, #511 - ldr r5, [sp, #28] - ldr r4, [sp, #24] + ldrd r4, r5, [sp, #24] and r5, r5, r12 // 511 movrel r6, X(filter_intra_taps) lsl r5, r5, #6 @@ -1594,8 +1593,7 @@ endfunc // const int w, const int h); function pal_pred_8bpc_neon, export=1 push {r4-r5, lr} - ldr r4, [sp, #12] - ldr r5, [sp, #16] + ldrd r4, r5, [sp, #12] vld1.16 {q0}, [r2, :128] clz lr, r4 adr r12, L(pal_pred_tbl) @@ -1706,8 +1704,7 @@ endfunc // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 push {r4-r8, lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_128_tbl) @@ -1834,8 +1831,7 @@ endfunc // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 push {r4-r8, lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_top_tbl) @@ -1898,8 +1894,7 @@ endfunc // const int16_t *ac, const int alpha); function ipred_cfl_left_8bpc_neon, export=1 push {r4-r8, lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 clz lr, r3 @@ -1970,8 +1965,7 @@ endfunc // const int16_t *ac, const int alpha); function 
ipred_cfl_8bpc_neon, export=1 push {r4-r8, lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 add r8, r3, r4 // width + height @@ -2134,8 +2128,7 @@ endfunc // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 push {r4-r8,lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 @@ -2447,8 +2440,7 @@ endfunc // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 push {r4-r8,lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 @@ -2678,8 +2670,7 @@ endfunc // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 push {r4-r8,lr} - ldr r4, [sp, #24] - ldr r5, [sp, #28] + ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index 1a12d93ad9..65ef62f7fc 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -71,8 +71,7 @@ .macro bidir_fn type function \type\()_8bpc_neon, export=1 push {r4-r6,lr} - ldr r4, [sp, #16] - ldr r5, [sp, #20] + ldrd r4, r5, [sp, #16] clz r4, r4 .ifnc \type, avg ldr lr, [sp, #24] @@ -220,10 +219,8 @@ bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 push {r4-r9,lr} - ldr r4, [sp, #28] - ldr r5, [sp, #32] - ldr r6, [sp, #36] - ldr r7, [sp, #40] + ldrd r4, r5, [sp, #28] + ldrd r6, r7, [sp, #36] clz r8, r4 adr r9, L(w_mask_\type\()_tbl) sub r8, r8, #24 @@ -455,8 +452,7 @@ w_mask_fn 420 function blend_8bpc_neon, export=1 push {r4-r5,lr} - ldr r4, [sp, #12] - ldr r5, [sp, #16] + ldrd r4, r5, [sp, #12] clz lr, r3 adr r3, L(blend_tbl) sub lr, lr, #26 diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index ca5e9991a1..4a10d69f81 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -123,8 +123,7 @@ .macro bidir_fn type, bdmax function \type\()_16bpc_neon, export=1 push {r4-r7,lr} - ldr r4, [sp, #20] - ldr r5, [sp, #24] + ldrd r4, r5, [sp, #20] ldr r6, [sp, #28] clz r4, r4 .ifnc \type, avg From 184acc3f9dfcae4704d3b61eb4f1c9d8f0078997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 24 Nov 2020 14:11:13 +0200 Subject: [PATCH 088/155] arm32: mc: Back up and restore fewer registers in blend_h/blend_v --- src/arm/32/mc.S | 66 ++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index 65ef62f7fc..bdd1309838 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -562,17 +562,17 @@ L(blend_tbl): endfunc function blend_h_8bpc_neon, export=1 - push {r4-r8,lr} - ldr r4, [sp, #24] + push {r4-r5,lr} + ldr r4, [sp, #12] movrel r5, X(obmc_masks) add r5, r5, r4 sub r4, r4, r4, lsr #2 - clz r6, r3 - adr r7, L(blend_h_tbl) - sub r6, r6, #24 - ldr r6, [r7, r6, lsl #2] - add r7, r7, r6 - bx r7 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 .align 2 L(blend_h_tbl): @@ -602,7 +602,7 @@ L(blend_h_tbl): vst1.16 {d20[0]}, [r0, :16], r1 vst1.16 {d20[1]}, [r12, :16], r1 bgt 2b - pop {r4-r8,pc} + pop {r4-r5,pc} 40: vmov.i8 d22, #64 add r12, r0, r1 @@ -621,7 +621,7 @@ L(blend_h_tbl): vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r12, :32], r1 bgt 4b - pop {r4-r8,pc} + pop {r4-r5,pc} 80: vmov.i8 q8, #64 add r12, r0, r1 @@ -642,7 +642,7 @@ L(blend_h_tbl): vst1.u8 {d22}, [r0, :64], r1 vst1.u8 {d23}, [r12, :64], r1 bgt 8b - pop {r4-r8,pc} + pop 
{r4-r5,pc} 160: vmov.i8 q12, #64 add r12, r0, r1 @@ -669,7 +669,7 @@ L(blend_h_tbl): vst1.u8 {q9}, [r0, :128], r1 vst1.u8 {q10}, [r12, :128], r1 bgt 16b - pop {r4-r8,pc} + pop {r4-r5,pc} 320: 640: 1280: @@ -678,7 +678,7 @@ L(blend_h_tbl): 321: vld1.u8 {d6[]}, [r5]! vsub.i8 d7, d20, d6 - mov r8, r3 + mov r12, r3 32: vld1.u8 {q8, q9}, [r2, :128]! vld1.u8 {q0, q1}, [r0, :128] @@ -695,24 +695,24 @@ L(blend_h_tbl): vrshrn.i16 d2, q15, #6 vrshrn.i16 d3, q14, #6 vst1.u8 {q0, q1}, [r0, :128]! - subs r8, r8, #32 + subs r12, r12, #32 bgt 32b add r0, r0, r1 subs r4, r4, #1 bgt 321b - pop {r4-r8,pc} + pop {r4-r5,pc} endfunc function blend_v_8bpc_neon, export=1 - push {r4-r5,lr} - ldr r4, [sp, #12] - movrel r5, X(obmc_masks) - add r5, r5, r3 - clz lr, r3 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 adr r3, L(blend_v_tbl) - sub lr, lr, #26 - ldr lr, [r3, lr, lsl #2] - add r3, r3, lr + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 bx r3 .align 2 @@ -725,7 +725,7 @@ L(blend_v_tbl): 20: vmov.i8 d22, #64 - vld1.8 {d2[]}, [r5] + vld1.8 {d2[]}, [lr] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d3, d22, d2 @@ -742,10 +742,10 @@ L(blend_v_tbl): vst1.8 {d6[0]}, [r0], r1 vst1.8 {d6[1]}, [r12], r1 bgt 2b - pop {r4-r5,pc} + pop {r4,pc} 40: vmov.i8 d22, #64 - vld1.32 {d4[]}, [r5, :32] + vld1.32 {d4[]}, [lr, :32] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d5, d22, d4 @@ -763,10 +763,10 @@ L(blend_v_tbl): vst1.8 {d20[2]}, [r0], r1 vst1.8 {d20[6]}, [r12], r1 bgt 4b - pop {r4-r5,pc} + pop {r4,pc} 80: vmov.i8 d16, #64 - vld1.u8 {d2}, [r5, :64] + vld1.u8 {d2}, [lr, :64] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d17, d16, d2 @@ -787,10 +787,10 @@ L(blend_v_tbl): vst1.16 {d22[2]}, [r0, :16], r1 vst1.16 {d23[2]}, [r12, :16], r1 bgt 8b - pop {r4-r5,pc} + pop {r4,pc} 160: vmov.i8 q12, #64 - vld1.u8 {q14}, [r5, :128] + vld1.u8 {q14}, [lr, :128] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 q11, q12, q14 @@ -817,10 +817,10 @@ L(blend_v_tbl): vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d21[0]}, [r12, :32], r1 bgt 16b - pop {r4-r5,pc} + pop {r4,pc} 320: vmov.i8 q10, #64 - vld1.u8 {q2, q3}, [r5, :128] + vld1.u8 {q2, q3}, [lr, :128] vsub.i8 q11, q10, q2 vsub.i8 d24, d20, d6 32: @@ -838,7 +838,7 @@ L(blend_v_tbl): vrshrn.i16 d2, q15, #6 vst1.u8 {d0, d1, d2}, [r0, :64], r1 bgt 32b - pop {r4-r5,pc} + pop {r4,pc} endfunc From 3df2fea0854dfa39d3e1706f08343b0687f44981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 23 Nov 2020 11:16:47 +0200 Subject: [PATCH 089/155] arm32: mc: Use two-word replicating loads in emu_edge --- src/arm/32/mc.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index bdd1309838..68b168b876 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -3234,10 +3234,9 @@ function emu_edge_8bpc_neon, export=1 .macro v_loop need_left, need_right 0: .if \need_left - vld1.8 {d0[]}, [r8] + vld1.8 {d0[], d1[]}, [r8] mov r12, r6 // out = dst mov r3, r4 - vmov d1, d0 1: subs r3, r3, #16 vst1.8 {q0}, [r12, :128]! @@ -3259,10 +3258,9 @@ function emu_edge_8bpc_neon, export=1 add r3, r8, r2 // in + center_w sub r3, r3, #1 // in + center_w - 1 add r12, r6, r4 // dst + left_ext - vld1.8 {d0[]}, [r3] + vld1.8 {d0[], d1[]}, [r3] add r12, r12, r2 // out = dst + left_ext + center_w mov r3, r11 - vmov d1, d0 1: subs r3, r3, #16 vst1.8 {q0}, [r12]! 
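For reference, the edge replication that these splatting loads vectorize can be sketched in C as below. This is a simplified illustration only, not dav1d's actual emu_edge code; the names left_ext, center_w and right_ext follow the register comments in the assembly above and are assumptions made for the sketch.

    /* Horizontal padding of one row: replicate the first/last valid pixel
     * into the left/right extensions, which the NEON code above does 16
     * bytes at a time with a replicated load and full-register stores. */
    #include <stddef.h>
    #include <stdint.h>

    static void pad_row(uint8_t *dst, const uint8_t *in,
                        size_t left_ext, size_t center_w, size_t right_ext)
    {
        for (size_t i = 0; i < left_ext; i++)
            dst[i] = in[0];                         /* left: splat in[0] */
        for (size_t i = 0; i < center_w; i++)
            dst[left_ext + i] = in[i];              /* copy valid pixels */
        for (size_t i = 0; i < right_ext; i++)
            dst[left_ext + center_w + i] = in[center_w - 1]; /* right: splat last */
    }

The vld1.8 {d0[], d1[]} form splats the single boundary byte across both halves of the q register in one load, which is what lets the patch drop the separate vmov d1, d0.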
From 1ca7a0dc6512f26387b6a991c4d24462247f07dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 24 Nov 2020 14:28:50 +0200 Subject: [PATCH 090/155] arm32: mc: Use a replicating vld1 to all lanes in one place This is one cycle faster, when the other lanes don't need to be preserved, on some (old) cores. --- src/arm/32/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index 68b168b876..676c1d74e5 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -590,7 +590,7 @@ L(blend_h_tbl): lsl r1, r1, #1 2: vld1.16 {d2[], d3[]}, [r5, :16]! - vld1.32 {d1[0]}, [r2, :32]! + vld1.32 {d1[]}, [r2, :32]! subs r4, r4, #2 vld1.16 {d0[]}, [r0, :16] vzip.8 d2, d3 From 3a94bb32e7a86d2aa956a9ebfca2e786422b0047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 24 Nov 2020 21:14:23 +0200 Subject: [PATCH 091/155] arm32: mc: Improve scheduling in blend_h --- src/arm/32/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index 676c1d74e5..d4a90b99c4 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -694,8 +694,8 @@ L(blend_h_tbl): vmlal.u8 q14, d3, d7 vrshrn.i16 d2, q15, #6 vrshrn.i16 d3, q14, #6 - vst1.u8 {q0, q1}, [r0, :128]! subs r12, r12, #32 + vst1.u8 {q0, q1}, [r0, :128]! bgt 32b add r0, r0, r1 subs r4, r4, #1 From ef8ed6a2b19749140cb0ebc401d0e2ec2c29807f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 1 Dec 2020 15:35:42 +0200 Subject: [PATCH 092/155] arm32: mc16: Fix column alignment in the warp function --- src/arm/32/mc16.S | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index 4a10d69f81..cc4e52ef61 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -2604,8 +2604,8 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1 ldrd r8, r9, [r4] sxth r7, r8 - asr r8, r8, #16 - asr r4, r9, #16 + asr r8, r8, #16 + asr r4, r9, #16 sxth r9, r9 mov r10, #8 sub r2, r2, r3, lsl #1 @@ -2665,26 +2665,26 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1 // This ordering of vmull/vmlal is highly beneficial for // Cortex A8/A9/A53 here, but harmful for Cortex A7. 
- vmull.s16 q0, d16, d2 - vmlal.s16 q0, d18, d4 - vmlal.s16 q0, d20, d6 - vmlal.s16 q0, d22, d8 - vmlal.s16 q0, d24, d10 - vmlal.s16 q0, d26, d12 - vmull.s16 q1, d17, d3 - vmlal.s16 q1, d19, d5 - vmlal.s16 q1, d21, d7 - vmlal.s16 q1, d23, d9 - vmlal.s16 q1, d25, d11 - vmlal.s16 q1, d27, d13 + vmull.s16 q0, d16, d2 + vmlal.s16 q0, d18, d4 + vmlal.s16 q0, d20, d6 + vmlal.s16 q0, d22, d8 + vmlal.s16 q0, d24, d10 + vmlal.s16 q0, d26, d12 + vmull.s16 q1, d17, d3 + vmlal.s16 q1, d19, d5 + vmlal.s16 q1, d21, d7 + vmlal.s16 q1, d23, d9 + vmlal.s16 q1, d25, d11 + vmlal.s16 q1, d27, d13 vmovl.s8 q2, d14 vmovl.s8 q3, d15 - vmlal.s16 q0, d28, d4 - vmlal.s16 q0, d30, d6 - vmlal.s16 q1, d29, d5 - vmlal.s16 q1, d31, d7 + vmlal.s16 q0, d28, d4 + vmlal.s16 q0, d30, d6 + vmlal.s16 q1, d29, d5 + vmlal.s16 q1, d31, d7 .ifb \t ldr lr, [sp, #4] // -(7 + intermediate_bits) From c0bf1640a15dbc40f5bedf973b613c53b14dfcca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 25 Nov 2020 14:46:52 +0200 Subject: [PATCH 093/155] arm64: mc16: Get rid of one instruction in blend_v w16 --- src/arm/64/mc16.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S index c00b709e68..7a8460e404 100644 --- a/src/arm/64/mc16.S +++ b/src/arm/64/mc16.S @@ -909,12 +909,11 @@ function blend_v_16bpc_neon, export=1 b.gt 8b ret 160: - ld1 {v16.8b, v17.8b}, [x5] + ld1 {v16.16b}, [x5] sub x1, x1, #16 - neg v16.8b, v16.8b // -m - neg v17.8b, v17.8b - sxtl v16.8h, v16.8b - sxtl v17.8h, v17.8b + neg v17.16b, v16.16b // -m + sxtl v16.8h, v17.8b + sxtl2 v17.8h, v17.16b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.4h, v17.4h, #9 16: From 73bd5e3e32a2fb0ed2f34c01644381f0af5de22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 3 Sep 2020 14:53:09 +0300 Subject: [PATCH 094/155] arm32: mc: Add NEON implementation of the blend functions for 16 bpc Checkasm numbers: Cortex A7 A8 A53 A72 A73 blend_h_w2_16bpc_neon: 190.0 163.0 135.5 67.4 71.2 blend_h_w4_16bpc_neon: 204.4 119.1 140.3 61.2 74.9 blend_h_w8_16bpc_neon: 247.6 126.2 159.5 86.1 88.4 blend_h_w16_16bpc_neon: 391.6 186.5 230.7 134.9 149.4 blend_h_w32_16bpc_neon: 734.9 354.2 454.1 248.1 270.9 blend_h_w64_16bpc_neon: 1290.8 611.7 801.1 456.6 491.3 blend_h_w128_16bpc_neon: 2876.4 1354.2 1788.6 1083.4 1092.0 blend_v_w2_16bpc_neon: 264.4 325.2 206.8 107.6 123.0 blend_v_w4_16bpc_neon: 471.8 358.7 356.9 187.0 229.9 blend_v_w8_16bpc_neon: 616.9 365.3 445.4 218.2 248.5 blend_v_w16_16bpc_neon: 928.3 517.1 629.1 325.0 358.0 blend_v_w32_16bpc_neon: 1771.6 790.1 1106.1 631.2 584.7 blend_w4_16bpc_neon: 128.8 66.6 95.5 33.5 42.0 blend_w8_16bpc_neon: 238.7 118.0 156.8 76.5 84.5 blend_w16_16bpc_neon: 809.7 360.9 482.3 268.5 298.3 blend_w32_16bpc_neon: 2015.7 916.6 1177.0 682.1 730.9 Corresponding numbers for arm64, for comparison: Cortex A53 A72 A73 blend_h_w2_16bpc_neon: 109.3 83.1 56.8 blend_h_w4_16bpc_neon: 114.1 61.1 62.3 blend_h_w8_16bpc_neon: 133.3 80.8 81.0 blend_h_w16_16bpc_neon: 215.6 132.7 149.5 blend_h_w32_16bpc_neon: 390.4 253.9 235.8 blend_h_w64_16bpc_neon: 715.8 455.8 454.0 blend_h_w128_16bpc_neon: 1649.7 1034.7 1066.2 blend_v_w2_16bpc_neon: 185.9 176.3 178.3 blend_v_w4_16bpc_neon: 338.3 184.4 234.3 blend_v_w8_16bpc_neon: 427.0 214.5 252.7 blend_v_w16_16bpc_neon: 680.4 358.1 389.2 blend_v_w32_16bpc_neon: 1100.7 615.5 690.1 blend_w4_16bpc_neon: 76.0 32.3 32.1 blend_w8_16bpc_neon: 134.4 76.3 71.5 blend_w16_16bpc_neon: 476.3 268.8 301.5 blend_w32_16bpc_neon: 1226.8 659.9 782.8 --- 
src/arm/32/mc16.S | 442 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 442 insertions(+) diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index cc4e52ef61..cc116ee024 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -273,6 +273,448 @@ bidir_fn w_avg, r7 bidir_fn mask, r7 +function blend_16bpc_neon, export=1 + push {r4-r5,lr} + ldrd r4, r5, [sp, #12] + clz lr, r3 + adr r3, L(blend_tbl) + sub lr, lr, #26 + ldr lr, [r3, lr, lsl #2] + add r3, r3, lr + bx r3 + + .align 2 +L(blend_tbl): + .word 320f - L(blend_tbl) + CONFIG_THUMB + .word 160f - L(blend_tbl) + CONFIG_THUMB + .word 80f - L(blend_tbl) + CONFIG_THUMB + .word 40f - L(blend_tbl) + CONFIG_THUMB + +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld1.8 {d4}, [r5, :64]! + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vneg.s8 d4, d4 // -m + subs r4, r4, #2 + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld1.8 {q8}, [r5, :128]! + vld1.16 {q2, q3}, [r2, :128]! + vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + subs r4, r4, #2 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +320: + add r12, r0, #32 +32: + vld1.8 {q12, q13}, [r5, :128]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #1 + vneg.s8 q14, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vneg.s8 q15, q13 + vld1.16 {q10, q11}, [r2, :128]! 
+ vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + vmovl.s8 q15, d31 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 + vshl.i16 q15, q15, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vqrdmulh.s16 q11, q11, q15 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128], r1 + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5,pc} +endfunc + +function blend_h_16bpc_neon, export=1 + push {r4-r5,lr} + ldr r4, [sp, #12] + movrel r5, X(obmc_masks) + add r5, r5, r4 + sub r4, r4, r4, lsr #2 + clz lr, r3 + adr r12, L(blend_h_tbl) + sub lr, lr, #24 + ldr lr, [r12, lr, lsl #2] + add r12, r12, lr + bx r12 + + .align 2 +L(blend_h_tbl): + .word 1280f - L(blend_h_tbl) + CONFIG_THUMB + .word 640f - L(blend_h_tbl) + CONFIG_THUMB + .word 320f - L(blend_h_tbl) + CONFIG_THUMB + .word 160f - L(blend_h_tbl) + CONFIG_THUMB + .word 80f - L(blend_h_tbl) + CONFIG_THUMB + .word 40f - L(blend_h_tbl) + CONFIG_THUMB + .word 20f - L(blend_h_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 +2: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {d2}, [r2, :64]! + vext.8 d4, d4, d5, #6 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.32 {d0[]}, [r0, :32] + vld1.32 {d0[1]}, [r12, :32] + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.32 {d0[0]}, [r0, :32], r1 + vst1.32 {d0[1]}, [r12, :32], r1 + bgt 2b + pop {r4-r5,pc} +40: + add r12, r0, r1 + lsl r1, r1, #1 +4: + vld2.8 {d4[], d5[]}, [r5, :16]! + vld1.16 {q1}, [r2, :128]! + vext.8 d4, d4, d5, #4 + subs r4, r4, #2 + vneg.s8 d4, d4 // -m + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + vmovl.s8 q2, d4 + vshl.i16 q2, q2, #9 // -m << 9 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d1}, [r12, :64], r1 + bgt 4b + pop {r4-r5,pc} +80: + add r12, r0, r1 + lsl r1, r1, #1 +8: + vld2.8 {d16[], d17[]}, [r5, :16]! + vld1.16 {q2, q3}, [r2, :128]! + vneg.s8 q9, q8 // -m + vld1.16 {q0}, [r0, :128] + subs r4, r4, #2 + vmovl.s8 q8, d18 + vmovl.s8 q9, d19 + vld1.16 {q1}, [r12, :128] + vshl.i16 q8, q8, #9 // -m << 9 + vshl.i16 q9, q9, #9 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q9 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + bgt 8b + pop {r4-r5,pc} +160: + add r12, r0, r1 + lsl r1, r1, #1 +16: + vld2.8 {d24[], d25[]}, [r5, :16]! + vld1.16 {q8, q9}, [r2, :128]! + subs r4, r4, #2 + vneg.s8 q13, q12 // -m + vld1.16 {q0, q1}, [r0, :128] + vmovl.s8 q12, d26 + vld1.16 {q10, q11}, [r2, :128]! 
+ vmovl.s8 q13, d27 + vld1.16 {q2, q3}, [r12, :128] + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q13 + vqrdmulh.s16 q11, q11, q13 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vadd.i16 q3, q3, q11 + vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5,pc} +1280: +640: +320: + sub r1, r1, r3, lsl #1 +321: + vld1.8 {d24[]}, [r5]! + mov r12, r3 + vneg.s8 d24, d24 // -m + vmovl.s8 q12, d24 + vshl.i16 q12, q12, #9 // -m << 9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r12, r12, #32 + vld1.16 {q10, q11}, [r2, :128]! + vld1.16 {q2, q3}, [r0, :128] + vsub.i16 q8, q0, q8 // a - b + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vsub.i16 q11, q3, q11 + sub r0, r0, #32 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q12 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 q11, q11, q12 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + vadd.i16 q3, q3, q11 + vst1.16 {q2, q3}, [r0, :128]! + bgt 32b + subs r4, r4, #1 + add r0, r0, r1 + bgt 321b + pop {r4-r5,pc} +endfunc + +function blend_v_16bpc_neon, export=1 + push {r4,lr} + ldr r4, [sp, #8] + movrel lr, X(obmc_masks) + add lr, lr, r3 + clz r12, r3 + adr r3, L(blend_v_tbl) + sub r12, r12, #26 + ldr r12, [r3, r12, lsl #2] + add r3, r3, r12 + bx r3 + + .align 2 +L(blend_v_tbl): + .word 320f - L(blend_v_tbl) + CONFIG_THUMB + .word 160f - L(blend_v_tbl) + CONFIG_THUMB + .word 80f - L(blend_v_tbl) + CONFIG_THUMB + .word 40f - L(blend_v_tbl) + CONFIG_THUMB + .word 20f - L(blend_v_tbl) + CONFIG_THUMB + +20: + add r12, r0, r1 + lsl r1, r1, #1 + vld1.8 {d4[]}, [lr] + vneg.s8 d4, d4 // -m + vmovl.s8 q2, d4 + vshl.i16 d4, d4, #9 // -m << 9 +2: + vld1.32 {d2[]}, [r2, :32]! + vld1.16 {d0[]}, [r0, :16] + subs r4, r4, #2 + vld1.16 {d2[1]}, [r2, :16] + vld1.16 {d0[1]}, [r12, :16] + add r2, r2, #4 + vsub.i16 d2, d0, d2 // a - b + vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 + vadd.i16 d0, d0, d2 + vst1.16 {d0[0]}, [r0, :16], r1 + vst1.16 {d0[1]}, [r12, :16], r1 + bgt 2b + pop {r4,pc} +40: + vld1.32 {d4[]}, [lr, :32] + add r12, r0, r1 + vneg.s8 d4, d4 // -m + lsl r1, r1, #1 + vmovl.s8 q2, d4 + sub r1, r1, #4 + vshl.i16 q2, q2, #9 // -m << 9 +4: + vld1.16 {q1}, [r2, :128]! + vld1.16 {d0}, [r0, :64] + vld1.16 {d1}, [r12, :64] + subs r4, r4, #2 + vsub.i16 q1, q0, q1 // a - b + vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 + vadd.i16 q0, q0, q1 + vst1.32 {d0[0]}, [r0, :32]! + vst1.32 {d1[0]}, [r12, :32]! + vst1.16 {d0[2]}, [r0, :16], r1 + vst1.16 {d1[2]}, [r12, :16], r1 + bgt 4b + pop {r4,pc} +80: + vld1.8 {d16}, [lr, :64] + add r12, r0, r1 + vneg.s8 d16, d16 // -m + lsl r1, r1, #1 + vmovl.s8 q8, d16 + sub r1, r1, #8 + vshl.i16 q8, q8, #9 // -m << 9 +8: + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q0}, [r0, :128] + vld1.16 {q1}, [r12, :128] + subs r4, r4, #2 + vsub.i16 q2, q0, q2 // a - b + vsub.i16 q3, q1, q3 + vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q3, q3, q8 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vst1.16 {d0}, [r0, :64]! + vst1.16 {d2}, [r12, :64]! 
+ vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d3[0]}, [r12, :32], r1 + bgt 8b + pop {r4,pc} +160: + vld1.8 {q12}, [lr, :128] + add r12, r0, r1 + vneg.s8 q13, q12 // -m + lsl r1, r1, #1 + vmovl.s8 q12, d26 + vmovl.s8 q13, d27 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 d26, d26, #9 +16: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {d0, d1, d2}, [r0, :64] + subs r4, r4, #2 + vld1.16 {q10, q11}, [r2, :128]! + vsub.i16 q8, q0, q8 // a - b + vld1.16 {d4, d5, d6}, [r12, :64] + vsub.i16 d18, d2, d18 + vsub.i16 q10, q2, q10 + vsub.i16 d22, d6, d22 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 d18, d18, d26 + vqrdmulh.s16 q10, q10, q12 + vqrdmulh.s16 d22, d22, d26 + vadd.i16 q0, q0, q8 + vadd.i16 d2, d2, d18 + vadd.i16 q2, q2, q10 + vst1.16 {d0, d1, d2}, [r0, :64], r1 + vadd.i16 d6, d6, d22 + vst1.16 {d4, d5, d6}, [r12, :64], r1 + bgt 16b + pop {r4,pc} +320: + vld1.8 {d24, d25, d26}, [lr, :64] + vneg.s8 q14, q12 // -m + vneg.s8 d30, d26 + vmovl.s8 q12, d28 + vmovl.s8 q13, d29 + vmovl.s8 q14, d30 + sub r1, r1, #32 + vshl.i16 q12, q12, #9 // -m << 9 + vshl.i16 q13, q13, #9 + vshl.i16 q14, q14, #9 +32: + vld1.16 {q8, q9}, [r2, :128]! + vld1.16 {q0, q1}, [r0, :128]! + subs r4, r4, #1 + vld1.16 {q10}, [r2, :128] + vsub.i16 q8, q0, q8 // a - b + vld1.16 {q2}, [r0, :128] + sub r0, r0, #32 + vsub.i16 q9, q1, q9 + vsub.i16 q10, q2, q10 + vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 + vqrdmulh.s16 q9, q9, q13 + vqrdmulh.s16 q10, q10, q14 + vadd.i16 q0, q0, q8 + vadd.i16 q1, q1, q9 + vadd.i16 q2, q2, q10 + vst1.16 {q0, q1}, [r0, :128]! + add r2, r2, #32 + vst1.16 {q2}, [r0, :128], r1 + bgt 32b + pop {r4,pc} +endfunc + // This has got the same signature as the put_8tap functions, // and assumes that r9 is set to (clz(w)-24). function put_neon From f004d16fbe4cbeecbb345c9d8049e20224041631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 25 Nov 2020 14:49:09 +0200 Subject: [PATCH 095/155] arm32: mc: Add NEON implementations of the w_mask functions for 16 bpc Checkasm numbers: Cortex A7 A8 A53 A72 A73 w_mask_420_w4_16bpc_neon: 350.3 216.4 215.4 141.7 134.5 w_mask_420_w8_16bpc_neon: 926.7 590.9 529.1 373.8 354.5 w_mask_420_w16_16bpc_neon: 2956.7 1880.4 1654.8 1186.1 1134.1 w_mask_420_w32_16bpc_neon: 11489.3 7426.4 6314.1 4599.8 4398.6 w_mask_420_w64_16bpc_neon: 28175.9 17898.1 16002.8 11079.0 10551.8 w_mask_420_w128_16bpc_neon: 71599.4 44630.9 40696.9 28057.3 27836.5 w_mask_422_w4_16bpc_neon: 339.0 210.1 206.7 137.3 134.7 w_mask_422_w8_16bpc_neon: 887.2 573.3 499.6 361.6 353.5 w_mask_422_w16_16bpc_neon: 2918.0 1841.6 1593.0 1194.0 1157.9 w_mask_422_w32_16bpc_neon: 11313.8 7238.7 6043.4 4577.1 4469.6 w_mask_422_w64_16bpc_neon: 27746.5 17427.2 15386.9 11082.6 10693.8 w_mask_422_w128_16bpc_neon: 70521.4 43864.9 39209.3 29045.7 28305.5 w_mask_444_w4_16bpc_neon: 325.6 202.9 198.4 135.2 129.3 w_mask_444_w8_16bpc_neon: 860.7 534.9 474.8 358.0 352.2 w_mask_444_w16_16bpc_neon: 2764.3 1714.4 1517.8 1160.6 1133.1 w_mask_444_w32_16bpc_neon: 10719.8 6738.3 5746.7 4458.6 4347.1 w_mask_444_w64_16bpc_neon: 26407.9 16224.1 14783.9 10784.3 10371.4 w_mask_444_w128_16bpc_neon: 67226.1 41060.1 37823.1 41696.1 27722.2 Corresponding numbers for arm64, for comparison: Cortex A53 A72 A73 w_mask_420_w4_16bpc_neon: 173.6 123.6 120.3 w_mask_420_w8_16bpc_neon: 484.0 344.0 329.4 w_mask_420_w16_16bpc_neon: 1436.3 1025.7 1028.7 w_mask_420_w32_16bpc_neon: 5597.0 3994.8 3981.2 w_mask_420_w64_16bpc_neon: 13953.4 9700.8 9579.9 w_mask_420_w128_16bpc_neon: 35833.7 25519.3 24277.8 
w_mask_422_w4_16bpc_neon: 159.4 111.7 114.2 w_mask_422_w8_16bpc_neon: 453.4 326.2 326.7 w_mask_422_w16_16bpc_neon: 1398.2 1063.3 1052.6 w_mask_422_w32_16bpc_neon: 5532.7 4143.0 4026.3 w_mask_422_w64_16bpc_neon: 13885.3 9978.0 9689.8 w_mask_422_w128_16bpc_neon: 35763.3 25822.4 24610.9 w_mask_444_w4_16bpc_neon: 152.9 110.0 112.8 w_mask_444_w8_16bpc_neon: 437.2 332.0 325.8 w_mask_444_w16_16bpc_neon: 1399.3 1068.9 1041.7 w_mask_444_w32_16bpc_neon: 5410.9 4139.7 4136.9 w_mask_444_w64_16bpc_neon: 13648.7 10011.8 10004.6 w_mask_444_w128_16bpc_neon: 35639.6 26910.8 25631.0 --- src/arm/32/mc16.S | 279 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index cc116ee024..05a0056cd1 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -273,6 +273,285 @@ bidir_fn w_avg, r7 bidir_fn mask, r7 +.macro w_mask_fn type +function w_mask_\type\()_16bpc_neon, export=1 + push {r4-r10,lr} + vpush {q4-q7} + ldrd r4, r5, [sp, #96] + ldrd r6, r7, [sp, #104] + ldr r8, [sp, #112] + clz r9, r4 + adr lr, L(w_mask_\type\()_tbl) + vdup.16 q15, r8 // bitdepth_max + sub r9, r9, #24 + clz r8, r8 // clz(bitdepth_max) + ldr r9, [lr, r9, lsl #2] + add r9, lr, r9 + sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 + mov r10, #PREP_BIAS*64 + neg r8, r8 // -sh + movw r12, #27615 // (64 + 1 - 38)<> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! +.elseif \type == 420 + vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition) + vadd.i16 d13, d14, d15 + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r12, :64], r1 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r12, :64], r1 + bgt 4b + vpop {q4-q7} + pop {r4-r10,pc} +8: + vld1.16 {q2, q3}, [r2, :128]! // tmp1 + vld1.16 {q4, q5}, [r3, :128]! 
// tmp2 + subs r5, r5, #2 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {q6}, [r6, :128]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.8 {d12}, [r6, :64]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r10,pc} +1280: +640: +320: +160: + sub r1, r1, r4, lsl #1 +.if \type == 444 + add lr, r6, r4 +.elseif \type == 422 + add lr, r6, r4, lsr #1 +.endif + add r7, r2, r4, lsl #1 + add r9, r3, r4, lsl #1 +161: + mov r8, r4 +16: + vld1.16 {q2}, [r2, :128]! // tmp1 + vld1.16 {q4}, [r3, :128]! // tmp2 + vld1.16 {q3}, [r7, :128]! + vld1.16 {q5}, [r9, :128]! 
+ subs r8, r8, #8 + vdup.32 q13, r10 // PREP_BIAS*64 + vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) + vabd.s16 q7, q3, q5 + vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) + vsubl.s16 q9, d9, d5 + vsubl.s16 q10, d10, d6 + vsubl.s16 q11, d11, d7 + vqsub.u16 q6, q0, q6 // 27615 - abs() + vqsub.u16 q7, q0, q7 + vshll.s16 q5, d7, #6 // tmp1 << 6 + vshll.s16 q4, d6, #6 + vshll.s16 q3, d5, #6 + vshll.s16 q2, d4, #6 + vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh + vshr.u16 q7, q7, #10 + vadd.i32 q2, q2, q13 // += PREP_BIAS*64 + vadd.i32 q3, q3, q13 + vadd.i32 q4, q4, q13 + vadd.i32 q5, q5, q13 + vmovl.u16 q12, d12 + vmovl.u16 q13, d13 + vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) + vmovl.u16 q12, d14 + vmla.i32 q3, q9, q13 + vmovl.u16 q13, d15 + vmla.i32 q4, q10, q12 + vmla.i32 q5, q11, q13 + vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh + vrshl.s32 q3, q3, q14 + vrshl.s32 q4, q4, q14 + vrshl.s32 q5, q5, q14 + vqmovun.s32 d4, q2 // iclip_pixel + vqmovun.s32 d5, q3 + vqmovun.s32 d6, q4 + vqmovun.s32 d7, q5 + vmin.u16 q2, q2, q15 // iclip_pixel + vmin.u16 q3, q3, q15 // iclip_pixel +.if \type == 444 + vmovn.i16 d12, q6 // 64 - m + vmovn.i16 d13, q7 + vsub.i16 q6, q1, q6 // m + vst1.8 {d12}, [r6, :64]! + vst1.8 {d13}, [lr, :64]! +.elseif \type == 422 + vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) + vpadd.i16 d13, d14, d15 + vmovn.i16 d12, q6 + vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 + vst1.32 {d12[0]}, [r6, :32]! + vst1.32 {d12[1]}, [lr, :32]! +.elseif \type == 420 + vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) + vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) + vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) + vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 + vst1.32 {d12[0]}, [r6, :32]! +.endif + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! 
+ bgt 16b + subs r5, r5, #2 + add r2, r2, r4, lsl #1 + add r3, r3, r4, lsl #1 + add r7, r7, r4, lsl #1 + add r9, r9, r4, lsl #1 +.if \type == 444 + add r6, r6, r4 + add lr, lr, r4 +.elseif \type == 422 + add r6, r6, r4, lsr #1 + add lr, lr, r4, lsr #1 +.endif + add r0, r0, r1 + add r12, r12, r1 + bgt 161b + vpop {q4-q7} + pop {r4-r10,pc} +endfunc +.endm + +w_mask_fn 444 +w_mask_fn 422 +w_mask_fn 420 + function blend_16bpc_neon, export=1 push {r4-r5,lr} ldrd r4, r5, [sp, #12] From 2ef226383032334093f69215276423dd24610755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 23 Nov 2020 11:20:49 +0200 Subject: [PATCH 096/155] arm32: mc: Add NEON implementation of emu_edge for 16 bpc Checkasm benchmarks: Cortex A7 A8 A53 A72 A73 emu_edge_w4_16bpc_neon: 375.0 312.6 268.3 159.3 170.0 emu_edge_w8_16bpc_neon: 619.3 425.5 435.5 249.5 291.1 emu_edge_w16_16bpc_neon: 719.1 568.3 506.9 324.2 314.4 emu_edge_w32_16bpc_neon: 2112.2 1677.7 1396.2 1050.5 1009.6 emu_edge_w64_16bpc_neon: 5046.8 4322.5 3693.7 3953.8 2682.8 emu_edge_w128_16bpc_neon: 16311.1 14341.3 12877.8 26183.5 8924.9 Corresponding numbers for arm64, for comparison: Cortex A53 A72 A73 emu_edge_w4_16bpc_neon: 302.5 174.9 159.2 emu_edge_w8_16bpc_neon: 344.6 292.3 273.2 emu_edge_w16_16bpc_neon: 601.0 461.2 316.8 emu_edge_w32_16bpc_neon: 974.2 1274.7 960.5 emu_edge_w64_16bpc_neon: 2853.1 3527.6 2633.5 emu_edge_w128_16bpc_neon: 14633.5 26776.6 7236.0 --- src/arm/32/mc16.S | 190 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index 05a0056cd1..eb7b3b549e 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -3452,3 +3452,193 @@ endfunc warp warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldrd r8, r9, [sp, #52] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub r12, r3, #1 // ih - 1 + cmp r5, r3 + sub lr, r2, #1 // iw - 1 + it lt + movlt r12, r5 // min(y, ih - 1) + cmp r4, r2 + bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) + it lt + movlt lr, r4 // min(x, iw - 1) + bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) + mla r8, r12, r9, r8 // ref += iclip() * stride + add r8, r8, lr, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add r10, r5, r1 // y + bh + neg r5, r5 // -y + sub r10, r10, r3 // y + bh - ih + sub r12, r1, #1 // bh - 1 + cmp r10, r1 + bic r5, r5, r5, asr #31 // max(-y, 0) + it ge + movge r10, r12 // min(y + bh - ih, bh-1) + cmp r5, r1 + bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) + it ge + movge r5, r12 // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add r11, r4, r0 // x + bw + neg r4, r4 // -x + sub r11, r11, r2 // x + bw - iw + sub lr, r0, #1 // bw - 1 + cmp r11, r0 + bic r4, r4, r4, asr #31 // max(-x, 0) + it ge + movge r11, lr // min(x + bw - iw, bw-1) + cmp r4, r0 + bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) + it ge + movge r4, lr // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub 
r1, r1, r5 // bh - top_ext + mla r6, r5, r7, r6 + sub r2, r0, r4 // bw - left_ext + sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext + sub r2, r2, r11 // center_w = bw - left_ext - right_ext + + mov r0, r6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + vld1.16 {d0[], d1[]}, [r8] + mov r12, r6 // out = dst + mov r3, r4 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12, :128]! + bgt 1b +.endif + mov lr, r8 + add r12, r6, r4, lsl #1 // out = dst + left_ext + mov r3, r2 +1: + vld1.16 {q0, q1}, [lr]! + subs r3, r3, #32 + vld1.16 {q2, q3}, [lr]! +.if \need_left + vst1.16 {q0, q1}, [r12]! + vst1.16 {q2, q3}, [r12]! +.else + vst1.16 {q0, q1}, [r12, :128]! + vst1.16 {q2, q3}, [r12, :128]! +.endif + bgt 1b +.if \need_right + add r3, r8, r2, lsl #1 // in + center_w + sub r3, r3, #2 // in + center_w - 1 + add r12, r6, r4, lsl #1 // dst + left_ext + vld1.16 {d0[], d1[]}, [r3] + add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w + mov r3, r11 + vmov q1, q0 +1: + subs r3, r3, #16 + vst1.16 {q0, q1}, [r12]! + bgt 1b +.endif + + subs r1, r1, #1 // center_h-- + add r6, r6, r7 + add r8, r8, r9 + bgt 0b +.endm + + cmp r4, #0 + beq 2f + // need_left + cmp r11, #0 + beq 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cmp r11, #0 + beq 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + cmp r10, #0 + // Storing the original dst in r0 overwrote bw, recalculate it here + add r2, r2, r4 // center_w + left_ext + add r2, r2, r11 // bw = center_w + left_ext + right_ext + + beq 3f + // need_bottom + sub r8, r6, r7 // ref = dst - stride + mov r4, r2 + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r8, :128]! + mov r3, r10 + vld1.16 {q2, q3}, [r8, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! + subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r10, r6 // dst -= bottom_ext * stride + subs r4, r4, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + cmp r5, #0 + beq 3f + // need_top + mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride + sub r12, r7, #32 +1: + vld1.16 {q0, q1}, [r0, :128]! + mov r3, r5 + vld1.16 {q2, q3}, [r0, :128]! +2: + vst1.16 {q0, q1}, [r6, :128]! + subs r3, r3, #1 + vst1.16 {q2, q3}, [r6, :128], r12 + bgt 2b + mls r6, r7, r5, r6 // dst -= top_ext * stride + subs r2, r2, #32 // bw -= 32 + add r6, r6, #64 // dst += 32 + bgt 1b + +3: + pop {r4-r11,pc} +endfunc From e353921b7141bd0da8ceffe54b8ffe2df1a0ed90 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Fri, 1 Jan 2021 22:18:19 -0500 Subject: [PATCH 097/155] SSE2, msac: Use bsr shortcut for 50% bool decoding bsr has 3 cycles of latency for modern x86 processors. For this function, it's possible to obtain the number of bits to shift by alternative means. I'd estimate about approx -0.2% decrease in cpu usage based on percentages associated with function symbols in perf report. Benchmarks were run on a Ryzen 5 3600 (Zen 2). The used clip was the original 1080p chimera. 
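
As a standalone sanity check (plain C, not part of the patch), the identity the
new code relies on can be verified exhaustively. The range bounds below simply
restate the claim from the comment in the diff that 0 <= d <= 2 and that the
new range value stays below (3 << 14); the function names are illustrative.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Generic renormalization shift: count how far the top set bit of the
     * new 16-bit range value sits below bit 15 (what bsr/clz would give). */
    static int renorm_shift_generic(uint16_t v)
    {
        int d = 0;
        while (!(v & 0x8000)) { v <<= 1; d++; }
        return d;
    }

    /* Shortcut from this commit: with v < (3 << 14) and at most two shifts
     * ever needed, the count is simply 2 - (v >> 14). */
    static int renorm_shift_shortcut(uint16_t v)
    {
        return 2 - (v >> 14);
    }

    /* The subtract-from-0xbfff form used in the new asm computes the same
     * value over this range. */
    static int renorm_shift_asm_form(uint16_t v)
    {
        return (0xbfff - v) >> 14;
    }

    int main(void)
    {
        for (uint32_t v = 1 << 13; v < (3u << 14); v++) {
            assert(renorm_shift_generic((uint16_t)v) == renorm_shift_shortcut((uint16_t)v));
            assert(renorm_shift_generic((uint16_t)v) == renorm_shift_asm_form((uint16_t)v));
        }
        printf("shortcut matches the generic shift count on [1<<13, 3<<14)\n");
        return 0;
    }

The point is that the shift count becomes available after one cheap subtract
and shift on the new range value, instead of waiting out the latency of bsr.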
--- src/x86/msac.asm | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/x86/msac.asm b/src/x86/msac.asm index 8d59c64a26..92a3a731d3 100644 --- a/src/x86/msac.asm +++ b/src/x86/msac.asm @@ -153,6 +153,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 .renorm4: bsr ecx, t2d xor ecx, 15 ; d +.renorm5: shl t2d, cl shl t4, cl mov [t7+msac.rng], t2d @@ -413,13 +414,20 @@ cglobal msac_decode_bool_equi, 0, 6, 0 sub t2d, t1d ; r - v sub t4, rax ; dif - vw cmovb t2d, t1d + mov t1d, [t0+msac.cnt] cmovb t4, t3 + movifnidn t7, t0 + mov ecx, 0xbfff setb al ; the upper 32 bits contains garbage but that's OK + sub ecx, t2d not t4 + ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) + ; i.e. (0 <= d <= 2) and v < (3 << 14) + shr ecx, 14 ; d %if ARCH_X86_64 == 0 movzx eax, al %endif - jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5 cglobal msac_decode_bool, 0, 6, 0 movifnidn t0, r0mp From dba517995071a870a51c8828be56e11ebf14e134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 3 Jan 2021 22:43:54 +0200 Subject: [PATCH 098/155] arm: looprestoration: Simplify code by allowing writing up to 8 pixels past the end of rows This corresponds to what the x86 assembly does right now. This allows removing a fair bit of code, and allows marking the stores as aligned. (Previously, the writes of the narrow slice temp buffer were unaligned.) --- src/arm/32/looprestoration.S | 139 +----------------------------- src/arm/32/looprestoration16.S | 139 +----------------------------- src/arm/32/looprestoration_tmpl.S | 24 +++--- src/arm/64/looprestoration.S | 133 ---------------------------- src/arm/64/looprestoration16.S | 133 ---------------------------- 5 files changed, 14 insertions(+), 554 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index 79afdc394c..105a32e860 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -424,7 +424,7 @@ function wiener_filter_v_8bpc_neon, export=1 vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 - vst1.8 {d4}, [r0], r1 + vst1.8 {d4}, [r0, :64], r1 .if \compare cmp r4, #4 .else @@ -533,143 +533,6 @@ function wiener_filter_v_8bpc_neon, export=1 .purgem filter endfunc -// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_8bpc_neon, export=1 - push {r4,lr} - ldr r4, [sp, #8] - adr r12, L(copy_narrow_tbl) - ldr r3, [r12, r3, lsl #2] - add r12, r12, r3 - bx r12 - - .align 2 -L(copy_narrow_tbl): - .word 0 - .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB - -10: - add r3, r0, r1 - lsl r1, r1, #1 -18: - subs r4, r4, #8 - blt 110f - vld1.8 {d0}, [r2, :64]! - vst1.8 {d0[0]}, [r0], r1 - vst1.8 {d0[1]}, [r3], r1 - vst1.8 {d0[2]}, [r0], r1 - vst1.8 {d0[3]}, [r3], r1 - vst1.8 {d0[4]}, [r0], r1 - vst1.8 {d0[5]}, [r3], r1 - vst1.8 {d0[6]}, [r0], r1 - vst1.8 {d0[7]}, [r3], r1 - ble 0f - b 18b -110: - add r4, r4, #8 - asr r1, r1, #1 -11: - subs r4, r4, #1 - vld1.8 {d0[]}, [r2]! - vst1.8 {d0[0]}, [r0], r1 - bgt 11b -0: - pop {r4,pc} - -20: - add r3, r0, r1 - lsl r1, r1, #1 -24: - subs r4, r4, #4 - blt 210f - vld1.16 {d0}, [r2, :64]! 
- vst1.16 {d0[0]}, [r0, :16], r1 - vst1.16 {d0[1]}, [r3, :16], r1 - vst1.16 {d0[2]}, [r0, :16], r1 - vst1.16 {d0[3]}, [r3, :16], r1 - ble 0f - b 24b -210: - add r4, r4, #4 - asr r1, r1, #1 -22: - subs r4, r4, #1 - vld1.16 {d0[]}, [r2, :16]! - vst1.16 {d0[0]}, [r0, :16], r1 - bgt 22b -0: - pop {r4,pc} - -30: - ldrh r3, [r2] - ldrb r12, [r2, #2] - add r2, r2, #3 - subs r4, r4, #1 - strh r3, [r0] - strb r12, [r0, #2] - add r0, r0, r1 - bgt 30b - pop {r4,pc} - -40: - add r3, r0, r1 - lsl r1, r1, #1 -42: - subs r4, r4, #2 - blt 41f - vld1.8 {d0}, [r2, :64]! - vst1.32 {d0[0]}, [r0, :32], r1 - vst1.32 {d0[1]}, [r3, :32], r1 - ble 0f - b 42b -41: - vld1.32 {d0[]}, [r2, :32] - vst1.32 {d0[0]}, [r0, :32] -0: - pop {r4,pc} - -50: - ldr r3, [r2] - ldrb r12, [r2, #4] - add r2, r2, #5 - subs r4, r4, #1 - str r3, [r0] - strb r12, [r0, #4] - add r0, r0, r1 - bgt 50b - pop {r4,pc} - -60: - ldr r3, [r2] - ldrh r12, [r2, #4] - add r2, r2, #6 - subs r4, r4, #1 - str r3, [r0] - strh r12, [r0, #4] - add r0, r0, r1 - bgt 60b - pop {r4,pc} - -70: - ldr r3, [r2] - ldrh r12, [r2, #4] - ldrb lr, [r2, #6] - add r2, r2, #7 - subs r4, r4, #1 - str r3, [r0] - strh r12, [r0, #4] - strb lr, [r0, #6] - add r0, r0, r1 - bgt 70b - pop {r4,pc} -endfunc - #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index a2ebbeff3a..7cda0cb2d3 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -457,7 +457,7 @@ function wiener_filter_v_16bpc_neon, export=1 vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q5 // bitdepth_max - vst1.16 {q2}, [r0], r1 + vst1.16 {q2}, [r0, :128], r1 .if \compare cmp r4, #4 .else @@ -567,143 +567,6 @@ function wiener_filter_v_16bpc_neon, export=1 .purgem filter endfunc -// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_16bpc_neon, export=1 - push {r4,lr} - ldr r4, [sp, #8] - adr r12, L(copy_narrow_tbl) - ldr r3, [r12, r3, lsl #2] - add r12, r12, r3 - bx r12 - - .align 2 -L(copy_narrow_tbl): - .word 0 - .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB - .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB - -10: - add r3, r0, r1 - lsl r1, r1, #1 -18: - subs r4, r4, #8 - blt 110f - vld1.16 {q0}, [r2, :128]! - vst1.16 {d0[0]}, [r0, :16], r1 - vst1.16 {d0[1]}, [r3, :16], r1 - vst1.16 {d0[2]}, [r0, :16], r1 - vst1.16 {d0[3]}, [r3, :16], r1 - vst1.16 {d1[0]}, [r0, :16], r1 - vst1.16 {d1[1]}, [r3, :16], r1 - vst1.16 {d1[2]}, [r0, :16], r1 - vst1.16 {d1[3]}, [r3, :16], r1 - ble 0f - b 18b -110: - add r4, r4, #8 - asr r1, r1, #1 -11: - subs r4, r4, #1 - vld1.16 {d0[]}, [r2]! - vst1.16 {d0[0]}, [r0], r1 - bgt 11b -0: - pop {r4,pc} - -20: - add r3, r0, r1 - lsl r1, r1, #1 -24: - subs r4, r4, #4 - blt 210f - vld1.32 {q0}, [r2, :128]! - vst1.32 {d0[0]}, [r0, :32], r1 - vst1.32 {d0[1]}, [r3, :32], r1 - vst1.32 {d1[0]}, [r0, :32], r1 - vst1.32 {d1[1]}, [r3, :32], r1 - ble 0f - b 24b -210: - add r4, r4, #4 - asr r1, r1, #1 -22: - subs r4, r4, #1 - vld1.32 {d0[]}, [r2, :32]! 
- vst1.32 {d0[0]}, [r0, :32], r1 - bgt 22b -0: - pop {r4,pc} - -30: - ldr r3, [r2] - ldrh r12, [r2, #4] - add r2, r2, #6 - subs r4, r4, #1 - str r3, [r0] - strh r12, [r0, #4] - add r0, r0, r1 - bgt 30b - pop {r4,pc} - -40: - add r3, r0, r1 - lsl r1, r1, #1 -42: - subs r4, r4, #2 - blt 41f - vld1.16 {q0}, [r2, :128]! - vst1.16 {d0}, [r0, :64], r1 - vst1.16 {d1}, [r3, :64], r1 - ble 0f - b 42b -41: - vld1.16 {d0}, [r2, :64] - vst1.16 {d0}, [r0, :64] -0: - pop {r4,pc} - -50: - vld1.16 {d0}, [r2] - ldrh r12, [r2, #8] - add r2, r2, #10 - subs r4, r4, #1 - vst1.16 {d0}, [r0] - strh r12, [r0, #8] - add r0, r0, r1 - bgt 50b - pop {r4,pc} - -60: - vld1.16 {d0}, [r2] - ldr r12, [r2, #8] - add r2, r2, #12 - subs r4, r4, #1 - vst1.16 {d0}, [r0] - str r12, [r0, #8] - add r0, r0, r1 - bgt 60b - pop {r4,pc} - -70: - vld1.16 {d0}, [r2] - ldr r12, [r2, #8] - ldrh lr, [r2, #12] - add r2, r2, #14 - subs r4, r4, #1 - vst1.16 {d0}, [r0] - str r12, [r0, #8] - strh lr, [r0, #12] - add r0, r0, r1 - bgt 70b - pop {r4,pc} -endfunc - #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" diff --git a/src/arm/32/looprestoration_tmpl.S b/src/arm/32/looprestoration_tmpl.S index 324aa6251d..8a9940bb3a 100644 --- a/src/arm/32/looprestoration_tmpl.S +++ b/src/arm/32/looprestoration_tmpl.S @@ -389,8 +389,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1 vrshrn.i32 d21, q11, #11 vqmovun.s16 d4, q2 vqmovun.s16 d20, q10 - vst1.8 {d4}, [r0]! - vst1.8 {d20}, [r9]! + vst1.8 {d4}, [r0, :64]! + vst1.8 {d20}, [r9, :64]! .else vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 @@ -398,8 +398,8 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1 vqrshrun.s32 d21, q11, #11 vmin.u16 q2, q2, q14 vmin.u16 q10, q10, q14 - vst1.16 {q2}, [r0]! - vst1.16 {q10}, [r9]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q10}, [r9, :128]! .endif bgt 1b @@ -438,12 +438,12 @@ function sgr_weighted1_\bpc\()bpc_neon, export=1 vrshrn.i32 d4, q2, #11 vrshrn.i32 d5, q3, #11 vqmovun.s16 d2, q2 - vst1.8 {d2}, [r0]! + vst1.8 {d2}, [r0, :64]! .else vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vmin.u16 q2, q2, q14 - vst1.16 {q2}, [r0]! + vst1.16 {q2}, [r0, :128]! .endif bgt 2b 0: @@ -531,8 +531,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 vrshrn.i32 d23, q8, #11 vqmovun.s16 d6, q3 vqmovun.s16 d22, q11 - vst1.8 {d6}, [r0]! - vst1.8 {d22}, [r10]! + vst1.8 {d6}, [r0, :64]! + vst1.8 {d22}, [r10, :64]! .else vqrshrun.s32 d6, q3, #11 vqrshrun.s32 d7, q0, #11 @@ -540,8 +540,8 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 vqrshrun.s32 d23, q8, #11 vmin.u16 q3, q3, q14 vmin.u16 q11, q11, q14 - vst1.16 {q3}, [r0]! - vst1.16 {q11}, [r10]! + vst1.16 {q3}, [r0, :128]! + vst1.16 {q11}, [r10, :128]! .endif bgt 1b @@ -586,12 +586,12 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 vrshrn.i32 d6, q3, #11 vrshrn.i32 d7, q0, #11 vqmovun.s16 d6, q3 - vst1.8 {d6}, [r0]! + vst1.8 {d6}, [r0, :64]! .else vqrshrun.s32 d6, q3, #11 vqrshrun.s32 d7, q0, #11 vmin.u16 q3, q3, q14 - vst1.16 {q3}, [r0]! + vst1.16 {q3}, [r0, :128]! 
.endif bgt 1b 0: diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index c3b7918f2e..21e7804eae 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -480,139 +480,6 @@ function wiener_filter_v_8bpc_neon, export=1 .purgem filter endfunc -// void dav1d_copy_narrow_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_8bpc_neon, export=1 - adr x5, L(copy_narrow_tbl) - ldrh w6, [x5, w3, uxtw #1] - sub x5, x5, w6, uxth - br x5 -10: - add x7, x0, x1 - lsl x1, x1, #1 -18: - subs w4, w4, #8 - b.lt 110f - ld1 {v0.8b}, [x2], #8 - st1 {v0.b}[0], [x0], x1 - st1 {v0.b}[1], [x7], x1 - st1 {v0.b}[2], [x0], x1 - st1 {v0.b}[3], [x7], x1 - st1 {v0.b}[4], [x0], x1 - st1 {v0.b}[5], [x7], x1 - st1 {v0.b}[6], [x0], x1 - st1 {v0.b}[7], [x7], x1 - b.le 0f - b 18b -110: - add w4, w4, #8 - asr x1, x1, #1 -11: - subs w4, w4, #1 - ld1 {v0.b}[0], [x2], #1 - st1 {v0.b}[0], [x0], x1 - b.gt 11b -0: - ret - -20: - add x7, x0, x1 - lsl x1, x1, #1 -24: - subs w4, w4, #4 - b.lt 210f - ld1 {v0.4h}, [x2], #8 - st1 {v0.h}[0], [x0], x1 - st1 {v0.h}[1], [x7], x1 - st1 {v0.h}[2], [x0], x1 - st1 {v0.h}[3], [x7], x1 - b.le 0f - b 24b -210: - add w4, w4, #4 - asr x1, x1, #1 -22: - subs w4, w4, #1 - ld1 {v0.h}[0], [x2], #2 - st1 {v0.h}[0], [x0], x1 - b.gt 22b -0: - ret - -30: - ldrh w5, [x2] - ldrb w6, [x2, #2] - add x2, x2, #3 - subs w4, w4, #1 - strh w5, [x0] - strb w6, [x0, #2] - add x0, x0, x1 - b.gt 30b - ret - -40: - add x7, x0, x1 - lsl x1, x1, #1 -42: - subs w4, w4, #2 - b.lt 41f - ld1 {v0.2s}, [x2], #8 - st1 {v0.s}[0], [x0], x1 - st1 {v0.s}[1], [x7], x1 - b.le 0f - b 42b -41: - ld1 {v0.s}[0], [x2] - st1 {v0.s}[0], [x0] -0: - ret - -50: - ldr w5, [x2] - ldrb w6, [x2, #4] - add x2, x2, #5 - subs w4, w4, #1 - str w5, [x0] - strb w6, [x0, #4] - add x0, x0, x1 - b.gt 50b - ret - -60: - ldr w5, [x2] - ldrh w6, [x2, #4] - add x2, x2, #6 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] - add x0, x0, x1 - b.gt 60b - ret - -70: - ldr w5, [x2] - ldrh w6, [x2, #4] - ldrb w7, [x2, #6] - add x2, x2, #7 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] - strb w7, [x0, #6] - add x0, x0, x1 - b.gt 70b - ret - -L(copy_narrow_tbl): - .hword 0 - .hword L(copy_narrow_tbl) - 10b - .hword L(copy_narrow_tbl) - 20b - .hword L(copy_narrow_tbl) - 30b - .hword L(copy_narrow_tbl) - 40b - .hword L(copy_narrow_tbl) - 50b - .hword L(copy_narrow_tbl) - 60b - .hword L(copy_narrow_tbl) - 70b -endfunc - #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 669d993132..c5e853f64e 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -532,139 +532,6 @@ function wiener_filter_v_16bpc_neon, export=1 .purgem filter endfunc -// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride, -// const pixel *src, int w, int h); -function copy_narrow_16bpc_neon, export=1 - adr x5, L(copy_narrow_tbl) - ldrh w6, [x5, w3, uxtw #1] - sub x5, x5, w6, uxth - br x5 -10: - add x7, x0, x1 - lsl x1, x1, #1 -18: - subs w4, w4, #8 - b.lt 110f - ld1 {v0.8h}, [x2], #16 - st1 {v0.h}[0], [x0], x1 - st1 {v0.h}[1], [x7], x1 - st1 {v0.h}[2], [x0], x1 - st1 {v0.h}[3], [x7], x1 - st1 {v0.h}[4], [x0], x1 - st1 {v0.h}[5], [x7], x1 - st1 {v0.h}[6], [x0], x1 - st1 {v0.h}[7], [x7], x1 - b.le 0f - b 18b -110: - add w4, w4, #8 - asr x1, x1, #1 -11: - subs w4, w4, #1 - ld1 {v0.h}[0], [x2], #2 - st1 {v0.h}[0], [x0], x1 - b.gt 11b -0: - ret - -20: - add x7, x0, x1 - lsl x1, x1, #1 
-24: - subs w4, w4, #4 - b.lt 210f - ld1 {v0.4s}, [x2], #16 - st1 {v0.s}[0], [x0], x1 - st1 {v0.s}[1], [x7], x1 - st1 {v0.s}[2], [x0], x1 - st1 {v0.s}[3], [x7], x1 - b.le 0f - b 24b -210: - add w4, w4, #4 - asr x1, x1, #1 -22: - subs w4, w4, #1 - ld1 {v0.s}[0], [x2], #4 - st1 {v0.s}[0], [x0], x1 - b.gt 22b -0: - ret - -30: - ldr w5, [x2] - ldrh w6, [x2, #4] - add x2, x2, #6 - subs w4, w4, #1 - str w5, [x0] - strh w6, [x0, #4] - add x0, x0, x1 - b.gt 30b - ret - -40: - add x7, x0, x1 - lsl x1, x1, #1 -42: - subs w4, w4, #2 - b.lt 41f - ld1 {v0.2d}, [x2], #16 - st1 {v0.d}[0], [x0], x1 - st1 {v0.d}[1], [x7], x1 - b.le 0f - b 42b -41: - ld1 {v0.4h}, [x2] - st1 {v0.4h}, [x0] -0: - ret - -50: - ldr x5, [x2] - ldrh w6, [x2, #8] - add x2, x2, #10 - subs w4, w4, #1 - str x5, [x0] - strh w6, [x0, #8] - add x0, x0, x1 - b.gt 50b - ret - -60: - ldr x5, [x2] - ldr w6, [x2, #8] - add x2, x2, #12 - subs w4, w4, #1 - str x5, [x0] - str w6, [x0, #8] - add x0, x0, x1 - b.gt 60b - ret - -70: - ldr x5, [x2] - ldr w6, [x2, #8] - ldrh w7, [x2, #12] - add x2, x2, #14 - subs w4, w4, #1 - str x5, [x0] - str w6, [x0, #8] - strh w7, [x0, #12] - add x0, x0, x1 - b.gt 70b - ret - -L(copy_narrow_tbl): - .hword 0 - .hword L(copy_narrow_tbl) - 10b - .hword L(copy_narrow_tbl) - 20b - .hword L(copy_narrow_tbl) - 30b - .hword L(copy_narrow_tbl) - 40b - .hword L(copy_narrow_tbl) - 50b - .hword L(copy_narrow_tbl) - 60b - .hword L(copy_narrow_tbl) - 70b -endfunc - #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" From 265b838fe5f4672ebf3ded0346e56cda443260a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 30 Dec 2020 11:32:11 +0200 Subject: [PATCH 099/155] arm: ipred: Improve schedulimg in cfl and dc This gives a speedup of around one cycle. --- src/arm/32/ipred.S | 18 +++++++++--------- src/arm/64/ipred.S | 18 +++++++++--------- src/arm/64/ipred16.S | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index e895696c7b..5f93c52ae8 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -600,10 +600,10 @@ L(ipred_dc_tbl): L(ipred_dc_h4): vld1.32 {d0[]}, [r2, :32]! vpaddl.u8 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w4): - add r2, r2, #1 vld1.32 {d1[]}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d1, d1 @@ -635,10 +635,10 @@ L(ipred_dc_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w8): - add r2, r2, #1 vld1.8 {d2}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d2, d2 @@ -672,10 +672,10 @@ L(ipred_dc_h16): vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w16): - add r2, r2, #1 vld1.8 {d2, d3}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 @@ -712,10 +712,10 @@ L(ipred_dc_h32): vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w32): - add r2, r2, #1 vld1.8 {d2, d3, d4, d5}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 @@ -760,10 +760,10 @@ L(ipred_dc_h64): vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 + add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w64): - add r2, r2, #1 vld1.8 {d2, d3, d4, d5}, [r2]! vadd.s16 d0, d0, d30 vaddl.u8 q2, d4, d5 @@ -2003,10 +2003,10 @@ L(ipred_cfl_tbl): L(ipred_cfl_h4): vld1.32 {d0[]}, [r2, :32]! 
vpaddl.u8 d0, d0 + add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w4): - add r2, r2, #1 vld1.32 {d1[]}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 @@ -2031,10 +2031,10 @@ L(ipred_cfl_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.i16 d0, d0 + add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w8): - add r2, r2, #1 vld1.8 {d1}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 @@ -2061,10 +2061,10 @@ L(ipred_cfl_h16): vaddl.u8 q0, d0, d1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 + add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w16): - add r2, r2, #1 vld1.8 {q2}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 @@ -2094,10 +2094,10 @@ L(ipred_cfl_h32): vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 + add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w32): - add r2, r2, #1 vld1.8 {q2, q3}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S index 4be84a1a26..e338f0d5b2 100644 --- a/src/arm/64/ipred.S +++ b/src/arm/64/ipred.S @@ -502,9 +502,9 @@ L(ipred_dc_h4): ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr uaddlv h0, v0.8b + add x2, x2, #1 br x3 L(ipred_dc_w4): - add x2, x2, #1 ld1 {v1.s}[0], [x2] ins v1.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -534,9 +534,9 @@ L(ipred_dc_w4): L(ipred_dc_h8): ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b + add x2, x2, #1 br x3 L(ipred_dc_w8): - add x2, x2, #1 ld1 {v1.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b @@ -565,9 +565,9 @@ L(ipred_dc_w8): L(ipred_dc_h16): ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b + add x2, x2, #1 br x3 L(ipred_dc_w16): - add x2, x2, #1 ld1 {v1.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -597,10 +597,10 @@ L(ipred_dc_h32): ld1 {v0.16b, v1.16b}, [x2], #32 uaddlv h0, v0.16b uaddlv h1, v1.16b + add x2, x2, #1 add v0.4h, v0.4h, v1.4h br x3 L(ipred_dc_w32): - add x2, x2, #1 ld1 {v1.16b, v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -637,10 +637,10 @@ L(ipred_dc_h64): uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h + add x2, x2, #1 add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): - add x2, x2, #1 ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b @@ -1834,10 +1834,10 @@ function ipred_cfl_8bpc_neon, export=1 L(ipred_cfl_h4): ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr + add x2, x2, #1 uaddlv h0, v0.8b br x9 L(ipred_cfl_w4): - add x2, x2, #1 ld1 {v2.s}[0], [x2] ins v2.s[1], wzr add v0.4h, v0.4h, v16.4h @@ -1860,9 +1860,9 @@ L(ipred_cfl_w4): L(ipred_cfl_h8): ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b + add x2, x2, #1 br x9 L(ipred_cfl_w8): - add x2, x2, #1 ld1 {v2.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b @@ -1884,9 +1884,9 @@ L(ipred_cfl_w8): L(ipred_cfl_h16): ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b + add x2, x2, #1 br x9 L(ipred_cfl_w16): - add x2, x2, #1 ld1 {v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b @@ -1909,10 +1909,10 @@ L(ipred_cfl_h32): ld1 {v2.16b, v3.16b}, [x2], #32 uaddlv h2, v2.16b uaddlv h3, v3.16b + add x2, x2, #1 add v0.4h, v2.4h, v3.4h br x9 L(ipred_cfl_w32): - add x2, x2, #1 ld1 {v2.16b, v3.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S index a3993d034a..43c910a257 100644 --- a/src/arm/64/ipred16.S +++ b/src/arm/64/ipred16.S @@ -562,9 +562,9 @@ function ipred_dc_16bpc_neon, export=1 L(ipred_dc_h4): ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h + add x2, x2, #2 br x3 L(ipred_dc_w4): - add x2, x2, #2 ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h @@ -594,9 +594,9 @@ L(ipred_dc_w4): L(ipred_dc_h8): ld1 {v0.8h}, [x2], 
#16 uaddlv s0, v0.8h + add x2, x2, #2 br x3 L(ipred_dc_w8): - add x2, x2, #2 ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h @@ -626,10 +626,10 @@ L(ipred_dc_w8): L(ipred_dc_h16): ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): - add x2, x2, #2 ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -663,10 +663,10 @@ L(ipred_dc_h32): addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w32): - add x2, x2, #2 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -709,10 +709,10 @@ L(ipred_dc_h64): addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h + add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w64): - add x2, x2, #2 ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h @@ -2143,9 +2143,9 @@ function ipred_cfl_16bpc_neon, export=1 L(ipred_cfl_h4): ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h + add x2, x2, #2 br x9 L(ipred_cfl_w4): - add x2, x2, #2 ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h @@ -2168,9 +2168,9 @@ L(ipred_cfl_w4): L(ipred_cfl_h8): ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h + add x2, x2, #2 br x9 L(ipred_cfl_w8): - add x2, x2, #2 ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h @@ -2193,10 +2193,10 @@ L(ipred_cfl_w8): L(ipred_cfl_h16): ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h + add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): - add x2, x2, #2 ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h @@ -2222,10 +2222,10 @@ L(ipred_cfl_h32): addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h + add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w32): - add x2, x2, #2 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h From 8dce3499c84f4c5c9a1f1e8b4bce3c649a092259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 12 Dec 2020 12:10:35 +0200 Subject: [PATCH 100/155] arm32: ipred: Fix various inefficiencies in the dc functions Also fix the location of one comment, to be consistent with other similar comments. --- src/arm/32/ipred.S | 55 +++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index 5f93c52ae8..f9b6d84398 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -40,8 +40,7 @@ function ipred_dc_128_8bpc_neon, export=1 adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 ldr r3, [r2, r3, lsl #2] - mov lr, #128 - vdup.8 q0, lr + vmov.i8 q0, #128 add r2, r2, r3 add r12, r0, r1 lsl r1, r1, #1 @@ -79,7 +78,7 @@ L(ipred_dc_128_tbl): bgt 16b pop {r4, pc} 320: - vdup.8 q1, lr + vmov.i8 q1, #128 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 @@ -89,20 +88,18 @@ L(ipred_dc_128_tbl): bgt 32b pop {r4, pc} 640: - vdup.8 q1, lr - vdup.8 q2, lr - vdup.8 q3, lr + vmov.i8 q1, #128 sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
- vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc @@ -401,19 +398,17 @@ L(ipred_dc_top_tbl): vrshrn.u16 d18, q0, #6 vdup.8 q0, d18[0] vdup.8 q1, d18[0] - vdup.8 q2, d18[0] - vdup.8 q3, d18[0] sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc @@ -538,20 +533,18 @@ L(ipred_dc_left_h64): vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w64): - sub r1, r1, #32 vmov.8 q1, q0 - vmov.8 q2, q0 - vmov.8 q3, q0 + sub r1, r1, #32 1: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} endfunc @@ -789,11 +782,11 @@ L(ipred_dc_w64): vadd.s16 d0, d0, d2 vadd.s16 d0, d0, d3 vshl.u16 d18, d0, d28 - beq 1f // h = 16/32 + beq 1f + // h = 16/32 movw lr, #(0x5556/2) movt lr, #(0x3334/2) - mov r5, r4 - and r5, r5, #31 + and r5, r4, #31 lsr lr, lr, r5 vdup.16 d30, lr vqdmulh.s16 d18, d18, d30 @@ -801,18 +794,16 @@ L(ipred_dc_w64): sub r1, r1, #32 vdup.8 q0, d18[0] vdup.8 q1, d18[0] - vdup.8 q2, d18[0] - vdup.8 q3, d18[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! - vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
- vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 - vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} endfunc From 72480652414fa87e893240530d8b7f02f0d9288f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 24 Dec 2020 01:37:08 +0200 Subject: [PATCH 101/155] arm64: ipred16: Remove a leftover instruction --- src/arm/64/ipred16.S | 1 - 1 file changed, 1 deletion(-) diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S index 43c910a257..f6d7dd2b6b 100644 --- a/src/arm/64/ipred16.S +++ b/src/arm/64/ipred16.S @@ -1421,7 +1421,6 @@ function ipred_filter_\bpc\()bpc_neon smin v2.8h, v2.8h, v31.8h subs w4, w4, #2 st1 {v2.d}[0], [x0], x1 - uxtl v0.8h, v2.8b ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] st1 {v2.d}[1], [x6], x1 b.gt 4b From ccea2a7d00642c0186739165d31fe17a7f5865be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 24 Dec 2020 01:38:17 +0200 Subject: [PATCH 102/155] arm32: ipred: Use a simpler instruction in filter w4 --- src/arm/32/ipred.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index f9b6d84398..8ab0060419 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -1464,7 +1464,7 @@ L(ipred_filter_tbl): vst1.32 {d4[0]}, [r0, :32], r1 vmovl.u8 q0, d4 vst1.32 {d4[1]}, [r6, :32], r1 - vext.8 q0, q0, q0, #8 // move top from [4-7] to [0-3] + vmov d0, d1 // move top from [4-7] to [0-3] bgt 4b pop {r4-r8, pc} 80: From fb9a3959b42ffbbb61b038ec8b9841f7fbfa2750 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 28 Dec 2020 11:30:38 +0200 Subject: [PATCH 103/155] arm64: ipred16: Remove an unnecessary instruction from the 12 bpc version of filter --- src/arm/64/ipred16.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/arm/64/ipred16.S b/src/arm/64/ipred16.S index f6d7dd2b6b..72a369fca6 100644 --- a/src/arm/64/ipred16.S +++ b/src/arm/64/ipred16.S @@ -1382,7 +1382,9 @@ function ipred_filter_\bpc\()bpc_neon sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b dup v31.8h, w8 +.if \bpc == 10 movi v30.8h, #0 +.endif br x5 40: ldur d0, [x2, #2] // top (0-3) From a8ea06002e219f90247a1bab4d448ca660fe0317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 26 Dec 2020 00:55:55 +0200 Subject: [PATCH 104/155] arm32: ipred: Hoist a few shared instructions into common code This might cause a slowdown of around one cycle on some cores, as the instructions were placed in a latency bubble before though, but simplify the code by moving them to the header where they'd normally be. 
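
A rough sketch of the pattern in C (illustrative only, not dav1d code): setup
that every width-specific branch used to repeat is now done once in the shared
prologue, ahead of the dispatch, which is where the possible one-cycle cost
mentioned above can show up.

    #include <stddef.h>
    #include <stdint.h>

    static void pred_sketch(uint8_t *dst, const uint8_t *topleft,
                            ptrdiff_t stride, int width, int height)
    {
        /* Hoisted setup, done once instead of at the top of each
         * width-specific block (the C analogue of moving
         * "sub r2, r2, #2" / "mov r7, #-2" ahead of the dispatch). */
        const uint8_t *left = topleft - 2;
        const ptrdiff_t left_step = -2;

        switch (width) {            /* stands in for the jump table */
        case 4:
            for (int y = 0; y < height; y++, dst += stride, left += left_step)
                dst[0] = left[0];   /* placeholder for the real w4 loop */
            break;
        case 8:
            for (int y = 0; y < height; y++, dst += stride, left += left_step)
                dst[0] = left[0];   /* placeholder for the real w8 loop */
            break;
        default:
            break;
        }
    }

    int main(void)
    {
        uint8_t dst[8 * 8] = {0}, edge[64] = {0};
        pred_sketch(dst, edge + 32, 8, 4, 4);
        return 0;
    }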
--- src/arm/32/ipred.S | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index 8ab0060419..d37668b4f8 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -1435,6 +1435,8 @@ function ipred_filter_8bpc_neon, export=1 vmovl.s8 q13, d28 vmovl.s8 q14, d29 add r8, r2, #1 + sub r2, r2, #2 + mov r7, #-2 bx r5 .align 2 @@ -1446,8 +1448,6 @@ L(ipred_filter_tbl): 40: vld1.32 {d0[]}, [r8] // top (0-3) - sub r2, r2, #2 - mov r7, #-2 vmovl.u8 q0, d0 // top (0-3) 4: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) @@ -1469,8 +1469,6 @@ L(ipred_filter_tbl): pop {r4-r8, pc} 80: vld1.8 {d0}, [r8] // top (0-7) - sub r2, r2, #2 - mov r7, #-2 vmovl.u8 q0, d0 // top (0-7) 8: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) @@ -1502,8 +1500,6 @@ L(ipred_filter_tbl): 160: 320: vpush {q4-q5} - sub r2, r2, #2 - mov r7, #-2 sub r1, r1, r3 mov lr, r3 From 47615e07e9e0a9fa4775901b3e81323bf47ba1bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 31 Dec 2020 22:42:11 +0200 Subject: [PATCH 105/155] arm32: ipred: Fix the element size declarations in stores in filter w8 --- src/arm/32/ipred.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arm/32/ipred.S b/src/arm/32/ipred.S index d37668b4f8..ff55d95d4a 100644 --- a/src/arm/32/ipred.S +++ b/src/arm/32/ipred.S @@ -1492,9 +1492,9 @@ L(ipred_filter_tbl): vqrshrun.s16 d5, q3, #4 vzip.32 d4, d5 subs r4, r4, #2 - vst1.64 {d4}, [r0, :64], r1 + vst1.8 {d4}, [r0, :64], r1 vmovl.u8 q0, d5 - vst1.64 {d5}, [r6, :64], r1 + vst1.8 {d5}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: From e25b712b23a3db5808f6389f7c37639d848e6cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 12 Dec 2020 00:51:06 +0200 Subject: [PATCH 106/155] arm32: ipred: NEON implementation of ipred functions for 16 bpc Samples of some checkasm benchmarks: Cortex A7 A8 A53 A72 A73 cfl_ac_420_w4_16bpc_neon: 258.2 130.0 187.8 88.1 99.9 cfl_ac_420_w8_16bpc_neon: 396.3 192.3 278.0 134.1 148.1 cfl_ac_420_w16_16bpc_neon: 705.9 341.5 508.4 231.2 263.0 intra_pred_filter_w32_10bpc_neon: 3450.6 3279.7 1505.6 1716.8 1631.0 intra_pred_filter_w32_12bpc_neon: 5075.2 2467.3 2027.9 1605.7 1556.0 intra_pred_paeth_w64_16bpc_neon: 7850.6 4682.9 4538.4 4640.4 4952.4 intra_pred_smooth_w64_16bpc_neon: 6807.7 4044.0 4001.4 3001.9 3131.5 Corresponding numbers for arm64: Cortex A53 A72 A73 cfl_ac_420_w4_16bpc_neon: 154.8 87.1 81.6 cfl_ac_420_w8_16bpc_neon: 235.6 124.8 133.0 cfl_ac_420_w16_16bpc_neon: 428.8 206.5 234.9 intra_pred_filter_w32_10bpc_neon: 1333.2 1485.9 1468.3 intra_pred_filter_w32_12bpc_neon: 1839.1 1429.0 1439.7 intra_pred_paeth_w64_16bpc_neon: 3691.1 3091.8 3289.7 intra_pred_smooth_w64_16bpc_neon: 3776.8 3124.4 2827.1 --- src/arm/32/ipred16.S | 3253 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3253 insertions(+) create mode 100644 src/arm/32/ipred16.S diff --git a/src/arm/32/ipred16.S b/src/arm/32/ipred16.S new file mode 100644 index 0000000000..eb2efe0188 --- /dev/null +++ b/src/arm/32/ipred16.S @@ -0,0 +1,3253 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, B Krishnan Iyer + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + push {r4, lr} + ldr r4, [sp, #8] + ldr r12, [sp, #24] + clz r3, r3 + adr r2, L(ipred_dc_128_tbl) + sub r3, r3, #25 + vdup.16 q0, r12 + ldr r3, [r2, r3, lsl #2] + add r12, r0, r1 + vrshr.u16 q0, q0, #1 + add r2, r2, r3 + lsl r1, r1, #1 + bx r2 + + .align 2 +L(ipred_dc_128_tbl): + .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB + .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vmov q1, q0 +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vmov q1, q0 + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vmov q1, q0 + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + push {r4, lr} + ldr lr, [sp, #8] + clz r3, r3 + adr r4, L(ipred_v_tbl) + sub r3, r3, #25 + ldr r3, [r4, r3, lsl #2] + add r2, r2, #2 + add r4, r4, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r4 + + .align 2 +L(ipred_v_tbl): + .word 640f - L(ipred_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs lr, lr, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4, pc} +80: + vld1.16 {q0}, [r2] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4, pc} +160: + vld1.16 {q0, q1}, [r2] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4, pc} +320: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #32 + vld1.16 {q2, q3}, [r2] +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + subs lr, lr, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 + vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 + bgt 32b + pop {r4, pc} +640: + vld1.16 {q0, q1}, [r2]! + sub r1, r1, #96 + vld1.16 {q2, q3}, [r2]! + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2]! +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d4, d5, d6, d7}, [r0, :128]! + vst1.16 {d4, d5, d6, d7}, [r12, :128]! + subs lr, lr, #2 + vst1.16 {d16, d17, d18, d19}, [r0, :128]! + vst1.16 {d16, d17, d18, d19}, [r12, :128]! 
+ vst1.16 {d20, d21, d22, d23}, [r0, :128], r1 + vst1.16 {d20, d21, d22, d23}, [r12, :128], r1 + bgt 64b + pop {r4, pc} +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_h_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + sub r2, r2, #2 + mov lr, #-2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_h_tbl): + .word 640f - L(ipred_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_h_tbl) + CONFIG_THUMB + .word 8f - L(ipred_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_h_tbl) + CONFIG_THUMB +40: + sub r2, r2, #6 + mov lr, #-8 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr + vst1.16 {d3}, [r0, :64], r1 + vst1.16 {d2}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +8: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128], r1 + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + sub r1, r1, #16 +16: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + sub r1, r1, #48 +32: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #4 + vld1.16 {d2[], d3[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vld1.16 {d4[], d5[]}, [r2], lr + vst1.16 {q1}, [r12, :128]! + vld1.16 {d6[], d7[]}, [r2], lr + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128]! + vst1.16 {q1}, [r12, :128]! + vst1.16 {q0}, [r0, :128], r1 + vst1.16 {q1}, [r12, :128], r1 + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128]! + vst1.16 {q3}, [r12, :128]! + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.16 {d0[], d1[]}, [r2], lr + subs r4, r4, #2 + vld1.16 {d4[], d5[]}, [r2], lr + vmov q1, q0 + vmov q3, q2 + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! + vst1.16 {q0, q1}, [r0, :128]! + vst1.16 {q2, q3}, [r12, :128]! 
+ vst1.16 {q0, q1}, [r0, :128], r1 + vst1.16 {q2, q3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + clz r3, r3 + adr r5, L(ipred_dc_top_tbl) + sub r3, r3, #25 + ldr r3, [r5, r3, lsl #2] + add r2, r2, #2 + add r5, r5, r3 + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_top_tbl): + .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB + .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 d0, d0[0] +4: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + vld1.16 {d0, d1}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] +8: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + vld1.16 {d0, d1, d2, d3}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d4, d0, #4 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +16: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d18, q0, #5 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #32 +32: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + vld1.16 {d0, d1, d2, d3}, [r2]! + vld1.16 {d4, d5, d6, d7}, [r2]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d18, q0, #6 + vdup.16 q0, d18[0] + vdup.16 q1, d18[0] + sub r1, r1, #96 +64: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + sub r2, r2, r4, lsl #1 + clz r3, r3 + clz lr, r4 + sub lr, lr, #25 + adr r5, L(ipred_dc_left_tbl) + sub r3, r3, #20 + ldr r3, [r5, r3, lsl #2] + ldr lr, [r5, lr, lsl #2] + add r3, r5, r3 + add r5, r5, lr + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_left_tbl): + .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB + .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB + +L(ipred_dc_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w4): + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt L(ipred_dc_left_w4) + pop {r4-r5, pc} +L(ipred_dc_left_h8): + vld1.16 {d0, d1}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w8): + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt L(ipred_dc_left_w8) + pop {r4-r5, pc} +L(ipred_dc_left_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w16): + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128] + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r3 +L(ipred_dc_left_w32): + sub r1, r1, #32 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +L(ipred_dc_left_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! 
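+        // Keep folding the remaining left-edge pixels; the 64-pixel total is rounded and shifted right by 6 below to form the DC value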
+ vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + vpadd.i32 d0, d0, d0 + vrshrn.i32 d0, q0, #6 + vdup.16 q0, d0[0] +L(ipred_dc_left_w64): + sub r1, r1, #96 + vmov q1, q0 +1: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 1b + pop {r4-r5, pc} +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + push {r4-r6, lr} + ldr r4, [sp, #16] + sub r2, r2, r4, lsl #1 + add lr, r3, r4 // width + height + clz r3, r3 + clz r12, r4 + vdup.32 q15, lr // width + height + adr r5, L(ipred_dc_tbl) + rbit lr, lr // rbit(width + height) + sub r3, r3, #20 // 25 leading bits, minus table offset 5 + sub r12, r12, #25 + clz lr, lr // ctz(width + height) + ldr r3, [r5, r3, lsl #2] + ldr r12, [r5, r12, lsl #2] + neg lr, lr // -ctz(width + height) + add r3, r5, r3 + add r5, r5, r12 + vshr.u32 q15, q15, #1 // (width + height) >> 1 + vdup.32 q14, lr // -ctz(width + height) + add r12, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_dc_tbl): + .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB + .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB + +L(ipred_dc_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w4): + vld1.16 {d2}, [r2] + vadd.i32 d0, d0, d30 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #4 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 d0, d0[0] +2: + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + subs r4, r4, #4 + vst1.16 {d0}, [r0, :64], r1 + vst1.16 {d0}, [r12, :64], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h8): + vld1.16 {d0, d1}, [r2, :128]! 
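+        // Reduce the 8 left pixels to a single sum, step past the top-left sample (r2 += 2) and jump via r3 to the width-specific half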
+ vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w8): + vld1.16 {d2, d3}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #8 + vadd.i32 d0, d0, d2 + vshl.u32 d0, d0, d28 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d0, d0, d24 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] +2: + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1}, [r0, :128], r1 + vst1.16 {d0, d1}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h16): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w16): + vld1.16 {d2, d3, d4, d5}, [r2] + vadd.i32 d0, d0, d30 + vadd.i16 q1, q1, q2 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d1 + vpaddl.u16 d2, d2 + cmp r4, #16 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} + +L(ipred_dc_h32): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vadd.i16 q0, q0, q2 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r3 +L(ipred_dc_w32): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2] + vadd.i16 q1, q1, q2 + vadd.i16 q8, q8, q9 + vadd.i16 q1, q1, q8 + vadd.i16 d2, d2, d3 + vpadd.i16 d2, d2, d2 + vpaddl.u16 d2, d2 + cmp r4, #32 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #32 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + subs r4, r4, #4 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +L(ipred_dc_h64): + vld1.16 {d0, d1, d2, d3}, [r2, :128]! + vld1.16 {d4, d5, d6, d7}, [r2, :128]! + vadd.i16 q0, q0, q1 + vld1.16 {d16, d17, d18, d19}, [r2, :128]! + vadd.i16 q2, q2, q3 + vld1.16 {d20, d21, d22, d23}, [r2, :128]! + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q0, q2 + vadd.i16 q8, q8, q10 + vadd.i16 q0, q0, q8 + vadd.i16 d0, d0, d1 + vpaddl.u16 d0, d0 + add r2, r2, #2 + vpadd.i32 d0, d0, d0 + bx r3 +L(ipred_dc_w64): + vld1.16 {d2, d3, d4, d5}, [r2]! + vadd.i32 d0, d0, d30 + vld1.16 {d16, d17, d18, d19}, [r2]! + vadd.i16 q1, q1, q2 + vld1.16 {d20, d21, d22, d23}, [r2]! + vadd.i16 q8, q8, q9 + vld1.16 {d24, d25, d26, d27}, [r2]! 
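+        // Finish summing the 64 top pixels, combine with the left sum in d0 (which already includes the (w+h)/2 rounding bias) and divide; the fixed-point multiply below handles non-power-of-two width+height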
+ vadd.i16 q10, q10, q11 + vadd.i16 q12, q12, q13 + vadd.i16 q1, q1, q8 + vadd.i16 q10, q10, q12 + vadd.i16 q1, q1, q10 + vadd.i16 d2, d2, d3 + vpaddl.u16 d2, d2 + vpadd.i32 d2, d2, d2 + cmp r4, #64 + vadd.i32 d0, d0, d2 + vshl.u32 d4, d0, d28 + beq 1f + // h = 16/32 + cmp r4, #16 + movw lr, #0x6667 + movw r5, #0xAAAB + it ne + movne lr, r5 + vdup.32 d24, lr + vmul.i32 d4, d4, d24 + vshr.u32 d4, d4, #17 +1: + sub r1, r1, #96 + vdup.16 q0, d4[0] + vdup.16 q1, d4[0] +2: + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + subs r4, r4, #2 + vst1.16 {d0, d1, d2, d3}, [r0, :128]! + vst1.16 {d0, d1, d2, d3}, [r12, :128]! + vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 + vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 + bgt 2b + pop {r4-r6, pc} +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + push {r4-r6, lr} + vpush {q4} + ldr r4, [sp, #32] + clz lr, r3 + adr r12, L(ipred_paeth_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r2] + add r6, r2, #2 + sub r2, r2, #4 + add r12, r12, lr + mov r5, #-4 + add lr, r0, r1 + lsl r1, r1, #1 + bx r12 + + .align 2 +L(ipred_paeth_tbl): + .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB + +40: + sub r2, r2, #4 + mov r5, #-8 + vld1.16 {d6}, [r6] + vsub.i16 d16, d6, d4 // top - topleft + vmov d7, d6 + vmov d17, d16 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5 + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + vst1.16 {d25}, [r0, :64], r1 + vst1.16 {d24}, [lr, :64], r1 + subs r4, r4, #4 + vst1.16 {d23}, [r0, :64], r1 + vst1.16 {d22}, [lr, :64], r1 + bgt 4b + vpop {q4} + pop {r4-r6, pc} +80: +160: +320: +640: + vld1.16 {q3}, [r6]! + mov r12, r3 + sub r1, r1, r3, lsl #1 +1: + vld2.16 {d0[], d2[]}, [r2, :32], r5 + vmov d1, d0 + vmov d3, d2 +2: + vsub.i16 q8, q3, q2 // top - topleft + vadd.i16 q9, q8, q0 // base + vadd.i16 q10, q8, q1 + vabd.s16 q11, q3, q9 // tdiff + vabd.s16 q12, q3, q10 + vabd.s16 q13, q2, q9 // tldiff + vabd.s16 q14, q2, q10 + vabd.s16 q9, q0, q9 // ldiff + vabd.s16 q10, q1, q10 + vmin.u16 q15, q11, q13 // min(tdiff, tldiff) + vmin.u16 q4, q12, q14 + vcge.u16 q11, q13, q11 // tldiff >= tdiff + vcge.u16 q12, q14, q12 + vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff + vcge.u16 q10, q4, q10 + vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft + vbsl q11, q3, q2 + vbit q12, q1, q10 // ldiff <= min ? left : ... + vbit q11, q0, q9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q11}, [lr, :128]! + ble 8f + vld1.16 {q3}, [r6]! 
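+        // More columns remain in this row pair: the next 8 top pixels were just loaded, so run the Paeth selection again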
+ b 2b +8: + subs r4, r4, #2 + ble 9f + // End of horizontal loop, move pointers to next two rows + sub r6, r6, r12, lsl #1 + add r0, r0, r1 + add lr, lr, r1 + vld1.16 {q3}, [r6]! + mov r3, r12 + b 1b +9: + vpop {q4} + pop {r4-r6, pc} +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + push {r4-r10, lr} + ldr r4, [sp, #32] + movrel r10, X(sm_weights) + add r12, r10, r4 + add r10, r10, r3 + clz r9, r3 + adr r5, L(ipred_smooth_tbl) + sub lr, r2, r4, lsl #1 + sub r9, r9, #25 + ldr r9, [r5, r9, lsl #2] + vld1.16 {d4[], d5[]}, [lr] // bottom + add r8, r2, #2 + add r5, r5, r9 + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_tbl): + .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB + +40: + vld1.16 {d16}, [r8] // top + vld1.32 {d18[]}, [r10, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vdup.16 q3, d16[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d19, d4, d6 // bottom+right +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver + vshll.u16 q12, d19, #8 // (bottom+right)*256 + vshll.u16 q13, d19, #8 + vshll.u16 q14, d19, #8 + vshll.u16 q15, d19, #8 + vzip.32 d20, d21 // weights_ver + vzip.32 d22, d23 + vsub.i16 q1, q1, q3 // left-right + vsub.i16 q0, q0, q3 + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d2, d18 // (left flipped) + vmlal.s16 q14, d1, d18 + vmlal.s16 q15, d0, d18 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d16, d21 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d16, d23 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + vst1.16 {d24}, [r0, :64], r1 + vst1.16 {d25}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d26}, [r0, :64], r1 + vst1.16 {d27}, [r6, :64], r1 + bgt 4b + pop {r4-r10, pc} +80: + vld1.16 {q8}, [r8] // top + vld1.8 {d18}, [r10, :64] // weights_hor + sub r2, r2, #4 + mov r7, #-4 + vdup.16 q3, d17[3] // right + vsub.i16 q8, q8, q2 // top-bottom + vmovl.u8 q9, d18 // weights_hor + vadd.i16 d3, d4, d6 // bottom+right +8: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! 
// weights_ver + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r4, r4, #2 + vst1.16 {q12}, [r0, :128], r1 + vst1.16 {q13}, [r6, :128], r1 + bgt 8b + pop {r4-r10, pc} +160: +320: +640: + add lr, r2, r3, lsl #1 + sub r2, r2, #4 + mov r7, #-4 + vld1.16 {d6[], d7[]}, [lr] // right + sub r1, r1, r3, lsl #1 + mov r9, r3 + vadd.i16 d3, d4, d6 // bottom+right + +1: + vld2.16 {d0[], d1[]}, [r2, :32], r7 // left + vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver + vsub.i16 q0, q0, q3 // left-right + vmovl.u8 q10, d20 // weights_ver + vmovl.u8 q11, d22 +2: + vld1.8 {d18}, [r10, :64]! // weights_hor + vld1.16 {q8}, [r8]! // top + vshll.u16 q12, d3, #8 // (bottom+right)*256 + vshll.u16 q13, d3, #8 + vmovl.u8 q9, d18 // weights_hor + vshll.u16 q14, d3, #8 + vshll.u16 q15, d3, #8 + vsub.i16 q8, q8, q2 // top-bottom + vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor + vmlal.s16 q13, d1, d19 // (left flipped) + vmlal.s16 q14, d0, d18 + vmlal.s16 q15, d0, d19 + vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver + vmlal.s16 q13, d17, d20 + vmlal.s16 q14, d16, d22 + vmlal.s16 q15, d17, d22 + vrshrn.i32 d24, q12, #9 + vrshrn.i32 d25, q13, #9 + vrshrn.i32 d26, q14, #9 + vrshrn.i32 d27, q15, #9 + subs r3, r3, #8 + vst1.16 {q12}, [r0, :128]! + vst1.16 {q13}, [r6, :128]! + bgt 2b + subs r4, r4, #2 + ble 9f + sub r8, r8, r9, lsl #1 + sub r10, r10, r9 + add r0, r0, r1 + add r6, r6, r1 + mov r3, r9 + b 1b +9: + pop {r4-r10, pc} +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + push {r4-r7, lr} + ldr r4, [sp, #20] + movrel r7, X(sm_weights) + add r7, r7, r4 + clz lr, r3 + adr r5, L(ipred_smooth_v_tbl) + sub r12, r2, r4, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // bottom + add r2, r2, #2 + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_v_tbl): + .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB + +40: + vld1.16 {d6}, [r2] // top + vsub.i16 d6, d6, d4 // top-bottom + vmov d7, d6 +4: + vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! 
// weights_ver + vzip.32 d16, d17 // weights_ver + vzip.32 d18, d19 + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q11, q3, q9 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {d20}, [r0, :64], r1 + vst1.16 {d21}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d22}, [r0, :64], r1 + vst1.16 {d23}, [r6, :64], r1 + bgt 4b + pop {r4-r7, pc} +80: + vld1.16 {q3}, [r2] // top + vsub.i16 q3, q3, q2 // top-bottom +8: + vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver + vshll.u8 q8, d16, #7 // weights_ver << 7 + vshll.u8 q9, d18, #7 + vshll.u8 q10, d20, #7 + vshll.u8 q11, d22, #7 + vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q3, q9 + vqrdmulh.s16 q10, q3, q10 + vqrdmulh.s16 q11, q3, q11 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + subs r4, r4, #4 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r7, pc} +160: +320: +640: + vpush {q4-q7} + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver + vshll.u8 q4, d8, #7 // weights_ver << 7 + vshll.u8 q5, d10, #7 + vshll.u8 q6, d12, #7 + vshll.u8 q7, d14, #7 +2: + vld1.16 {q0, q1}, [r2]! // top + vsub.i16 q0, q0, q2 // top-bottom + vsub.i16 q1, q1, q2 + vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + subs r3, r3, #16 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
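+        // 16 columns done for these four rows; loop while r3 still has columns, then rewind the top pointer for the next group of rows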
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r2, r2, r12, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r7, pc} +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + push {r4-r8, lr} + ldr r4, [sp, #24] + movrel r8, X(sm_weights) + add r8, r8, r3 + clz lr, r3 + adr r5, L(ipred_smooth_h_tbl) + add r12, r2, r3, lsl #1 + sub lr, lr, #25 + ldr lr, [r5, lr, lsl #2] + vld1.16 {d4[], d5[]}, [r12] // right + add r5, r5, lr + add r6, r0, r1 + lsl r1, r1, #1 + bx r5 + + .align 2 +L(ipred_smooth_h_tbl): + .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB + +40: + vld1.32 {d6[]}, [r8, :32] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +4: + vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left + vsub.i16 q0, q0, q2 // left-right + vsub.i16 q1, q1, q2 + subs r4, r4, #4 + vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q0, q3 // (left flipped) + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d16}, [r6, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + vst1.16 {d18}, [r6, :64], r1 + bgt 4b + pop {r4-r8, pc} +80: + vld1.8 {d6}, [r8, :64] // weights_hor + sub r2, r2, #8 + mov r7, #-8 + vshll.u8 q3, d6, #7 // weights_hor << 7 +8: + vld1.16 {d23}, [r2, :64], r7 // left + subs r4, r4, #4 + vsub.i16 d23, d23, d4 // left-right + vdup.16 q8, d23[3] // flip left + vdup.16 q9, d23[2] + vdup.16 q10, d23[1] + vdup.16 q11, d23[0] + vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q9, q3 + vqrdmulh.s16 q10, q10, q3 + vqrdmulh.s16 q11, q11, q3 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vst1.16 {q8}, [r0, :128], r1 + vst1.16 {q9}, [r6, :128], r1 + vst1.16 {q10}, [r0, :128], r1 + vst1.16 {q11}, [r6, :128], r1 + bgt 8b + pop {r4-r8, pc} +160: +320: +640: + vpush {q4-q7} + sub r2, r2, #8 + mov r7, #-8 + // Set up pointers for four rows in parallel; r0, r6, r5, lr + add r5, r0, r1 + add lr, r6, r1 + lsl r1, r1, #1 + sub r1, r1, r3, lsl #1 + mov r12, r3 + +1: + vld1.16 {d15}, [r2, :64], r7 // left + vsub.i16 d15, d15, d4 // left-right + vdup.16 q4, d15[3] // flip left + vdup.16 q5, d15[2] + vdup.16 q6, d15[1] + vdup.16 q7, d15[0] +2: + vld1.8 {q1}, [r8, :128]! // weights_hor + subs r3, r3, #16 + vshll.u8 q0, d2, #7 // weights_hor << 7 + vshll.u8 q1, d3, #7 + vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8 + vqrdmulh.s16 q9, q1, q4 + vqrdmulh.s16 q10, q0, q5 + vqrdmulh.s16 q11, q1, q5 + vqrdmulh.s16 q12, q0, q6 + vqrdmulh.s16 q13, q1, q6 + vqrdmulh.s16 q14, q0, q7 + vqrdmulh.s16 q15, q1, q7 + vadd.i16 q8, q8, q2 + vadd.i16 q9, q9, q2 + vadd.i16 q10, q10, q2 + vadd.i16 q11, q11, q2 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q2 + vadd.i16 q14, q14, q2 + vadd.i16 q15, q15, q2 + vst1.16 {q8, q9}, [r0, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r5, :128]! + vst1.16 {q14, q15}, [lr, :128]! 
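+        // Same pattern with the horizontal weights: iterate over the remaining columns, then the weight pointer r8 is rewound before the next four rows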
+ bgt 2b + subs r4, r4, #4 + ble 9f + sub r8, r8, r12 + add r0, r0, r1 + add r6, r6, r1 + add r5, r5, r1 + add lr, lr, r1 + mov r3, r12 + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon, export=1 + movw r12, #511 + ldrd r4, r5, [sp, #88] + and r5, r5, r12 // 511 + movrel r6, X(filter_intra_taps) + lsl r5, r5, #6 + add r6, r6, r5 + vld1.8 {d20, d21, d22, d23}, [r6, :128]! + clz lr, r3 + adr r5, L(ipred_filter\bpc\()_tbl) + vld1.8 {d27, d28, d29}, [r6, :64] + sub lr, lr, #26 + ldr lr, [r5, lr, lsl #2] + vmovl.s8 q8, d20 + vmovl.s8 q9, d21 + add r5, r5, lr + vmovl.s8 q10, d22 + vmovl.s8 q11, d23 + add r6, r0, r1 + lsl r1, r1, #1 + vmovl.s8 q12, d27 + vmovl.s8 q13, d28 + vmovl.s8 q14, d29 + mov r7, #-4 + vdup.16 q15, r8 + add r8, r2, #2 + sub r2, r2, #4 +.if \bpc == 10 + vmov.i16 q7, #0 +.endif + bx r5 + + .align 2 +L(ipred_filter\bpc\()_tbl): + .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB + +40: + vld1.16 {d0}, [r8] // top (0-3) +4: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vrshr.s16 q2, q2, #4 + vmax.s16 q2, q2, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vqrshrun.s32 d5, q3, #4 +.endif + vmin.s16 q2, q2, q15 + subs r4, r4, #2 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + vmov d0, d5 // move top from [4-7] to [0-3] + bgt 4b + vpop {q4-q7} + pop {r4-r8, pc} +80: + vld1.16 {q0}, [r8] // top (0-7) +8: + vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) +.if \bpc == 10 + vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) + vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) + vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) + vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) + vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) + vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) + vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) + vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) + vrshr.s16 
q2, q2, #4 + vmax.s16 q2, q2, q7 + vmin.s16 q2, q2, q15 + vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) + vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 +.else + vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) + vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) + vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) + vqrshrun.s32 d4, q2, #4 + vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d5, q3, #4 + vmin.s16 q2, q2, q15 + vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6) + vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2) + vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d6, q4, #4 + vqrshrun.s32 d7, q5, #4 +.endif + vmin.s16 q3, q3, q15 + vswp d5, d6 + subs r4, r4, #2 + vst1.16 {q2}, [r0, :128], r1 + vmov q0, q3 + vst1.16 {q3}, [r6, :128], r1 + bgt 8b + vpop {q4-q7} + pop {r4-r8, pc} +160: +320: + sub r1, r1, r3, lsl #1 + mov lr, r3 + +1: + vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2) +2: + vld1.16 {q1, q2}, [r8]! 
// top(0-15) +.if \bpc == 10 + vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) + vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) + vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) + vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) + vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) + vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) + vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) + + vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) + vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) + vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) + vrshr.s16 q3, q3, #4 + vmax.s16 q3, q3, q7 + vmin.s16 q3, q3, q15 + vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) + vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) + vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5) + vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6) + + vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) + vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) + vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) + vrshr.s16 q4, q4, #4 + vmax.s16 q4, q4, q7 + vmin.s16 q4, q4, q15 + vmov q0, q4 + vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) + vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) + vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) + + vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1) + vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2) + vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3) + vrshr.s16 q5, q5, #4 + vmax.s16 q5, q5, q7 + vmin.s16 q5, q5, q15 + vmov q0, q5 + vmov.u16 r12, d5[3] + vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4) + vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0) + vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5) + vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + subs r3, r3, #16 + vrshr.s16 q6, q6, #4 +.else + vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4) + vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0) + vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5) + vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6) + vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1) + vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2) + vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3) + vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4) + vqrshrun.s32 d6, q3, #4 + vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2) + vqrshrun.s32 d7, q4, #4 + vmin.s16 q3, q3, q15 + vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6) + vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d8, q5, #4 + vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d22, d4[2] // 
p3(top[2]) * filter(3) + vqrshrun.s32 d9, q6, #4 + vmin.s16 q0, q4, q15 + vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d16, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q4, q4, q15 + vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1) + vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2) + vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3) + vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4) + vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0) + vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6) + vqrshrun.s32 d10, q7, #4 + vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3) + vqrshrun.s32 d11, q6, #4 + vmin.s16 q0, q5, q15 + vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6) + vmin.s16 q5, q5, q15 + vmov.u16 r12, d5[3] + vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1) + vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2) + vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3) + vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4) + vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0) + vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5) + vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6) + vmov.16 d0[2], r12 + vqrshrun.s32 d12, q1, #4 + subs r3, r3, #16 + vqrshrun.s32 d13, q7, #4 +.endif + vswp q4, q5 +.if \bpc == 10 + vmax.s16 q6, q6, q7 +.endif + vswp d7, d10 + vmin.s16 q6, q6, q15 + + vswp d9, d12 + + vst1.16 {q3, q4}, [r0, :128]! + vst1.16 {q5, q6}, [r6, :128]! + ble 8f + vmov.u16 r12, d13[3] + vmov.16 d0[0], r12 + vmov.u16 r12, d9[3] + vmov.16 d0[1], r12 + b 2b +8: + subs r4, r4, #2 + + ble 9f + sub r8, r6, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + b 1b +9: + vpop {q4-q7} + pop {r4-r8, pc} +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + push {r4-r8, lr} + vpush {q4-q7} + movw r12, 0x3ff + ldr r8, [sp, #104] + cmp r8, r12 + ble ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + push {r4-r5, lr} + ldr r4, [sp, #12] + ldr r5, [sp, #16] + vld1.16 {q14}, [r2, :128] + clz lr, r4 + adr r12, L(pal_pred_tbl) + sub lr, lr, #25 + ldr lr, [r12, lr, lsl #2] + vmov.i16 q15, #0x100 + add r12, r12, lr + add r2, r0, r1 + bx r12 + + .align 2 +L(pal_pred_tbl): + .word 640f - L(pal_pred_tbl) + CONFIG_THUMB + .word 320f - L(pal_pred_tbl) + CONFIG_THUMB + .word 160f - L(pal_pred_tbl) + CONFIG_THUMB + .word 80f - L(pal_pred_tbl) + CONFIG_THUMB + .word 40f - L(pal_pred_tbl) + CONFIG_THUMB + +40: + lsl r1, r1, #1 +4: + vld1.8 {q1}, [r3, :128]! + subs r5, r5, #4 + // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 
+ vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vzip.8 q0, q1 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vtbl.8 d1, {q14}, d1 + vst1.16 {d0}, [r0, :64], r1 + vtbl.8 d2, {q14}, d2 + vst1.16 {d1}, [r2, :64], r1 + vtbl.8 d3, {q14}, d3 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r2, :64], r1 + bgt 4b + pop {r4-r5, pc} +80: + lsl r1, r1, #1 +8: + vld1.8 {q1, q2}, [r3, :128]! + subs r5, r5, #4 + // Prefer doing the adds twice, instead of chaining a vmov after + // the add. + vadd.i8 q0, q1, q1 + vadd.i8 q1, q1, q1 + vadd.i8 q3, q2, q2 + vadd.i8 q2, q2, q2 + vzip.8 q0, q1 + vzip.8 q2, q3 + vadd.i16 q0, q0, q15 + vadd.i16 q1, q1, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q2, q2, q15 + vtbl.8 d1, {q14}, d1 + vadd.i16 q3, q3, q15 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vst1.16 {q0}, [r0, :128], r1 + vtbl.8 d6, {q14}, d6 + vst1.16 {q1}, [r2, :128], r1 + vtbl.8 d7, {q14}, d7 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r2, :128], r1 + bgt 8b + pop {r4-r5, pc} +160: + lsl r1, r1, #1 +16: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #4 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128], r1 + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r2, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128], r1 + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 16b + pop {r4-r5, pc} +320: + lsl r1, r1, #1 + sub r1, r1, #32 +32: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #2 + vld1.8 {q10, q11}, [r3, :128]! + vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128], r1 + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r2, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r2, :128], r1 + bgt 32b + pop {r4-r5, pc} +640: + sub r1, r1, #96 +64: + vld1.8 {q2, q3}, [r3, :128]! + subs r5, r5, #1 + vld1.8 {q10, q11}, [r3, :128]! 
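+        // Same index expansion as the narrower cases: each palette index becomes byte offsets 2*i and 2*i+1 so vtbl.8 can gather 16-bit entries from q14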
+ vadd.i8 q0, q2, q2 + vadd.i8 q1, q2, q2 + vadd.i8 q2, q3, q3 + vadd.i8 q3, q3, q3 + vadd.i8 q8, q10, q10 + vadd.i8 q9, q10, q10 + vadd.i8 q10, q11, q11 + vzip.8 q0, q1 + vadd.i8 q11, q11, q11 + vzip.8 q2, q3 + vzip.8 q8, q9 + vadd.i16 q0, q0, q15 + vzip.8 q10, q11 + vadd.i16 q1, q1, q15 + vadd.i16 q2, q2, q15 + vadd.i16 q3, q3, q15 + vadd.i16 q8, q8, q15 + vadd.i16 q9, q9, q15 + vadd.i16 q10, q10, q15 + vtbl.8 d0, {q14}, d0 + vadd.i16 q11, q11, q15 + vtbl.8 d1, {q14}, d1 + vtbl.8 d2, {q14}, d2 + vtbl.8 d3, {q14}, d3 + vtbl.8 d4, {q14}, d4 + vtbl.8 d5, {q14}, d5 + vtbl.8 d6, {q14}, d6 + vtbl.8 d7, {q14}, d7 + vtbl.8 d16, {q14}, d16 + vtbl.8 d17, {q14}, d17 + vtbl.8 d18, {q14}, d18 + vst1.16 {q0, q1}, [r0, :128]! + vtbl.8 d19, {q14}, d19 + vtbl.8 d20, {q14}, d20 + vst1.16 {q2, q3}, [r0, :128]! + vtbl.8 d21, {q14}, d21 + vtbl.8 d22, {q14}, d22 + vst1.16 {q8, q9}, [r0, :128]! + vtbl.8 d23, {q14}, d23 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 64b + pop {r4-r5, pc} +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_128_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vrshr.u16 q0, q15, #1 + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB + +L(ipred_cfl_splat_w4): + vld1.16 {q8, q9}, [r5, :128]! + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {d4}, [r0, :64], r1 + vst1.16 {d5}, [r6, :64], r1 + subs r4, r4, #4 + vst1.16 {d6}, [r0, :64], r1 + vst1.16 {d7}, [r6, :64], r1 + bgt L(ipred_cfl_splat_w4) + pop {r4-r8, pc} +L(ipred_cfl_splat_w8): + vld1.16 {q8, q9}, [r5, :128]! 
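+        // Two rows of 8 AC values per iteration: diff = ac * alpha, apply the sign-corrected rounding shift, then add the DC and clamp to [0, bitdepth_max]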
+ subs r4, r4, #2 + vmull.s16 q2, d16, d2 // diff = ac * alpha + vmull.s16 q3, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q8, #31 + vshr.s32 q13, q9, #31 + vadd.i32 q2, q2, q10 // diff + sign + vadd.i32 q3, q3, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q8, #6 + vrshrn.i32 d7, q9, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vadd.i16 q3, q3, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r6, :128], r1 + bgt L(ipred_cfl_splat_w8) + pop {r4-r8, pc} +L(ipred_cfl_splat_w16): + vpush {q4-q7} + add r12, r5, r3, lsl #1 + sub r1, r1, r3, lsl #1 + mov lr, r3 +1: + vld1.16 {q6, q7}, [r5, :128]! + vmull.s16 q2, d12, d2 // diff = ac * alpha + vld1.16 {q8, q9}, [r12, :128]! + vmull.s16 q3, d13, d3 + vmull.s16 q4, d14, d2 + vmull.s16 q5, d15, d3 + vmull.s16 q6, d16, d2 + vmull.s16 q7, d17, d3 + vmull.s16 q8, d18, d2 + vmull.s16 q9, d19, d3 + vshr.s32 q10, q2, #31 // sign = diff >> 15 + vshr.s32 q11, q3, #31 + vshr.s32 q12, q4, #31 + vshr.s32 q13, q5, #31 + vadd.i32 q2, q2, q10 // diff + sign + vshr.s32 q10, q6, #31 + vadd.i32 q3, q3, q11 + vshr.s32 q11, q7, #31 + vadd.i32 q4, q4, q12 + vshr.s32 q12, q8, #31 + vadd.i32 q5, q5, q13 + vshr.s32 q13, q9, #31 + vadd.i32 q6, q6, q10 + vadd.i32 q7, q7, q11 + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() + vrshrn.i32 d5, q3, #6 + vrshrn.i32 d6, q4, #6 + vrshrn.i32 d7, q5, #6 + vadd.i16 q2, q2, q0 // dc + apply_sign() + vrshrn.i32 d8, q6, #6 + vrshrn.i32 d9, q7, #6 + vadd.i16 q3, q3, q0 + vrshrn.i32 d10, q8, #6 + vrshrn.i32 d11, q9, #6 + vadd.i16 q4, q4, q0 + vadd.i16 q5, q5, q0 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q4, q4, q14 + vmax.s16 q5, q5, q14 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q4, q4, q15 + vmin.s16 q5, q5, q15 + subs r3, r3, #16 + vst1.16 {q2, q3}, [r0, :128]! + vst1.16 {q4, q5}, [r6, :128]! 
+ bgt 1b + subs r4, r4, #2 + add r5, r5, lr, lsl #1 + add r12, r12, lr, lsl #1 + add r0, r0, r1 + add r6, r6, r1 + mov r3, lr + bgt 1b + vpop {q4-q7} + pop {r4-r8, pc} +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + clz lr, r3 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_top_tbl) + sub lr, lr, #26 + ldr lr, [r12, lr, lsl #2] + vdup.16 q1, r6 // alpha + add r2, r2, #2 + add r12, r12, lr + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r12 + + .align 2 +L(ipred_cfl_top_tbl): + .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB + +4: + vld1.16 {d0}, [r2] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) +8: + vld1.16 {q0}, [r2] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) +16: + vld1.16 {q2, q3}, [r2] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +32: + vld1.16 {q8, q9}, [r2]! + vld1.16 {q10, q11}, [r2] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + clz lr, r3 + clz r8, r4 + vdup.16 q15, r7 // bitdepth_max + adr r12, L(ipred_cfl_splat_tbl) + adr r7, L(ipred_cfl_left_tbl) + sub lr, lr, #26 + sub r8, r8, #26 + ldr lr, [r12, lr, lsl #2] + ldr r8, [r7, r8, lsl #2] + vdup.16 q1, r6 // alpha + add r12, r12, lr + add r7, r7, r8 + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_left_tbl): + .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB + +L(ipred_cfl_left_h4): + vld1.16 {d0}, [r2, :64] + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #2 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h8): + vld1.16 {q0}, [r2, :128] + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #3 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h16): + vld1.16 {q2, q3}, [r2, :128] + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpadd.i16 d0, d0, d0 + vrshr.u16 d0, d0, #4 + vdup.16 q0, d0[0] + bx r12 + +L(ipred_cfl_left_h32): + vld1.16 {q8, q9}, [r2, :128]! 
+ vld1.16 {q10, q11}, [r2, :128] + vadd.i16 q8, q8, q9 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q8, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + vpaddl.u16 d0, d0 + vrshrn.i32 d0, q0, #5 + vdup.16 q0, d0[0] + bx r12 +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + push {r4-r8, lr} + ldrd r4, r5, [sp, #24] + ldrd r6, r7, [sp, #32] + sub r2, r2, r4, lsl #1 + add r8, r3, r4 // width + height + vdup.16 q1, r6 // alpha + clz lr, r3 + clz r6, r4 + vdup.32 d16, r8 // width + height + vdup.16 q15, r7 // bitdepth_max + adr r7, L(ipred_cfl_tbl) + rbit r8, r8 // rbit(width + height) + sub lr, lr, #22 // 26 leading bits, minus table offset 4 + sub r6, r6, #26 + clz r8, r8 // ctz(width + height) + ldr lr, [r7, lr, lsl #2] + ldr r6, [r7, r6, lsl #2] + neg r8, r8 // -ctz(width + height) + add r12, r7, lr + add r7, r7, r6 + vshr.u32 d16, d16, #1 // (width + height) >> 1 + vdup.32 d17, r8 // -ctz(width + height) + add r6, r0, r1 + lsl r1, r1, #1 + vmov.i16 q14, #0 + bx r7 + + .align 2 +L(ipred_cfl_tbl): + .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB + .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB + +L(ipred_cfl_h4): + vld1.16 {d0}, [r2, :64]! + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w4): + vld1.16 {d1}, [r2] + vadd.i32 d0, d0, d16 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #4 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16 + cmp r4, #16 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + vld1.16 {q0}, [r2, :128]! + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w8): + vld1.16 {q2}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #8 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/16/32 + cmp r4, #32 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + vld1.16 {q2, q3}, [r2, :128]! + vadd.i16 q0, q2, q3 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w16): + vld1.16 {q2, q3}, [r2] + vadd.i32 d0, d0, d16 + vadd.i16 q2, q2, q3 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #16 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 4/8/32/64 + tst r4, #(32+16+8) // 16 added to make a consecutive bitmask + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + vld1.16 {q2, q3}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128]! 
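+        // Sum the 32 left-edge pixels into d0, skip the top-left sample, then bx r12 to the width handler, which adds the top-row sum and forms the DC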
+ vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q0, q2, q10 + vadd.i16 d0, d0, d1 + vpadd.i16 d0, d0, d0 + add r2, r2, #2 + vpaddl.u16 d0, d0 + bx r12 +L(ipred_cfl_w32): + vld1.16 {q2, q3}, [r2]! + vadd.i32 d0, d0, d16 + vld1.16 {q10, q11}, [r2]! + vadd.i16 q2, q2, q3 + vadd.i16 q10, q10, q11 + vadd.i16 q2, q2, q10 + vadd.i16 d1, d4, d5 + vpadd.i16 d1, d1, d1 + vpaddl.u16 d1, d1 + cmp r4, #32 + vadd.i32 d0, d0, d1 + vshl.u32 d0, d0, d17 + beq 1f + // h = 8/16/64 + cmp r4, #8 + movw lr, #0x6667 + movw r8, #0xAAAB + it ne + movne lr, r8 + vdup.32 d18, lr + vmul.i32 d0, d0, d18 + vshr.u32 d0, d0, #17 +1: + vdup.16 q0, d0[0] + b L(ipred_cfl_splat_w16) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_420_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i32 q8, #0 + vmov.i32 q9, #0 + vmov.i32 q10, #0 + vmov.i32 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_420_tbl): + .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + subs r8, r8, #2 + vst1.16 {q0}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + bgt 1b + cmp r4, #0 + vmov d0, d1 + vmov d2, d1 + vmov d3, d1 +L(ipred_cfl_ac_420_w4_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + vadd.i32 q8, q8, q9 + vadd.i32 q10, q10, q11 + vadd.i32 q0, q8, q10 + vadd.i32 d0, d0, d1 + vpadd.i32 d0, d0, d0 // sum + sub r0, r0, r6, lsl #3 + vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz + vdup.16 q8, d16[0] +6: // Subtract dc from ac + vld1.16 {q0, q1}, [r0, :128] + subs r6, r6, #4 + vsub.i16 q0, q0, q8 + vsub.i16 q1, q1, q8 + vst1.16 {q0, q1}, [r0, :128]! + bgt 6b + pop {r4-r8, pc} + +L(ipred_cfl_ac_420_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vadd.i16 q0, q0, q2 + vadd.i16 q1, q1, q3 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q12, q12, q2 + vadd.i16 q13, q13, q3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! 
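+        // Widen and accumulate the AC samples just stored; the running totals in q8-q11 feed the final mean subtraction in calc_subtract_dc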
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vadd.i16 q0, q0, q1 + vadd.i16 q2, q2, q3 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #1 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov q0, q1 + +L(ipred_cfl_ac_420_w8_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl r6, r6, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr r7, L(ipred_cfl_ac_420_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_420_w16_tbl): + .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_420_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vadd.i16 q2, q2, q12 + vadd.i16 q3, q3, q13 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #1 + vshl.i16 q1, q1, #1 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q12, q13}, [r12, :128]! + vld1.16 {q2}, [r1, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vadd.i16 q2, q2, q12 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vshl.i16 q0, q0, #1 + vshl.i16 d2, d2, #1 + subs r8, r8, #1 + vdup.16 d3, d2[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q12, q13}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vadd.i16 q1, q1, q13 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vshl.i16 q0, q0, #1 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! 
+ vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q12}, [r12, :128], r2 + vadd.i16 q0, q0, q12 + vpadd.i16 d0, d0, d1 + vshl.i16 d0, d0, #1 + subs r8, r8, #1 + vdup.16 q1, d0[3] + vdup.16 d1, d0[3] + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl r6, r6, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc + +// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_422_tbl) + sub r8, r8, #27 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_422_tbl): + .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cmp r3, #0 + bne L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d4, d5 + vpadd.i16 d3, d6, d7 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d26, d27 + vpadd.i16 d26, d4, d5 + vpadd.i16 d27, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q12, #2 + vshl.i16 q3, q13, #2 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d24, d24, d25 + vpadd.i16 d25, d4, d5 + vshl.i16 q0, q0, #2 + vshl.i16 q12, q12, #2 + vdup.16 d7, d25[3] + vmov d6, d25 + vdup.16 d5, d24[3] + vmov d4, d24 + vdup.16 d3, d1[3] + vmov d2, d1 + vdup.16 d1, d0[3] + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr r7, L(ipred_cfl_ac_422_w16_tbl) + ldr r3, [r7, r3, lsl #2] + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_422_w16_tbl): + .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_422_w16_wpad0): + sub r2, r2, #32 +1: // Copy and subsample input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12, q13}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vpadd.i16 d3, d26, d27 + vld1.16 {q12, q13}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vpadd.i16 d7, d26, d27 + vshl.i16 q0, q0, #2 + vshl.i16 q1, q1, #2 + vshl.i16 q2, q2, #2 + vshl.i16 q3, q3, #2 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): + sub r2, r2, #32 +1: // Copy and subsample input, padding 4 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r12, :128]! + vld1.16 {q12}, [r1, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d2, d24, d25 + vld1.16 {q12}, [r12, :128], r2 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d6, d24, d25 + vshl.i16 q0, q0, #2 + vshl.i16 d2, d2, #2 + vshl.i16 q2, q2, #2 + vshl.i16 d6, d6, #2 + vdup.16 d3, d2[3] + vdup.16 d7, d6[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vshl.i16 q0, q0, #2 + vshl.i16 q2, q2, #2 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d4, d5 + vshl.i16 q0, q0, #2 + vdup.16 q3, d1[3] + vdup.16 q1, d0[3] + vdup.16 d5, d1[3] + vmov d4, d1 + vdup.16 d1, d0[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) +endfunc + +// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_444_16bpc_neon, export=1 + push {r4-r8,lr} + ldrd r4, r5, [sp, #24] + ldr r6, [sp, #32] + clz r8, r5 + lsl r4, r4, #2 + adr r7, L(ipred_cfl_ac_444_tbl) + sub r8, r8, #26 + ldr r8, [r7, r8, lsl #2] + vmov.i16 q8, #0 + vmov.i16 q9, #0 + vmov.i16 q10, #0 + vmov.i16 q11, #0 + add r7, r7, r8 + sub r8, r6, r4 // height - h_pad + rbit lr, r5 // rbit(width) + rbit r12, r6 // rbit(height) + clz lr, lr // ctz(width) + clz r12, r12 // ctz(height) + add lr, lr, r12 // log2sz + add r12, r1, r2 + vdup.32 d31, lr + lsl r2, r2, #1 + vneg.s32 d31, d31 // -log2sz + bx r7 + + .align 2 +L(ipred_cfl_ac_444_tbl): + .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w4): +1: // Copy and expand input + vld1.16 {d0}, [r1, :64], r2 + vld1.16 {d1}, [r12, :64], r2 + vld1.16 {d2}, [r1, :64], r2 + vld1.16 {d3}, [r12, :64], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + bgt 1b + cmp r4, #0 + vmov d0, d3 + vmov d1, d3 + vmov d2, d3 + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_444_w8): +1: // Copy and expand input + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q1}, [r12, :128], r2 + vld1.16 {q2}, [r1, :128], r2 + vld1.16 {q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #4 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q3 + vmov q1, q3 + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_444_w16): + cmp r3, #0 + bne L(ipred_cfl_ac_444_w16_wpad) +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128], r2 + vld1.16 {q2, q3}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w16_wpad): +1: // Copy and expand input, padding 8 + vld1.16 {q0}, [r1, :128], r2 + vld1.16 {q2}, [r12, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q2, q2, #3 + vdup.16 q1, d1[3] + vdup.16 q3, d5[3] + subs r8, r8, #2 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + vmov q0, q2 + vmov q1, q3 + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_444_w32): + adr r7, L(ipred_cfl_ac_444_w32_tbl) + ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 + asr r2, r2, #1 + add r7, r7, r3 + bx r7 + + .align 2 +L(ipred_cfl_ac_444_w32_tbl): + .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB + +L(ipred_cfl_ac_444_w32_wpad0): + sub r2, r2, #32 +1: // Copy and expand input, without padding + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2, q3}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + vshl.i16 q3, q3, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad2): + sub r2, r2, #32 +1: // Copy and expand input, padding 8 + vld1.16 {q0, q1}, [r1, :128]! + vld1.16 {q2}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + vshl.i16 q2, q2, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q3, d5[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad4): +1: // Copy and expand input, padding 16 + vld1.16 {q0, q1}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + vshl.i16 q1, q1, #3 + subs r8, r8, #1 + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d3[3] + vdup.16 q3, d3[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! 
+ vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + b L(ipred_cfl_ac_444_w32_hpad) + +L(ipred_cfl_ac_444_w32_wpad6): +1: // Copy and expand input, padding 24 + vld1.16 {q0}, [r1, :128], r2 + vshl.i16 q0, q0, #3 + subs r8, r8, #1 + vdup.16 q1, d1[3] + vst1.16 {q0, q1}, [r0, :128]! + vdup.16 q2, d1[3] + vdup.16 q3, d1[3] + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 1b + cmp r4, #0 + +L(ipred_cfl_ac_444_w32_hpad): + beq 3f // This assumes that all callers already did "cmp r4, #0" +2: // Vertical padding (h_pad > 0) + subs r4, r4, #1 + vst1.16 {q0, q1}, [r0, :128]! + vaddw.u16 q8, q8, d0 + vaddw.u16 q9, q9, d1 + vaddw.u16 q10, q10, d2 + vaddw.u16 q11, q11, d3 + vst1.16 {q2, q3}, [r0, :128]! + vaddw.u16 q8, q8, d4 + vaddw.u16 q9, q9, d5 + vaddw.u16 q10, q10, d6 + vaddw.u16 q11, q11, d7 + bgt 2b +3: + + // Multiply the height by eight and reuse the w4 subtracting + lsl r6, r6, #3 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) +endfunc From 7b7fac6f213662b5a1d16c4d7c3e202b1a8be24f Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Sat, 16 Jan 2021 09:48:09 -0500 Subject: [PATCH 107/155] arm64: cdef_dir: Preload rows to prevent stalling Before: Cortex A53 A55 A72 A73 cdef_dir_8bpc_neon: 400.0 391.2 269.7 282.9 cdef_dir_16bpc_neon: 417.7 413.0 303.8 313.6 After: Cortex A55 cdef_dir_8bpc_neon: 369.0 360.2 248.4 273.4 cdef_dir_16bpc_neon: 388.7 384.0 272.2 290.7 --- src/arm/64/cdef_tmpl.S | 85 ++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/src/arm/64/cdef_tmpl.S b/src/arm/64/cdef_tmpl.S index 87c6388b4c..cbdb6deb16 100644 --- a/src/arm/64/cdef_tmpl.S +++ b/src/arm/64/cdef_tmpl.S @@ -311,6 +311,30 @@ endconst .endif .endm +// Steps for loading and preparing each row +.macro dir_load_step1 s1, bpc +.if \bpc == 8 + ld1 {\s1\().8b}, [x0], x1 +.else + ld1 {\s1\().8h}, [x0], x1 +.endif +.endm + +.macro dir_load_step2 s1, bpc +.if \bpc == 8 + usubl \s1\().8h, \s1\().8b, v31.8b +.else + ushl \s1\().8h, \s1\().8h, v8.8h +.endif +.endm + +.macro dir_load_step3 s1, bpc +// Nothing for \bpc == 8 +.if \bpc != 8 + sub \s1\().8h, \s1\().8h, v31.8h +.endif +.endm + // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc @@ -333,21 +357,15 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1 movi v3.8h, #0 // v2-v3 sum_diag[1] movi v5.8h, #0 // v4-v5 sum_hv[0-1] movi v7.8h, #0 // v6-v7 sum_alt[0] + dir_load_step1 v26, \bpc // Setup first row early movi v17.8h, #0 // v16-v17 sum_alt[1] movi v18.8h, #0 // v18-v19 sum_alt[2] + dir_load_step2 v26, \bpc movi v19.8h, #0 + dir_load_step3 v26, \bpc movi v21.8h, #0 // v20-v21 sum_alt[3] .irpc i, 01234567 -.if \bpc == 8 - ld1 {v26.8b}, [x0], x1 - usubl v26.8h, v26.8b, v31.8b -.else - ld1 {v26.8h}, [x0], x1 - ushl v26.8h, v26.8h, v8.8h - sub v26.8h, v26.8h, v31.8h -.endif - addv h25, v26.8h // [y] rev64 v27.8h, v26.8h addp v28.8h, v26.8h, v30.8h // [(x >> 1)] @@ -355,48 +373,59 @@ function cdef_find_dir_\bpc\()bpc_neon, export=1 ext v27.16b, v27.16b, v27.16b, #8 // [-x] rev64 v29.4h, v28.4h // [-(x >> 1)] ins v4.h[\i], v25.h[0] // sum_hv[0] - +.if \i < 6 + ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) + ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) + add v18.8h, v18.8h, v22.8h // sum_alt[2] + add 
v19.4h, v19.4h, v23.4h // sum_alt[2] +.else + add v18.8h, v18.8h, v26.8h // sum_alt[2] +.endif +.if \i == 0 + mov v20.16b, v26.16b // sum_alt[3] +.elseif \i == 1 + add v20.8h, v20.8h, v26.8h // sum_alt[3] +.else + ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) + ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) + add v20.8h, v20.8h, v24.8h // sum_alt[3] + add v21.4h, v21.4h, v25.4h // sum_alt[3] +.endif .if \i == 0 mov v0.16b, v26.16b // sum_diag[0] + dir_load_step1 v26, \bpc mov v2.16b, v27.16b // sum_diag[1] + dir_load_step2 v26, \bpc mov v6.16b, v28.16b // sum_alt[0] + dir_load_step3 v26, \bpc mov v16.16b, v29.16b // sum_alt[1] .else ext v22.16b, v30.16b, v26.16b, #(16-2*\i) ext v23.16b, v26.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v27.16b, #(16-2*\i) ext v25.16b, v27.16b, v30.16b, #(16-2*\i) +.if \i != 7 // Nothing to load for the final row + dir_load_step1 v26, \bpc // Start setting up the next row early. +.endif add v0.8h, v0.8h, v22.8h // sum_diag[0] add v1.8h, v1.8h, v23.8h // sum_diag[0] add v2.8h, v2.8h, v24.8h // sum_diag[1] add v3.8h, v3.8h, v25.8h // sum_diag[1] +.if \i != 7 + dir_load_step2 v26, \bpc +.endif ext v22.16b, v30.16b, v28.16b, #(16-2*\i) ext v23.16b, v28.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v29.16b, #(16-2*\i) ext v25.16b, v29.16b, v30.16b, #(16-2*\i) +.if \i != 7 + dir_load_step3 v26, \bpc +.endif add v6.8h, v6.8h, v22.8h // sum_alt[0] add v7.4h, v7.4h, v23.4h // sum_alt[0] add v16.8h, v16.8h, v24.8h // sum_alt[1] add v17.4h, v17.4h, v25.4h // sum_alt[1] .endif -.if \i < 6 - ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) - ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) - add v18.8h, v18.8h, v22.8h // sum_alt[2] - add v19.4h, v19.4h, v23.4h // sum_alt[2] -.else - add v18.8h, v18.8h, v26.8h // sum_alt[2] -.endif -.if \i == 0 - mov v20.16b, v26.16b // sum_alt[3] -.elseif \i == 1 - add v20.8h, v20.8h, v26.8h // sum_alt[3] -.else - ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) - ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) - add v20.8h, v20.8h, v24.8h // sum_alt[3] - add v21.4h, v21.4h, v25.4h // sum_alt[3] -.endif .endr movi v31.4s, #105 From 2d2121131db629a216b280620f9eaa4ca8135520 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Sat, 28 Nov 2020 12:16:20 -0500 Subject: [PATCH 108/155] AVX2: Swap shuffles with zen 2/3 friendly equivalents On zen 2 and 3, vpermq is slower than vperm2i128. In some assembly, we use the former to swap lanes of a vector when we could be using the latter. On zen 1, the most expensive instruction is swapped, so this patch will be slower on them. On current intel cpus, these instructions are equally expensive, so there should be no impact there. 
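
The two forms are interchangeable whenever both 128-bit lanes of the permute
are taken from the same source register, which is the case in every shuffle
touched here. As a minimal standalone sketch (plain C with the standard AVX2
intrinsics from immintrin.h, not code from this repository; it assumes the
x86inc q1032 shuffle constant expands to the immediate 0x4E), the lane swap
can be expressed either way:

    // Build with: cc -O2 -mavx2 laneswap.c
    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        // One 256-bit register holding [1,2] in the low 128-bit lane
        // and [3,4] in the high lane (element 0 is lowest).
        __m256i x = _mm256_set_epi64x(4, 3, 2, 1);

        // vpermq x, x, q1032 (imm 0x4E): reorder the four 64-bit words
        // so the two 128-bit halves trade places.
        __m256i a = _mm256_permute4x64_epi64(x, 0x4E);

        // vperm2i128 x, x, x, 0x01: low half takes the source's high
        // lane, high half takes the source's low lane -- the same swap.
        __m256i b = _mm256_permute2x128_si256(x, x, 0x01);

        long long va[4], vb[4];
        _mm256_storeu_si256((__m256i *)va, a);
        _mm256_storeu_si256((__m256i *)vb, b);
        printf("vpermq:     %lld %lld %lld %lld\n", va[0], va[1], va[2], va[3]);
        printf("vperm2i128: %lld %lld %lld %lld\n", vb[0], vb[1], vb[2], vb[3]);
        return memcmp(va, vb, sizeof(va)) != 0;  // 0 when the results match
    }

Both variants should compile to a single cross-lane shuffle producing
[3,4,1,2]; the point of the change is only that the vperm2i128 encoding is
the cheaper of the two on Zen 2/3 while costing the same on current Intel
cores.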
--- src/x86/cdef_avx2.asm | 8 ++++---- src/x86/ipred.asm | 4 ++-- src/x86/mc_avx2.asm | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm index 672fae7a51..685a1274a1 100644 --- a/src/x86/cdef_avx2.asm +++ b/src/x86/cdef_avx2.asm @@ -1622,10 +1622,10 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 psubw m3, m8 ; shuffle registers to generate partial_sum_diag[0-1] together - vpermq m7, m0, q1032 - vpermq m6, m1, q1032 - vpermq m5, m2, q1032 - vpermq m4, m3, q1032 + vperm2i128 m7, m0, m0, 0x01 + vperm2i128 m6, m1, m1, 0x01 + vperm2i128 m5, m2, m2, 0x01 + vperm2i128 m4, m3, m3, 0x01 ; start with partial_sum_hv[0-1] paddw m8, m0, m1 diff --git a/src/x86/ipred.asm b/src/x86/ipred.asm index 20fd89dc12..6838110d33 100644 --- a/src/x86/ipred.asm +++ b/src/x86/ipred.asm @@ -1170,7 +1170,7 @@ ALIGN function_align mova m9, [base+ipred_v_shuf] vbroadcasti128 m6, [base+smooth_weights+16*2] vbroadcasti128 m7, [base+smooth_weights+16*3] - vpermq m8, m9, q1032 + vperm2i128 m8, m9, m9, 0x01 paddw m0, m10, m3 paddw m3, m11 paddw m12, m0 @@ -4197,7 +4197,7 @@ ALIGN function_align pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 - vpermq m%3, m%1, q1032 + vperm2i128 m%3, m%1, m%1, 0x01 packuswb m%1, m%3 %endmacro diff --git a/src/x86/mc_avx2.asm b/src/x86/mc_avx2.asm index dda8234f13..6a1ab0570f 100644 --- a/src/x86/mc_avx2.asm +++ b/src/x86/mc_avx2.asm @@ -3825,9 +3825,8 @@ cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy pblendw m6, m7, 0xaa ; 67 89 pmulhrsw m6, m12 paddd m4, m5 - vpblendd m0, m1, m6, 0x0f + vperm2i128 m0, m1, m6, 0x21 ; 45 67 mova m1, m6 - vpermq m0, m0, q1032 ; 45 67 pmaddwd m6, m0, m10 pmaddwd m7, m1, m11 paddd m4, m13 From 0d7eca65544ddd8e2b8162286d38f1972c8ea0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sun, 10 Jan 2021 09:46:26 +0200 Subject: [PATCH 109/155] arm: looprestoration: Simplify dup'ing the padding pixel --- src/arm/32/looprestoration.S | 6 ++---- src/arm/64/looprestoration.S | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index 105a32e860..a8a2b6bfad 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -131,10 +131,8 @@ function wiener_filter_h_8bpc_neon, export=1 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel - vdup.8 d24, r11 - vdup.8 d26, r9 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 + vdup.16 d24, r11 + vdup.16 d26, r9 3: // !LR_HAVE_RIGHT // If we'll have to pad the right edge we need to quit early here. cmp r5, #11 diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index 21e7804eae..5d025153ef 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -124,10 +124,8 @@ function wiener_filter_h_8bpc_neon, export=1 ldr b28, [x2, w9, sxtw] ldr b29, [x13, w9, sxtw] // Fill v28/v29 with the right padding pixel - dup v28.8b, v28.b[0] - dup v29.8b, v29.b[0] - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b + dup v28.8h, v28.h[0] + dup v29.8h, v29.h[0] 3: // !LR_HAVE_RIGHT // If we'll have to pad the right edge we need to quit early here. 
cmp w5, #11 From 2f5ee20f7576d4e2a9da40d6b64ced1394defd55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 5 Jan 2021 23:40:29 +0200 Subject: [PATCH 110/155] arm: looprestoration: Simplify right edge padding in horz filters Use a variable mask for inserting padding, instead of fixed code paths for different padding widths. This allows simplifying the filtering logic to simply always process 8 pixels at a time. Also improve scheduling of the loop subtract instruction in all these cases. --- src/arm/32/looprestoration.S | 522 +++++++------------------------- src/arm/32/looprestoration16.S | 477 +++++------------------------ src/arm/64/looprestoration.S | 528 +++++++++------------------------ src/arm/64/looprestoration16.S | 487 +++++++----------------------- 4 files changed, 435 insertions(+), 1579 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index a8a2b6bfad..a9a5ccdce7 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -28,6 +28,18 @@ #include "src/arm/asm.S" #include "util.S" +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[8], intptr_t w, @@ -47,27 +59,19 @@ function wiener_filter_h_8bpc_neon, export=1 bic r10, r10, #7 lsl r10, r10, #1 - // Clear the last unused element of q0, to allow filtering a single - // pixel with one plain vmul+vpadd. - mov r12, #0 - vmov.16 d1[3], r12 - // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 - // Subtract the width from mid_stride - sub r10, r10, r5, lsl #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp r5, #8 - add r11, r5, #13 + // Subtract the aligned width from mid_stride + add r11, r5, #7 bic r11, r11, #7 - bge 1f - mov r11, #16 -1: + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 sub r3, r3, r11 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -131,15 +135,28 @@ function wiener_filter_h_8bpc_neon, export=1 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel - vdup.16 d24, r11 - vdup.16 d26, r9 + vdup.16 q12, r11 + vdup.16 q13, r9 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can filter 4 pixels - b 6f + + // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
+ movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q10, q11}, [r4] + + vbit q1, q12, q10 + vbit q2, q12, q11 + vbit q8, q13, q10 + vbit q9, q13, q11 4: // Loop horizontally // This is tuned as some sort of compromise between Cortex A7, A8, @@ -184,10 +201,10 @@ function wiener_filter_h_8bpc_neon, export=1 vshr.s16 q10, q10, #3 vadd.s16 q3, q3, q15 vadd.s16 q10, q10, q15 + subs r5, r5, #8 vst1.16 {q3}, [r0, :128]! vst1.16 {q10}, [r12, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q1, q2 @@ -199,145 +216,6 @@ function wiener_filter_h_8bpc_neon, export=1 bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 -.macro filter_4 - vext.8 d20, d2, d3, #2 - vext.8 d21, d2, d3, #4 - vext.8 d22, d2, d3, #6 - vext.8 d23, d3, d4, #2 - vext.8 d8, d3, d4, #4 - vmul.s16 d6, d2, d0[0] - vmla.s16 d6, d20, d0[1] - vmla.s16 d6, d21, d0[2] - vmla.s16 d6, d22, d0[3] - vmla.s16 d6, d3, d1[0] - vmla.s16 d6, d23, d1[1] - vmla.s16 d6, d8, d1[2] - - vext.8 d20, d16, d17, #2 - vext.8 d21, d16, d17, #4 - vext.8 d22, d16, d17, #6 - vext.8 d23, d17, d18, #2 - vext.8 d8, d17, d18, #4 - vmul.s16 d7, d16, d0[0] - vmla.s16 d7, d20, d0[1] - vmla.s16 d7, d21, d0[2] - vmla.s16 d7, d22, d0[3] - vmla.s16 d7, d17, d1[0] - vmla.s16 d7, d23, d1[1] - vmla.s16 d7, d8, d1[2] - - vext.8 d22, d2, d3, #6 - vext.8 d23, d16, d17, #6 - vshl.s16 q11, q11, #7 - vsub.s16 q11, q11, q14 - vqadd.s16 q3, q3, q11 - vshr.s16 q3, q3, #3 - vadd.s16 q3, q3, q15 -.endm - filter_4 - vst1.16 {d6}, [r0, :64]! - vst1.16 {d7}, [r12, :64]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q1, q1, q2, #8 - vext.8 q2, q2, q2, #8 - vext.8 q8, q8, q9, #8 - vext.8 q9, q9, q9, #8 - -6: // Pad the right edge and filter the last few pixels. - // w < 7, w+3 pixels valid in q1-q2 - cmp r5, #5 - blt 7f - bgt 8f - // w == 5, 8 pixels valid in q1, q2 invalid - vmov q2, q12 - vmov q9, q13 - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in q1 - sub r9, r5, #1 - // r9 = (pixels valid - 4) - adr r11, L(variable_shift_tbl) - ldr r9, [r11, r9, lsl #2] - add r11, r11, r9 - vmov q2, q12 - vmov q9, q13 - bx r11 - - .align 2 -L(variable_shift_tbl): - .word 44f - L(variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(variable_shift_tbl) + CONFIG_THUMB - -44: // 4 pixels valid in d2/d16, fill d3/d17 with padding. - vmov d3, d4 - vmov d17, d18 - b 88f - // Shift q1 right, shifting out invalid pixels, - // shift q1 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - vext.8 q1, q1, q1, #10 - vext.8 q1, q1, q2, #6 - vext.8 q8, q8, q8, #10 - vext.8 q8, q8, q9, #6 - b 88f -66: // 6 pixels valid - vext.8 q1, q1, q1, #12 - vext.8 q1, q1, q2, #4 - vext.8 q8, q8, q8, #12 - vext.8 q8, q8, q9, #4 - b 88f -77: // 7 pixels valid - vext.8 q1, q1, q1, #14 - vext.8 q1, q1, q2, #2 - vext.8 q8, q8, q8, #14 - vext.8 q8, q8, q9, #2 - b 88f - -8: // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2 - vext.8 q2, q2, q2, #2 - vext.8 q2, q2, q12, #14 - vext.8 q9, q9, q9, #2 - vext.8 q9, q9, q13, #14 - -88: - // w < 7, q1-q2 padded properly - cmp r5, #4 - blt 888f - - // w >= 4, filter 4 pixels - filter_4 - vst1.16 {d6}, [r0, :64]! - vst1.16 {d7}, [r12, :64]! 
- subs r5, r5, #4 // 0 <= w < 4 - vext.8 q1, q1, q2, #8 - vext.8 q8, q8, q9, #8 - beq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - vmul.s16 q3, q1, q0 - vmul.s16 q10, q8, q0 - vpadd.s16 d6, d6, d7 - vpadd.s16 d7, d20, d21 - vdup.16 d24, d2[3] - vpadd.s16 d6, d6, d7 - vdup.16 d25, d16[3] - vpadd.s16 d6, d6, d6 - vtrn.16 d24, d25 - vshl.s16 d24, d24, #7 - vsub.s16 d24, d24, d28 - vqadd.s16 d6, d6, d24 - vshr.s16 d6, d6, #3 - vadd.s16 d6, d6, d30 - vst1.s16 {d6[0]}, [r0, :16]! - vst1.s16 {d6[1]}, [r12, :16]! - subs r5, r5, #1 - vext.8 q1, q1, q2, #2 - vext.8 q8, q8, q9, #2 - bgt 888b - 9: subs r6, r6, #2 ble 0f @@ -351,7 +229,6 @@ L(variable_shift_tbl): 0: vpop {q4} pop {r4-r11,pc} -.purgem filter_4 endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, @@ -555,25 +432,15 @@ function sgr_box3_h_8bpc_neon, export=1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 -1: sub r9, r9, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Subtract the number of pixels read from the input from the stride - add lr, r5, #14 - bic lr, lr, #7 + add lr, lr, #8 sub r4, r4, lr // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -642,34 +509,30 @@ function sgr_box3_h_8bpc_neon, export=1 // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #10 bge 4f // If w >= 10, all used input pixels are valid - cmp r5, #6 - bge 5f // If w >= 6, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro vaddl_u16_n dst1, dst2, src1, src2, src3, src4, w - vaddl.u16 \dst1, \src1, \src3 -.if \w > 4 - vaddl.u16 \dst2, \src2, \src4 -.endif -.endm -.macro vaddw_u16_n dst1, dst2, src1, src2, w - vaddw.u16 \dst1, \dst1, \src1 -.if \w > 4 - vaddw.u16 \dst2, \dst2, \src2 -.endif -.endm -.macro vadd_i32_n dst1, dst2, src1, src2, w - vadd.i32 \dst1, \dst1, \src1 -.if \w > 4 - vadd.i32 \dst2, \dst2, \src2 -.endif -.endm + // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. -.macro add3 w + // Insert padding in q0/4.b[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 + + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 + +4: // Loop horizontally vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d8, d9, #1 @@ -684,19 +547,22 @@ function sgr_box3_h_8bpc_neon, export=1 vext.8 q10, q5, q6, #2 vext.8 q11, q5, q6, #4 - vaddl_u16_n q12, q13, d2, d3, d16, d17, \w - vaddw_u16_n q12, q13, d18, d19, \w + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddw.u16 q12, q12, d18 + vaddw.u16 q13, q13, d19 - vaddl_u16_n q8, q9, d10, d11, d20, d21, \w - vaddw_u16_n q8, q9, d22, d23, \w -.endm - add3 8 + vaddl.u16 q8, d10, d20 + vaddl.u16 q9, d11, d21 + vaddw.u16 q8, q8, d22 + vaddw.u16 q9, q9, d23 + + subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q8, q9}, [r10, :128]! 
- subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! @@ -711,86 +577,6 @@ function sgr_box3_h_8bpc_neon, export=1 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - - subs r5, r5, #4 // 2 <= w < 6 - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in q0 - sub lr, r5, #2 - // lr = (pixels valid - 2) - adr r11, L(box3_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - add r11, r11, lr - bx r11 - - .align 2 -L(box3_variable_shift_tbl): - .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - vext.8 q0, q0, q0, #2 - vext.8 q4, q4, q4, #2 - vext.8 q0, q0, q14, #14 - vext.8 q4, q4, q15, #14 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #3 - vext.8 q4, q4, q4, #3 - vext.8 q0, q0, q14, #13 - vext.8 q4, q4, q15, #13 - b 88f -44: // 4 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #5 - vext.8 q4, q4, q4, #5 - vext.8 q0, q0, q14, #11 - vext.8 q4, q4, q15, #11 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - vmull.u8 q1, d0, d0 - vmull.u8 q2, d1, d1 - vmull.u8 q5, d8, d8 - vmull.u8 q6, d9, d9 - - add3 4 - subs r5, r5, #4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - ble 9f - vext.8 q0, q0, q0, #4 - vext.8 q1, q1, q2, #8 - vext.8 q4, q4, q4, #4 - vext.8 q5, q5, q6, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q8}, [r10, :128]! - 9: subs r6, r6, #2 ble 0f @@ -806,7 +592,6 @@ L(box3_variable_shift_tbl): 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add3 endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, @@ -829,23 +614,11 @@ function sgr_box5_h_8bpc_neon, export=1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - add r8, r5, #13 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 - add r8, r5, #15 -1: sub r9, r9, lr, lsl #1 - bic r8, r8, #7 - sub r4, r4, r8 + add lr, lr, #8 + sub r4, r4, lr // Store the width for the vertical loop mov r8, r5 @@ -915,15 +688,31 @@ function sgr_box5_h_8bpc_neon, export=1 // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in q0. 
For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -1 + sub lr, lr, r5 + vld1.8 {q13}, [lr] + + vbit q0, q14, q13 + vbit q4, q15, q13 + + // Update the precalculated squares + vmull.u8 q1, d0, d0 + vmull.u8 q2, d1, d1 + vmull.u8 q5, d8, d8 + vmull.u8 q6, d9, d9 4: // Loop horizontally -.macro add5 w vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d0, d1, #3 @@ -945,35 +734,33 @@ function sgr_box5_h_8bpc_neon, export=1 vext.8 q9, q1, q2, #4 vext.8 q10, q1, q2, #6 vext.8 q11, q1, q2, #8 - vaddl_u16_n q12, q13, d2, d3, d16, d17, \w - vaddl_u16_n q8, q9, d18, d19, d20, d21, \w - vaddw_u16_n q12, q13, d22, d23, \w - vadd_i32_n q12, q13, q8, q9, \w + vaddl.u16 q12, d2, d16 + vaddl.u16 q13, d3, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q12, q12, d22 + vaddw.u16 q13, q13, d23 + vadd.i32 q12, q12, q8 + vadd.i32 q13, q13, q9 vext.8 q8, q5, q6, #2 vext.8 q9, q5, q6, #4 vext.8 q10, q5, q6, #6 vext.8 q11, q5, q6, #8 -.if \w > 4 - vaddl_u16_n q1, q5, d10, d11, d16, d17, 8 - vaddl_u16_n q8, q9, d18, d19, d20, d21, 8 - vaddw_u16_n q1, q5, d22, d23, 8 + vaddl.u16 q1, d10, d16 + vaddl.u16 q5, d11, d17 + vaddl.u16 q8, d18, d20 + vaddl.u16 q9, d19, d21 + vaddw.u16 q1, q1, d22 + vaddw.u16 q5, q5, d23 vadd.i32 q10, q1, q8 vadd.i32 q11, q5, q9 -.else - // Can't clobber q1/q5 if only doing 4 pixels - vaddl.u16 q8, d10, d16 - vaddl.u16 q9, d18, d20 - vaddw.u16 q8, q8, d22 - vadd.i32 q10, q8, q9 -.endif -.endm - add5 8 + + subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q10, q11}, [r10, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! @@ -987,98 +774,6 @@ function sgr_box5_h_8bpc_neon, export=1 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 7 <= w < 11 - add5 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in q0/q4 - sub lr, r5, #1 - // lr = pixels valid - 2 - adr r11, L(box5_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - add r11, r11, lr - bx r11 - - .align 2 -L(box5_variable_shift_tbl): - .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - vext.8 q0, q0, q0, #2 - vext.8 q4, q4, q4, #2 - vext.8 q0, q0, q14, #14 - vext.8 q4, q4, q15, #14 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #3 - vext.8 q4, q4, q4, #3 - vext.8 q0, q0, q14, #13 - vext.8 q4, q4, q15, #13 - b 88f -44: // 4 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #5 - vext.8 q4, q4, q4, #5 - vext.8 q0, q0, q14, #11 - vext.8 q4, q4, q15, #11 - b 88f -66: // 6 pixels valid - vext.8 q0, q0, q0, #6 - vext.8 q4, q4, q4, #6 - vext.8 q0, q0, q14, #10 - vext.8 q4, q4, q15, #10 - b 88f -77: // 7 pixels valid - vext.8 q0, q0, q0, #7 - vext.8 q4, q4, q4, #7 - vext.8 q0, q0, q14, #9 - vext.8 q4, q4, q15, #9 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - vmull.u8 q1, d0, d0 - vmull.u8 q2, d1, d1 - vmull.u8 q5, d8, d8 - vmull.u8 q6, d9, d9 - - add5 4 - subs r5, r5, #4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - ble 9f - vext.8 q0, q0, q0, #4 - vext.8 q1, q1, q2, #8 - vext.8 q4, q4, q4, #4 - vext.8 q5, q5, q6, #8 - add5 4 - vst1.16 {d6}, [r1, :64]! - vst1.16 {d14}, [r11, :64]! - vst1.32 {q12}, [r0, :128]! - vst1.32 {q10}, [r10, :128]! - 9: subs r6, r6, #2 ble 0f @@ -1094,7 +789,6 @@ L(box5_variable_shift_tbl): 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add5 endfunc sgr_funcs 8 diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index 7cda0cb2d3..11a28bc772 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -28,6 +28,18 @@ #include "src/arm/asm.S" #include "util.S" +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + // void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[7], const intptr_t w, @@ -55,27 +67,19 @@ function wiener_filter_h_16bpc_neon, export=1 bic r10, r10, #7 lsl r10, r10, #1 - // Clear the last unused element of q0, to allow filtering a single - // pixel with one plain vmul+vpadd. - mov r12, #0 - vmov.16 d1[3], r12 - // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 - // Subtract the width from mid_stride - sub r10, r10, r5, lsl #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp r5, #8 - add r11, r5, #13 + // Subtract the aligned width from mid_stride + add r11, r5, #7 bic r11, r11, #7 - bge 1f - mov r11, #16 -1: + sub r10, r10, r11, lsl #1 + + // Subtract the number of pixels read from the source stride + add r11, r11, #8 sub r3, r3, r11, lsl #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -143,12 +147,25 @@ function wiener_filter_h_16bpc_neon, export=1 vdup.16 q11, r11 vdup.16 q12, r9 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. 
+ + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can filter 4 pixels - b 6f + + // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel_local r4, right_ext_mask, -6 + sub r4, r4, r5, lsl #1 + vld1.8 {q9, q10}, [r4] + + vbit q2, q11, q9 + vbit q3, q11, q10 + vbit q4, q12, q9 + vbit q5, q12, q10 4: // Loop horizontally vext.8 q8, q2, q3, #2 @@ -209,10 +226,10 @@ function wiener_filter_h_16bpc_neon, export=1 vmin.u16 q7, q7, q10 vsub.i16 q6, q6, q15 vsub.i16 q7, q7, q15 + subs r5, r5, #8 vst1.16 {q6}, [r0, :128]! vst1.16 {q7}, [r12, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q2, q3 @@ -222,148 +239,6 @@ function wiener_filter_h_16bpc_neon, export=1 bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 -.macro filter_4 - vext.8 d18, d4, d5, #6 - vext.8 d16, d4, d5, #2 - vext.8 d17, d4, d5, #4 - vext.8 d19, d5, d6, #2 - vext.8 d20, d5, d6, #4 - vmull.s16 q6, d4, d0[0] - vmlal.s16 q6, d16, d0[1] - vmlal.s16 q6, d17, d0[2] - vmlal.s16 q6, d18, d0[3] - vmlal.s16 q6, d5, d1[0] - vmlal.s16 q6, d19, d1[1] - vmlal.s16 q6, d20, d1[2] - - vext.8 d18, d8, d9, #6 - vext.8 d16, d8, d9, #2 - vext.8 d17, d8, d9, #4 - vext.8 d19, d9, d10, #2 - vext.8 d20, d9, d10, #4 - vmull.s16 q7, d8, d0[0] - vmlal.s16 q7, d16, d0[1] - vmlal.s16 q7, d17, d0[2] - vmlal.s16 q7, d18, d0[3] - vmlal.s16 q7, d9, d1[0] - vmlal.s16 q7, d19, d1[1] - vmlal.s16 q7, d20, d1[2] - - vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 - vadd.i32 q6, q6, q14 - vadd.i32 q7, q7, q14 - vrshl.s32 q6, q6, q13 - vrshl.s32 q7, q7, q13 - vqmovun.s32 d12, q6 - vqmovun.s32 d13, q7 - vmin.u16 q6, q6, q10 - vsub.i16 q6, q6, q15 -.endm - filter_4 - vst1.16 {d12}, [r0, :64]! - vst1.16 {d13}, [r12, :64]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q2, q2, q3, #8 - vext.8 q3, q3, q3, #8 - vext.8 q4, q4, q5, #8 - vext.8 q5, q5, q5, #8 - -6: // Pad the right edge and filter the last few pixels. - // w < 7, w+3 pixels valid in q2-q3 - cmp r5, #5 - blt 7f - bgt 8f - // w == 5, 8 pixels valid in q2, q3 invalid - vmov q3, q11 - vmov q5, q12 - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in q2 - sub r9, r5, #1 - // r9 = (pixels valid - 4) - adr r11, L(variable_shift_tbl) - ldr r9, [r11, r9, lsl #2] - add r11, r11, r9 - vmov q3, q11 - vmov q5, q12 - bx r11 - - .align 2 -L(variable_shift_tbl): - .word 44f - L(variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(variable_shift_tbl) + CONFIG_THUMB - -44: // 4 pixels valid in q2/q4, fill the high half with padding. - vmov d5, d6 - vmov d9, d10 - b 88f - // Shift q2 right, shifting out invalid pixels, - // shift q2 left to the original offset, shifting in padding pixels. 
-55: // 5 pixels valid - vext.8 q2, q2, q2, #10 - vext.8 q2, q2, q3, #6 - vext.8 q4, q4, q4, #10 - vext.8 q4, q4, q5, #6 - b 88f -66: // 6 pixels valid - vext.8 q2, q2, q2, #12 - vext.8 q2, q2, q3, #4 - vext.8 q4, q4, q4, #12 - vext.8 q4, q4, q5, #4 - b 88f -77: // 7 pixels valid - vext.8 q2, q2, q2, #14 - vext.8 q2, q2, q3, #2 - vext.8 q4, q4, q4, #14 - vext.8 q4, q4, q5, #2 - b 88f - -8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3 - vext.8 q3, q3, q3, #2 - vext.8 q3, q3, q11, #14 - vext.8 q5, q5, q5, #2 - vext.8 q5, q5, q12, #14 - -88: - // w < 7, q2-q3 padded properly - cmp r5, #4 - blt 888f - - // w >= 4, filter 4 pixels - filter_4 - vst1.16 {d12}, [r0, :64]! - vst1.16 {d13}, [r12, :64]! - subs r5, r5, #4 // 0 <= w < 4 - vext.8 q2, q2, q3, #8 - vext.8 q4, q4, q5, #8 - beq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - vmull.s16 q6, d4, d0 - vmull.s16 q7, d5, d1 - vmull.s16 q8, d8, d0 - vmull.s16 q9, d9, d1 - vadd.i32 q6, q7 - vadd.i32 q8, q9 - vpadd.i32 d12, d12, d13 - vpadd.i32 d13, d16, d17 - vpadd.i32 d12, d12, d13 - vadd.i32 d12, d12, d28 - vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1 - vrshl.s32 d12, d12, d26 - vqmovun.s32 d12, q6 - vmin.u16 d12, d12, d20 - vsub.i16 d12, d12, d30 - vst1.16 {d12[0]}, [r0, :16]! - vst1.16 {d12[1]}, [r12, :16]! - subs r5, r5, #1 - vext.8 q2, q2, q3, #2 - vext.8 q4, q4, q5, #2 - bgt 888b - 9: subs r6, r6, #2 ble 0f @@ -377,7 +252,6 @@ L(variable_shift_tbl): 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem filter_4 endfunc // void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, @@ -591,25 +465,15 @@ function sgr_box3_h_16bpc_neon, export=1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 -1: sub r9, r9, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Subtract the number of pixels read from the input from the stride - add lr, r5, #14 - bic lr, lr, #7 + add lr, lr, #8 sub r4, r4, lr, lsl #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -678,16 +542,26 @@ function sgr_box3_h_16bpc_neon, export=1 // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #10 bge 4f // If w >= 10, all used input pixels are valid - cmp r5, #6 - bge 5f // If w >= 6, we can filter 4 pixels - b 6f + + // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in q0/1.h[w] onwards + movrel_local lr, right_ext_mask + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 4: // Loop horizontally -.macro add3 w -.if \w > 4 vext.8 q8, q0, q1, #2 vext.8 q10, q4, q5, #2 vext.8 q9, q0, q1, #4 @@ -696,16 +570,6 @@ function sgr_box3_h_16bpc_neon, export=1 vadd.i16 q3, q4, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 -.else - vext.8 d16, d0, d1, #2 - vext.8 d20, d8, d9, #2 - vext.8 d18, d0, d1, #4 - vext.8 d22, d8, d9, #4 - vadd.i16 d4, d0, d16 - vadd.i16 d6, d8, d20 - vadd.i16 d4, d4, d18 - vadd.i16 d6, d6, d22 -.endif vmull.u16 q6, d0, d0 vmlal.u16 q6, d16, d16 @@ -713,22 +577,18 @@ function sgr_box3_h_16bpc_neon, export=1 vmull.u16 q12, d8, d8 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d22, d22 -.if \w > 4 vmull.u16 q7, d1, d1 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmull.u16 q13, d9, d9 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 -.endif -.endm - add3 8 + subs r5, r5, #8 vst1.16 {q2}, [r1, :128]! vst1.16 {q3}, [r11, :128]! vst1.32 {q6, q7}, [r0, :128]! vst1.32 {q12, q13}, [r10, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q0, q1 @@ -739,78 +599,6 @@ function sgr_box3_h_16bpc_neon, export=1 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! - - subs r5, r5, #4 // 2 <= w < 6 - vext.8 q0, q0, q1, #8 - vext.8 q4, q4, q5, #8 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in q0 - sub lr, r5, #2 - // lr = (pixels valid - 2) - adr r11, L(box3_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - add r11, r11, lr - bx r11 - - .align 2 -L(box3_variable_shift_tbl): - .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #6 - vext.8 q4, q4, q4, #6 - vext.8 q0, q0, q14, #10 - vext.8 q4, q4, q15, #10 - b 88f -44: // 4 pixels valid - vmov d1, d28 - vmov d9, d30 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #10 - vext.8 q4, q4, q4, #10 - vext.8 q0, q0, q14, #6 - vext.8 q4, q4, q15, #6 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - - add3 4 - subs r5, r5, #4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! - ble 9f - vext.8 q0, q0, q0, #8 - vext.8 q4, q4, q4, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! 
- 9: subs r6, r6, #2 ble 0f @@ -826,7 +614,6 @@ L(box3_variable_shift_tbl): 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add3 endfunc // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, @@ -849,23 +636,11 @@ function sgr_box5_h_16bpc_neon, export=1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst r7, #2 // LR_HAVE_RIGHT - bne 0f - // !LR_HAVE_RIGHT - add lr, r5, #3 - bic lr, lr, #3 - add r8, r5, #13 - b 1f -0: add lr, r5, #7 bic lr, lr, #7 - add r8, r5, #15 -1: sub r9, r9, lr, lsl #1 - bic r8, r8, #7 - sub r4, r4, r8, lsl #1 + add lr, lr, #8 + sub r4, r4, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 @@ -935,16 +710,27 @@ function sgr_box5_h_16bpc_neon, export=1 // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid - cmp r5, #7 - bge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel_local lr, right_ext_mask, -2 + sub lr, lr, r5, lsl #1 + vld1.8 {q12, q13}, [lr] + + vbit q0, q14, q12 + vbit q1, q14, q13 + vbit q4, q15, q12 + vbit q5, q15, q13 4: // Loop horizontally -.macro add5 w -.if \w > 4 vext.8 q8, q0, q1, #2 vext.8 q10, q4, q5, #2 vext.8 q9, q0, q1, #4 @@ -953,16 +739,6 @@ function sgr_box5_h_16bpc_neon, export=1 vadd.i16 q3, q4, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 -.else - vext.8 d16, d0, d1, #2 - vext.8 d20, d8, d9, #2 - vext.8 d18, d0, d1, #4 - vext.8 d22, d8, d9, #4 - vadd.i16 d4, d0, d16 - vadd.i16 d6, d8, d20 - vadd.i16 d4, d4, d18 - vadd.i16 d6, d6, d22 -.endif vmull.u16 q6, d0, d0 vmlal.u16 q6, d16, d16 @@ -970,16 +746,13 @@ function sgr_box5_h_16bpc_neon, export=1 vmull.u16 q12, d8, d8 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d22, d22 -.if \w > 4 vmull.u16 q7, d1, d1 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmull.u16 q13, d9, d9 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 -.endif -.if \w > 4 vext.8 q8, q0, q1, #6 vext.8 q10, q4, q5, #6 vext.8 q9, q0, q1, #8 @@ -988,35 +761,22 @@ function sgr_box5_h_16bpc_neon, export=1 vadd.i16 q3, q3, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 -.else - vext.8 d16, d0, d1, #6 - // d18 would be equal to d1; using d1 instead - vext.8 d20, d8, d9, #6 - // d22 would be equal to d9; using d9 instead - vadd.i16 d4, d4, d16 - vadd.i16 d6, d6, d20 - vadd.i16 d4, d4, d1 - vadd.i16 d6, d6, d9 -.endif vmlal.u16 q6, d16, d16 vmlal.u16 q6, d1, d1 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d9, d9 -.if \w > 4 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 -.endif -.endm - add5 8 + + subs r5, r5, #8 vst1.16 {q2}, [r1, :128]! vst1.16 {q3}, [r11, :128]! vst1.32 {q6, q7}, [r0, :128]! vst1.32 {q12, q13}, [r10, :128]! - subs r5, r5, #8 ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q0, q1 @@ -1026,92 +786,6 @@ function sgr_box5_h_16bpc_neon, export=1 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
-5: // Produce 4 pixels, 7 <= w < 11 - add5 4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! - - subs r5, r5, #4 // 3 <= w < 7 - vext.8 q0, q0, q1, #8 - vext.8 q4, q4, q5, #8 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in q0/q4 - sub lr, r5, #1 - // lr = pixels valid - 2 - adr r11, L(box5_variable_shift_tbl) - ldr lr, [r11, lr, lsl #2] - vmov q1, q14 - vmov q5, q15 - add r11, r11, lr - bx r11 - - .align 2 -L(box5_variable_shift_tbl): - .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB - .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB - - // Shift q0 right, shifting out invalid pixels, - // shift q0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - vext.8 q0, q0, q0, #4 - vext.8 q4, q4, q4, #4 - vext.8 q0, q0, q14, #12 - vext.8 q4, q4, q15, #12 - b 88f -33: // 3 pixels valid - vext.8 q0, q0, q0, #6 - vext.8 q4, q4, q4, #6 - vext.8 q0, q0, q14, #10 - vext.8 q4, q4, q15, #10 - b 88f -44: // 4 pixels valid - vmov d1, d28 - vmov d9, d30 - b 88f -55: // 5 pixels valid - vext.8 q0, q0, q0, #10 - vext.8 q4, q4, q4, #10 - vext.8 q0, q0, q14, #6 - vext.8 q4, q4, q15, #6 - b 88f -66: // 6 pixels valid - vext.8 q0, q0, q0, #12 - vext.8 q4, q4, q4, #12 - vext.8 q0, q0, q14, #4 - vext.8 q4, q4, q15, #4 - b 88f -77: // 7 pixels valid - vext.8 q0, q0, q0, #14 - vext.8 q4, q4, q4, #14 - vext.8 q0, q0, q14, #2 - vext.8 q4, q4, q15, #2 - -88: - // Restore r11 after using it for a temporary value above - add r11, r1, #(2*SUM_STRIDE) - - add5 4 - subs r5, r5, #4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! - ble 9f - vext.8 q0, q0, q1, #8 - vext.8 q4, q4, q5, #8 - add5 4 - vst1.16 {d4}, [r1, :64]! - vst1.16 {d6}, [r11, :64]! - vst1.32 {q6}, [r0, :128]! - vst1.32 {q12}, [r10, :128]! - 9: subs r6, r6, #2 ble 0f @@ -1127,7 +801,6 @@ L(box5_variable_shift_tbl): 0: vpop {q4-q7} pop {r4-r11,pc} -.purgem add5 endfunc sgr_funcs 16 diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index 5d025153ef..fe40ae9875 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -28,6 +28,18 @@ #include "src/arm/asm.S" #include "util.S" +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[8], intptr_t w, @@ -43,26 +55,19 @@ function wiener_filter_h_8bpc_neon, export=1 bic w10, w10, #7 lsl w10, w10, #1 - // Clear the last unused element of v0, to allow filtering a single - // pixel with one plain mul+addv. 
- ins v0.h[7], wzr - // Set up pointers for reading/writing alternate rows add x12, x0, x10 lsl w10, w10, #1 add x13, x2, x3 lsl x3, x3, #1 - // Subtract the width from mid_stride - sub x10, x10, w5, uxtw #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. - cmp w5, #8 - add w11, w5, #13 + // Subtract the aligned width from mid_stride + add w11, w5, #7 bic w11, w11, #7 - b.ge 1f - mov w11, #16 -1: + sub x10, x10, w11, uxtw #1 + + // Subtract the number of pixels read from the source stride + add w11, w11, #8 sub x3, x3, w11, uxtw // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -127,15 +132,27 @@ function wiener_filter_h_8bpc_neon, export=1 dup v28.8h, v28.h[0] dup v29.8h, v29.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can filter 4 pixels - b 6f + + // 1 <= w < 11, w+3 pixels valid in v2-v3. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v2/3.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x4, right_ext_mask, -6 + sub x4, x4, w5, uxtw #1 + ld1 {v26.16b, v27.16b}, [x4] + + bit v2.16b, v28.16b, v26.16b + bit v3.16b, v28.16b, v27.16b + bit v4.16b, v29.16b, v26.16b + bit v5.16b, v29.16b, v27.16b 4: // Loop horizontally -.macro filter wd // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. @@ -145,43 +162,43 @@ function wiener_filter_h_8bpc_neon, export=1 ext v19.16b, v2.16b, v3.16b, #8 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 - mul v6\wd, v2\wd, v0.h[0] - mla v6\wd, v16\wd, v0.h[1] - mla v6\wd, v17\wd, v0.h[2] - mla v6\wd, v18\wd, v0.h[3] - mla v6\wd, v19\wd, v0.h[4] - mla v6\wd, v20\wd, v0.h[5] - mla v6\wd, v21\wd, v0.h[6] + mul v6.8h, v2.8h, v0.h[0] + mla v6.8h, v16.8h, v0.h[1] + mla v6.8h, v17.8h, v0.h[2] + mla v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] ext v22.16b, v4.16b, v5.16b, #2 ext v23.16b, v4.16b, v5.16b, #4 ext v24.16b, v4.16b, v5.16b, #6 ext v25.16b, v4.16b, v5.16b, #8 ext v26.16b, v4.16b, v5.16b, #10 ext v27.16b, v4.16b, v5.16b, #12 - mul v7\wd, v4\wd, v0.h[0] - mla v7\wd, v22\wd, v0.h[1] - mla v7\wd, v23\wd, v0.h[2] - mla v7\wd, v24\wd, v0.h[3] - mla v7\wd, v25\wd, v0.h[4] - mla v7\wd, v26\wd, v0.h[5] - mla v7\wd, v27\wd, v0.h[6] - - shl v18\wd, v18\wd, #7 - shl v24\wd, v24\wd, #7 - sub v18\wd, v18\wd, v30\wd - sub v24\wd, v24\wd, v30\wd - sqadd v6\wd, v6\wd, v18\wd - sqadd v7\wd, v7\wd, v24\wd - sshr v6\wd, v6\wd, #3 - sshr v7\wd, v7\wd, #3 - add v6\wd, v6\wd, v31\wd - add v7\wd, v7\wd, v31\wd -.endm - filter .8h + mul v7.8h, v4.8h, v0.h[0] + mla v7.8h, v22.8h, v0.h[1] + mla v7.8h, v23.8h, v0.h[2] + mla v7.8h, v24.8h, v0.h[3] + mla v7.8h, v25.8h, v0.h[4] + mla v7.8h, v26.8h, v0.h[5] + mla v7.8h, v27.8h, v0.h[6] + + shl v18.8h, v18.8h, #7 + shl v24.8h, v24.8h, #7 + sub v18.8h, v18.8h, v30.8h + sub v24.8h, v24.8h, v30.8h + sqadd v6.8h, v6.8h, v18.8h + sqadd v7.8h, v7.8h, v24.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + subs w5, w5, #8 + st1 {v6.8h}, [x0], #16 st1 {v7.8h}, [x12], #16 - subs w5, w5, #8 b.le 9f 
tst w7, #2 // LR_HAVE_RIGHT mov v2.16b, v3.16b @@ -193,102 +210,6 @@ function wiener_filter_h_8bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - - subs w5, w5, #4 // 3 <= w < 7 - ext v2.16b, v2.16b, v3.16b, #8 - ext v3.16b, v3.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - -6: // Pad the right edge and filter the last few pixels. - // w < 7, w+3 pixels valid in v2-v3 - cmp w5, #5 - b.lt 7f - b.gt 8f - // w == 5, 8 pixels valid in v2, v3 invalid - mov v3.16b, v28.16b - mov v5.16b, v29.16b - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in v2 - sub w9, w5, #1 - // w9 = (pixels valid - 4) - adr x11, L(variable_shift_tbl) - ldrh w9, [x11, w9, uxtw #1] - sub x11, x11, w9, uxth - mov v3.16b, v28.16b - mov v5.16b, v29.16b - br x11 -44: // 4 pixels valid in v2/v4, fill the high half with padding. - ins v2.d[1], v3.d[0] - ins v4.d[1], v5.d[0] - b 88f - // Shift v2 right, shifting out invalid pixels, - // shift v2 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - ext v2.16b, v2.16b, v2.16b, #10 - ext v2.16b, v2.16b, v3.16b, #6 - ext v4.16b, v4.16b, v4.16b, #10 - ext v4.16b, v4.16b, v5.16b, #6 - b 88f -66: // 6 pixels valid, fill the upper 2 pixels with padding. - ins v2.s[3], v3.s[0] - ins v4.s[3], v5.s[0] - b 88f -77: // 7 pixels valid, fill the last pixel with padding. - ins v2.h[7], v3.h[0] - ins v4.h[7], v5.h[0] - b 88f - -L(variable_shift_tbl): - .hword L(variable_shift_tbl) - 44b - .hword L(variable_shift_tbl) - 55b - .hword L(variable_shift_tbl) - 66b - .hword L(variable_shift_tbl) - 77b - -8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 - ins v28.h[0], v3.h[0] - ins v29.h[0], v5.h[0] - mov v3.16b, v28.16b - mov v5.16b, v29.16b - -88: - // w < 7, v2-v3 padded properly - cmp w5, #4 - b.lt 888f - - // w >= 4, filter 4 pixels - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - subs w5, w5, #4 // 0 <= w < 4 - ext v2.16b, v2.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - b.eq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - mul v6.8h, v2.8h, v0.8h - mul v7.8h, v4.8h, v0.8h - addv h6, v6.8h - addv h7, v7.8h - dup v16.4h, v2.h[3] - ins v16.h[1], v4.h[3] - ins v6.h[1], v7.h[0] - shl v16.4h, v16.4h, #7 - sub v16.4h, v16.4h, v30.4h - sqadd v6.4h, v6.4h, v16.4h - sshr v6.4h, v6.4h, #3 - add v6.4h, v6.4h, v31.4h - st1 {v6.h}[0], [x0], #2 - st1 {v6.h}[1], [x12], #2 - subs w5, w5, #1 - ext v2.16b, v2.16b, v3.16b, #2 - ext v4.16b, v4.16b, v5.16b, #2 - b.gt 888b - 9: subs w6, w6, #2 b.le 0f @@ -301,7 +222,6 @@ L(variable_shift_tbl): b 1b 0: ret -.purgem filter endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, @@ -498,25 +418,15 @@ function sgr_box3_h_8bpc_neon, export=1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. 
- tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 -1: sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride - add w13, w5, #14 - bic w13, w13, #7 + add w13, w13, #8 sub x4, x4, w13, uxtw // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -578,37 +488,33 @@ function sgr_box3_h_8bpc_neon, export=1 ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel - dup v30.8b, v30.b[0] - dup v31.8b, v31.b[0] + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid - cmp w5, #6 - b.ge 5f // If w >= 6, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro uaddl_nh dst1, dst2, src1, src2, w - uaddl \dst1, \src1\().4h, \src2\().4h -.if \w > 4 - uaddl2 \dst2, \src1\().8h, \src2\().8h -.endif -.endm -.macro uaddw_nh dst1, dst2, src, w - uaddw \dst1, \dst1, \src\().4h -.if \w > 4 - uaddw2 \dst2, \dst2, \src\().8h -.endif -.endm -.macro add_nh dst1, dst2, src1, src2, w - add \dst1, \dst1, \src1 -.if \w > 4 - add \dst2, \dst2, \src2 -.endif -.endm + // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. + + // Insert padding in v0/4.b[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b -.macro add3 w + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b + +4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v4.16b, v4.16b, #1 @@ -623,19 +529,23 @@ function sgr_box3_h_8bpc_neon, export=1 ext v22.16b, v5.16b, v6.16b, #2 ext v23.16b, v5.16b, v6.16b, #4 - uaddl_nh v26.4s, v27.4s, v1, v20, \w - uaddw_nh v26.4s, v27.4s, v21, \w + uaddl v26.4s, v1.4h, v20.4h + uaddl2 v27.4s, v1.8h, v20.8h + uaddw v26.4s, v26.4s, v21.4h + uaddw2 v27.4s, v27.4s, v21.8h + + uaddl v28.4s, v5.4h, v22.4h + uaddl2 v29.4s, v5.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + + subs w5, w5, #8 - uaddl_nh v28.4s, v29.4s, v5, v22, \w - uaddw_nh v28.4s, v29.4s, v23, \w -.endm - add3 8 st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 @@ -650,83 +560,6 @@ function sgr_box3_h_8bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - - subs w5, w5, #4 // 2 <= w < 6 - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - -6: // Pad the right edge and produce the last few pixels. 
- // 2 <= w < 6, 2-5 pixels valid in v0 - sub w13, w5, #2 - // w13 = (pixels valid - 2) - adr x14, L(box3_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. -22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #2 - ext v4.16b, v4.16b, v4.16b, #2 - ext v0.16b, v0.16b, v30.16b, #14 - ext v4.16b, v4.16b, v31.16b, #14 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #3 - ext v4.16b, v4.16b, v4.16b, #3 - ext v0.16b, v0.16b, v30.16b, #13 - ext v4.16b, v4.16b, v31.16b, #13 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v4.16b, v4.16b, v31.16b, #12 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #5 - ext v4.16b, v4.16b, v4.16b, #5 - ext v0.16b, v0.16b, v30.16b, #11 - ext v4.16b, v4.16b, v31.16b, #11 - b 88f - -L(box3_variable_shift_tbl): - .hword L(box3_variable_shift_tbl) - 22b - .hword L(box3_variable_shift_tbl) - 33b - .hword L(box3_variable_shift_tbl) - 44b - .hword L(box3_variable_shift_tbl) - 55b - -88: - umull v1.8h, v0.8b, v0.8b - umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - - add3 4 - subs w5, w5, #4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v1.16b, v1.16b, v2.16b, #8 - ext v5.16b, v5.16b, v6.16b, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -741,7 +574,6 @@ L(box3_variable_shift_tbl): b 1b 0: ret -.purgem add3 endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, @@ -760,23 +592,11 @@ function sgr_box5_h_8bpc_neon, export=1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - add w14, w5, #13 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 - add w14, w5, #15 -1: sub x9, x9, w13, uxtw #1 - bic w14, w14, #7 - sub x4, x4, w14, uxtw + add w13, w13, #8 + sub x4, x4, w13, uxtw // Store the width for the vertical loop mov w8, w5 @@ -839,18 +659,34 @@ function sgr_box5_h_8bpc_neon, export=1 ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel - dup v30.8b, v30.b[0] - dup v31.8b, v31.b[0] + dup v30.16b, v30.b[0] + dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the + // buffer pointer. 
+ movrel x13, right_ext_mask, -1 + sub x13, x13, w5, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + bit v4.16b, v31.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + umull v5.8h, v4.8b, v4.8b + umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally -.macro add5 w ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v0.16b, v0.16b, #3 @@ -877,22 +713,30 @@ function sgr_box5_h_8bpc_neon, export=1 ext v22.16b, v5.16b, v6.16b, #6 ext v23.16b, v5.16b, v6.16b, #8 - uaddl_nh v26.4s, v27.4s, v1, v16, \w - uaddl_nh v16.4s, v17.4s, v17, v18, \w - uaddl_nh v28.4s, v29.4s, v5, v20, \w - uaddw_nh v26.4s, v27.4s, v19, \w - uaddl_nh v20.4s, v21.4s, v21, v22, \w - uaddw_nh v28.4s, v29.4s, v23, \w - add_nh v26.4s, v27.4s, v16.4s, v17.4s, \w - add_nh v28.4s, v29.4s, v20.4s, v21.4s, \w -.endm - add5 8 + uaddl v26.4s, v1.4h, v16.4h + uaddl2 v27.4s, v1.8h, v16.8h + uaddl v16.4s, v17.4h, v18.4h + uaddl2 v17.4s, v17.8h, v18.8h + uaddl v28.4s, v5.4h, v20.4h + uaddl2 v29.4s, v5.8h, v20.8h + uaddw v26.4s, v26.4s, v19.4h + uaddw2 v27.4s, v27.4s, v19.8h + uaddl v20.4s, v21.4h, v22.4h + uaddl2 v21.4s, v21.8h, v22.8h + uaddw v28.4s, v28.4s, v23.4h + uaddw2 v29.4s, v29.4s, v23.8h + add v26.4s, v26.4s, v16.4s + add v27.4s, v27.4s, v17.4s + add v28.4s, v28.4s, v20.4s + add v29.4s, v29.4s, v21.4s + + subs w5, w5, #8 + st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 @@ -906,95 +750,6 @@ function sgr_box5_h_8bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 7 <= w < 11 - add5 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - - subs w5, w5, #4 // 3 <= w < 7 - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in v0/v4 - sub w13, w5, #1 - // w13 = pixels valid - 2 - adr x14, L(box5_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #2 - ext v4.16b, v4.16b, v4.16b, #2 - ext v0.16b, v0.16b, v30.16b, #14 - ext v4.16b, v4.16b, v31.16b, #14 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #3 - ext v4.16b, v4.16b, v4.16b, #3 - ext v0.16b, v0.16b, v30.16b, #13 - ext v4.16b, v4.16b, v31.16b, #13 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v4.16b, v4.16b, v4.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v4.16b, v4.16b, v31.16b, #12 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #5 - ext v4.16b, v4.16b, v4.16b, #5 - ext v0.16b, v0.16b, v30.16b, #11 - ext v4.16b, v4.16b, v31.16b, #11 - b 88f -66: // 6 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v4.16b, v4.16b, v4.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v4.16b, v4.16b, v31.16b, #10 - b 88f -77: // 7 pixels valid - ext v0.16b, v0.16b, v0.16b, #7 - ext v4.16b, v4.16b, v4.16b, #7 - ext v0.16b, v0.16b, v30.16b, #9 - ext v4.16b, v4.16b, v31.16b, #9 - b 88f - -L(box5_variable_shift_tbl): - .hword L(box5_variable_shift_tbl) - 22b - .hword L(box5_variable_shift_tbl) - 33b - .hword L(box5_variable_shift_tbl) - 44b - .hword L(box5_variable_shift_tbl) - 55b - .hword L(box5_variable_shift_tbl) - 66b - .hword L(box5_variable_shift_tbl) - 77b - -88: - umull v1.8h, v0.8b, v0.8b - umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - - add5 4 - subs w5, w5, #4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #4 - ext v1.16b, v1.16b, v2.16b, #8 - ext v4.16b, v4.16b, v4.16b, #4 - ext v5.16b, v5.16b, v6.16b, #8 - add5 4 - st1 {v3.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v26.4s}, [x0], #16 - st1 {v28.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -1009,7 +764,6 @@ L(box5_variable_shift_tbl): b 1b 0: ret -.purgem add5 endfunc sgr_funcs 8 diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index c5e853f64e..3e01a3543e 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -28,6 +28,18 @@ #include "src/arm/asm.S" #include "util.S" +const right_ext_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +right_ext_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + // void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[7], const intptr_t w, @@ -51,26 +63,19 @@ function wiener_filter_h_16bpc_neon, export=1 bic w10, w10, #7 lsl w10, w10, #1 - // Clear the last unused element of v0, to allow filtering a single - // pixel with one plain mul+addv. - ins v0.h[7], wzr - // Set up pointers for reading/writing alternate rows add x12, x0, x10 lsl w10, w10, #1 add x13, x2, x3 lsl x3, x3, #1 - // Subtract the width from mid_stride - sub x10, x10, w5, uxtw #1 - - // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels. 
- cmp w5, #8 - add w11, w5, #13 + // Subtract the aligned width from mid_stride + add w11, w5, #7 bic w11, w11, #7 - b.ge 1f - mov w11, #16 -1: + sub x10, x10, w11, uxtw #1 + + // Subtract the number of pixels read from the source stride + add w11, w11, #8 sub x3, x3, w11, uxtw #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -135,34 +140,27 @@ function wiener_filter_h_16bpc_neon, export=1 dup v27.8h, v27.h[0] dup v28.8h, v28.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can filter 4 pixels - b 6f -4: // Loop horizontally -.macro add_sz d0, d1, s0, s1, c, wd - add \d0\().4s, \s0\().4s, \c\().4s -.ifc \wd, .8h - add \d1\().4s, \s1\().4s, \c\().4s -.endif -.endm -.macro srshl_sz d0, d1, s0, s1, c, wd - srshl \d0\().4s, \s0\().4s, \c\().4s -.ifc \wd, .8h - srshl \d1\().4s, \s1\().4s, \c\().4s -.endif -.endm -.macro sqxtun_sz dst, s0, s1, wd - sqxtun \dst\().4h, \s0\().4s -.ifc \wd, .8h - sqxtun2 \dst\().8h, \s1\().4s -.endif -.endm + // 1 <= w < 11, w+3 pixels valid in v2-v3. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v2/3.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x4, right_ext_mask, -6 + sub x4, x4, w5, uxtw #1 + ld1 {v25.16b, v26.16b}, [x4] -.macro filter wd + bit v2.16b, v27.16b, v25.16b + bit v3.16b, v27.16b, v26.16b + bit v4.16b, v28.16b, v25.16b + bit v5.16b, v28.16b, v26.16b + +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
@@ -179,7 +177,6 @@ function wiener_filter_h_16bpc_neon, export=1 smlal v6.4s, v19.4h, v0.h[4] smlal v6.4s, v20.4h, v0.h[5] smlal v6.4s, v21.4h, v0.h[6] -.ifc \wd, .8h smull2 v7.4s, v2.8h, v0.h[0] smlal2 v7.4s, v16.8h, v0.h[1] smlal2 v7.4s, v17.8h, v0.h[2] @@ -187,7 +184,7 @@ function wiener_filter_h_16bpc_neon, export=1 smlal2 v7.4s, v19.8h, v0.h[4] smlal2 v7.4s, v20.8h, v0.h[5] smlal2 v7.4s, v21.8h, v0.h[6] -.endif + ext v19.16b, v4.16b, v5.16b, #2 ext v20.16b, v4.16b, v5.16b, #4 ext v21.16b, v4.16b, v5.16b, #6 @@ -201,7 +198,6 @@ function wiener_filter_h_16bpc_neon, export=1 smlal v16.4s, v22.4h, v0.h[4] smlal v16.4s, v23.4h, v0.h[5] smlal v16.4s, v24.4h, v0.h[6] -.ifc \wd, .8h smull2 v17.4s, v4.8h, v0.h[0] smlal2 v17.4s, v19.8h, v0.h[1] smlal2 v17.4s, v20.8h, v0.h[2] @@ -209,24 +205,30 @@ function wiener_filter_h_16bpc_neon, export=1 smlal2 v17.4s, v22.8h, v0.h[4] smlal2 v17.4s, v23.8h, v0.h[5] smlal2 v17.4s, v24.8h, v0.h[6] -.endif - mvni v24\wd, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 - add_sz v6, v7, v6, v7, v30, \wd - add_sz v16, v17, v16, v17, v30, \wd - srshl_sz v6, v7, v6, v7, v29, \wd - srshl_sz v16, v17, v16, v17, v29, \wd - sqxtun_sz v6, v6, v7, \wd - sqxtun_sz v7, v16, v17, \wd - umin v6\wd, v6\wd, v24\wd - umin v7\wd, v7\wd, v24\wd - sub v6\wd, v6\wd, v31\wd - sub v7\wd, v7\wd, v31\wd -.endm - filter .8h + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w5, w5, #8 + st1 {v6.8h}, [x0], #16 st1 {v7.8h}, [x12], #16 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v2.16b, v3.16b @@ -236,105 +238,6 @@ function wiener_filter_h_16bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -5: // Filter 4 pixels, 7 <= w < 11 - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - - subs w5, w5, #4 // 3 <= w < 7 - ext v2.16b, v2.16b, v3.16b, #8 - ext v3.16b, v3.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - ext v5.16b, v5.16b, v5.16b, #8 - -6: // Pad the right edge and filter the last few pixels. - // w < 7, w+3 pixels valid in v2-v3 - cmp w5, #5 - b.lt 7f - b.gt 8f - // w == 5, 8 pixels valid in v2, v3 invalid - mov v3.16b, v27.16b - mov v5.16b, v28.16b - b 88f - -7: // 1 <= w < 5, 4-7 pixels valid in v2 - sub w9, w5, #1 - // w9 = (pixels valid - 4) - adr x11, L(variable_shift_tbl) - ldrh w9, [x11, w9, uxtw #1] - sub x11, x11, w9, uxth - mov v3.16b, v27.16b - mov v5.16b, v28.16b - br x11 -44: // 4 pixels valid in v2/v4, fill the high half with padding. - ins v2.d[1], v3.d[0] - ins v4.d[1], v5.d[0] - b 88f - // Shift v2 right, shifting out invalid pixels, - // shift v2 left to the original offset, shifting in padding pixels. -55: // 5 pixels valid - ext v2.16b, v2.16b, v2.16b, #10 - ext v2.16b, v2.16b, v3.16b, #6 - ext v4.16b, v4.16b, v4.16b, #10 - ext v4.16b, v4.16b, v5.16b, #6 - b 88f -66: // 6 pixels valid, fill the upper 2 pixels with padding. - ins v2.s[3], v3.s[0] - ins v4.s[3], v5.s[0] - b 88f -77: // 7 pixels valid, fill the last pixel with padding. 
- ins v2.h[7], v3.h[0] - ins v4.h[7], v5.h[0] - b 88f - -L(variable_shift_tbl): - .hword L(variable_shift_tbl) - 44b - .hword L(variable_shift_tbl) - 55b - .hword L(variable_shift_tbl) - 66b - .hword L(variable_shift_tbl) - 77b - -8: // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3 - ins v27.h[0], v3.h[0] - ins v28.h[0], v5.h[0] - mov v3.16b, v27.16b - mov v5.16b, v28.16b - -88: - // w < 7, v2-v3 padded properly - cmp w5, #4 - b.lt 888f - - // w >= 4, filter 4 pixels - filter .4h - st1 {v6.4h}, [x0], #8 - st1 {v7.4h}, [x12], #8 - subs w5, w5, #4 // 0 <= w < 4 - ext v2.16b, v2.16b, v3.16b, #8 - ext v4.16b, v4.16b, v5.16b, #8 - b.eq 9f -888: // 1 <= w < 4, filter 1 pixel at a time - smull v6.4s, v2.4h, v0.4h - smull2 v7.4s, v2.8h, v0.8h - smull v16.4s, v4.4h, v0.4h - smull2 v17.4s, v4.8h, v0.8h - add v6.4s, v6.4s, v7.4s - add v16.4s, v16.4s, v17.4s - addv s6, v6.4s - addv s7, v16.4s - ins v6.s[1], v7.s[0] - mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 - add v6.2s, v6.2s, v30.2s - srshl v6.2s, v6.2s, v29.2s - sqxtun v6.4h, v6.4s - umin v6.4h, v6.4h, v24.4h - sub v6.4h, v6.4h, v31.4h - st1 {v6.h}[0], [x0], #2 - st1 {v6.h}[1], [x12], #2 - subs w5, w5, #1 - ext v2.16b, v2.16b, v3.16b, #2 - ext v4.16b, v4.16b, v5.16b, #2 - b.gt 888b - 9: subs w6, w6, #2 b.le 0f @@ -347,7 +250,6 @@ L(variable_shift_tbl): b 1b 0: ret -.purgem filter endfunc // void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, @@ -552,25 +454,15 @@ function sgr_box3_h_16bpc_neon, export=1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 -1: sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride - add w13, w5, #14 - bic w13, w13, #7 + add w13, w13, #8 sub x4, x4, w13, uxtw #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL @@ -634,48 +526,55 @@ function sgr_box3_h_16bpc_neon, export=1 dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid - cmp w5, #6 - b.ge 5f // If w >= 6, we can filter 4 pixels - b 6f + + // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // again; it's not strictly needed in those cases (we pad enough here), + // but keeping the code as simple as possible. 
+ + // Insert padding in v0/1.h[w] onwards + movrel x13, right_ext_mask + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally -.macro add3 w, wd ext v26.16b, v0.16b, v1.16b, #2 ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 ext v29.16b, v16.16b, v17.16b, #4 - add v6\wd, v0\wd, v26\wd + add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7\wd, v16\wd, v28\wd + add v7.8h, v16.8h, v28.8h umull v24.4s, v16.4h, v16.4h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h - add v6\wd, v6\wd, v27\wd -.if \w > 4 + add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h -.endif - add v7\wd, v7\wd, v29\wd -.if \w > 4 + add v7.8h, v7.8h, v29.8h umull2 v25.4s, v16.8h, v16.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h -.endif -.endm - add3 8, .8h + + subs w5, w5, #8 + st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b @@ -686,76 +585,6 @@ function sgr_box3_h_16bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 6 <= w < 10 - add3 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - - subs w5, w5, #4 // 2 <= w < 6 - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - -6: // Pad the right edge and produce the last few pixels. - // 2 <= w < 6, 2-5 pixels valid in v0 - sub w13, w5, #2 - // w13 = (pixels valid - 2) - adr x14, L(box3_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v16.16b, v16.16b, v31.16b, #12 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v16.16b, v16.16b, v16.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v16.16b, v16.16b, v31.16b, #10 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - ext v0.16b, v0.16b, v30.16b, #8 - ext v16.16b, v16.16b, v31.16b, #8 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #10 - ext v16.16b, v16.16b, v16.16b, #10 - ext v0.16b, v0.16b, v30.16b, #6 - ext v16.16b, v16.16b, v31.16b, #6 - b 88f - -L(box3_variable_shift_tbl): - .hword L(box3_variable_shift_tbl) - 22b - .hword L(box3_variable_shift_tbl) - 33b - .hword L(box3_variable_shift_tbl) - 44b - .hword L(box3_variable_shift_tbl) - 55b - -88: - add3 4, .4h - subs w5, w5, #4 - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - // Only one needed pixel left, but do a normal 4 pixel - // addition anyway - add3 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -770,7 +599,6 @@ L(box3_variable_shift_tbl): b 1b 0: ret -.purgem add3 endfunc // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, @@ -789,23 +617,11 @@ function sgr_box5_h_16bpc_neon, export=1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. - // With LR_HAVE_RIGHT, align to 8, without it, align to 4. - // Subtract the number of pixels read from the input from the stride. - tst w7, #2 // LR_HAVE_RIGHT - b.ne 0f - // !LR_HAVE_RIGHT - add w13, w5, #3 - bic w13, w13, #3 - add w14, w5, #13 - b 1f -0: add w13, w5, #7 bic w13, w13, #7 - add w14, w5, #15 -1: sub x9, x9, w13, uxtw #1 - bic w14, w14, #7 - sub x4, x4, w14, uxtw #1 + add w13, w13, #8 + sub x4, x4, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 @@ -870,70 +686,74 @@ function sgr_box5_h_16bpc_neon, export=1 dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT - // If we'll have to pad the right edge we need to quit early here. + + // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid - cmp w5, #7 - b.ge 5f // If w >= 7, we can produce 4 pixels - b 6f + + // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // buffer pointer. 
+ movrel x13, right_ext_mask, -2 + sub x13, x13, w5, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + bit v16.16b, v31.16b, v28.16b + bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally -.macro add5 w, wd ext v26.16b, v0.16b, v1.16b, #2 ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 ext v29.16b, v16.16b, v17.16b, #4 - add v6\wd, v0\wd, v26\wd + add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7\wd, v16\wd, v28\wd + add v7.8h, v16.8h, v28.8h umull v24.4s, v16.4h, v16.4h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h - add v6\wd, v6\wd, v27\wd -.if \w > 4 + add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h -.endif - add v7\wd, v7\wd, v29\wd -.if \w > 4 + add v7.8h, v7.8h, v29.8h umull2 v25.4s, v16.8h, v16.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h -.endif ext v26.16b, v0.16b, v1.16b, #6 ext v28.16b, v16.16b, v17.16b, #6 ext v27.16b, v0.16b, v1.16b, #8 ext v29.16b, v16.16b, v17.16b, #8 - add v6\wd, v6\wd, v26\wd + add v6.8h, v6.8h, v26.8h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7\wd, v7\wd, v28\wd + add v7.8h, v7.8h, v28.8h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h - add v6\wd, v6\wd, v27\wd -.if \w > 4 + add v6.8h, v6.8h, v27.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h -.endif - add v7\wd, v7\wd, v29\wd -.if \w > 4 + add v7.8h, v7.8h, v29.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h -.endif -.endm - add5 8, .8h + + subs w5, w5, #8 + st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 - subs w5, w5, #8 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b @@ -944,90 +764,6 @@ function sgr_box5_h_16bpc_neon, export=1 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. -5: // Produce 4 pixels, 7 <= w < 11 - add5 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - - subs w5, w5, #4 // 3 <= w < 7 - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - -6: // Pad the right edge and produce the last few pixels. - // w < 7, w+1 pixels valid in v0/v4 - sub w13, w5, #1 - // w13 = pixels valid - 2 - adr x14, L(box5_variable_shift_tbl) - ldrh w13, [x14, w13, uxtw #1] - mov v1.16b, v30.16b - mov v17.16b, v31.16b - sub x13, x14, w13, uxth - br x13 - // Shift v0 right, shifting out invalid pixels, - // shift v0 left to the original offset, shifting in padding pixels. 
-22: // 2 pixels valid - ext v0.16b, v0.16b, v0.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v0.16b, v0.16b, v30.16b, #12 - ext v16.16b, v16.16b, v31.16b, #12 - b 88f -33: // 3 pixels valid - ext v0.16b, v0.16b, v0.16b, #6 - ext v16.16b, v16.16b, v16.16b, #6 - ext v0.16b, v0.16b, v30.16b, #10 - ext v16.16b, v16.16b, v31.16b, #10 - b 88f -44: // 4 pixels valid - ext v0.16b, v0.16b, v0.16b, #8 - ext v16.16b, v16.16b, v16.16b, #8 - ext v0.16b, v0.16b, v30.16b, #8 - ext v16.16b, v16.16b, v31.16b, #8 - b 88f -55: // 5 pixels valid - ext v0.16b, v0.16b, v0.16b, #10 - ext v16.16b, v16.16b, v16.16b, #10 - ext v0.16b, v0.16b, v30.16b, #6 - ext v16.16b, v16.16b, v31.16b, #6 - b 88f -66: // 6 pixels valid - ext v0.16b, v0.16b, v0.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v0.16b, v0.16b, v30.16b, #4 - ext v16.16b, v16.16b, v31.16b, #4 - b 88f -77: // 7 pixels valid - ext v0.16b, v0.16b, v0.16b, #14 - ext v16.16b, v16.16b, v16.16b, #14 - ext v0.16b, v0.16b, v30.16b, #2 - ext v16.16b, v16.16b, v31.16b, #2 - b 88f - -L(box5_variable_shift_tbl): - .hword L(box5_variable_shift_tbl) - 22b - .hword L(box5_variable_shift_tbl) - 33b - .hword L(box5_variable_shift_tbl) - 44b - .hword L(box5_variable_shift_tbl) - 55b - .hword L(box5_variable_shift_tbl) - 66b - .hword L(box5_variable_shift_tbl) - 77b - -88: - add5 4, .4h - subs w5, w5, #4 - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - b.le 9f - ext v0.16b, v0.16b, v1.16b, #8 - ext v16.16b, v16.16b, v17.16b, #8 - add5 4, .4h - st1 {v6.4h}, [x1], #8 - st1 {v7.4h}, [x11], #8 - st1 {v22.4s}, [x0], #16 - st1 {v24.4s}, [x10], #16 - 9: subs w6, w6, #2 b.le 0f @@ -1042,7 +778,6 @@ L(box5_variable_shift_tbl): b 1b 0: ret -.purgem add5 endfunc sgr_funcs 16 From 62763119b1d81826034097893bc952ec9f86d3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 15 Jan 2021 13:43:06 +0200 Subject: [PATCH 111/155] arm: looprestoration: Exploit wiener filter symmetry in the horz filter This gives a minor speedup on 8 bpc and a bit bigger speedup on 16 bpc. Sample speedups from arm64: Before: Cortex A53 A72 A73 wiener_7tap_8bpc_neon: 143885.7 101571.5 96187.2 wiener_7tap_10bpc_neon: 171210.8 119410.4 122447.8 After: wiener_7tap_8bpc_neon: 142985.0 94400.8 89959.3 wiener_7tap_10bpc_neon: 168818.4 113980.2 116662.0 --- src/arm/32/looprestoration.S | 56 ++++++++++++++++------------------ src/arm/32/looprestoration16.S | 55 +++++++++++++++------------------ src/arm/64/looprestoration.S | 24 +++++++-------- src/arm/64/looprestoration16.S | 34 +++++++++------------ 4 files changed, 78 insertions(+), 91 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index a9a5ccdce7..69836e293b 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -46,9 +46,9 @@ endconst // int h, enum LrEdgeFlags edges); function wiener_filter_h_8bpc_neon, export=1 push {r4-r11,lr} - vpush {q4} - ldrd r4, r5, [sp, #52] - ldrd r6, r7, [sp, #60] + vpush {q4-q7} + ldrd r4, r5, [sp, #100] + ldrd r6, r7, [sp, #108] mov r8, r5 vld1.16 {q0}, [r4, :128] movw r9, #(1 << 14) - (1 << 2) @@ -159,34 +159,32 @@ function wiener_filter_h_8bpc_neon, export=1 vbit q9, q13, q11 4: // Loop horizontally - // This is tuned as some sort of compromise between Cortex A7, A8, - // A9 and A53. 
- vmul.s16 q3, q1, d0[0] - vext.8 q10, q1, q2, #2 vext.8 q11, q1, q2, #4 - vmla.s16 q3, q10, d0[1] - vmla.s16 q3, q11, d0[2] - vext.8 q10, q1, q2, #6 - vext.8 q11, q1, q2, #8 - vmla.s16 q3, q10, d0[3] - vmla.s16 q3, q11, d1[0] - vext.8 q10, q1, q2, #10 - vext.8 q11, q1, q2, #12 - vmla.s16 q3, q10, d1[1] - vmla.s16 q3, q11, d1[2] - - vmul.s16 q10, q8, d0[0] - vext.8 q11, q8, q9, #2 + vext.8 q5, q1, q2, #8 + vext.8 q10, q1, q2, #2 + vext.8 q6, q1, q2, #10 + vext.8 q7, q1, q2, #12 + vext.8 q4, q1, q2, #6 + vadd.i16 q5, q5, q11 + vadd.i16 q6, q6, q10 + vadd.i16 q7, q7, q1 + vmul.s16 q3, q4, d0[3] + vmla.s16 q3, q5, d1[0] + vmla.s16 q3, q6, d1[1] + vmla.s16 q3, q7, d1[2] + vext.8 q4, q8, q9, #4 - vmla.s16 q10, q11, d0[1] - vmla.s16 q10, q4, d0[2] - vext.8 q11, q8, q9, #6 - vext.8 q4, q8, q9, #8 - vmla.s16 q10, q11, d0[3] - vmla.s16 q10, q4, d1[0] - vext.8 q11, q8, q9, #10 + vext.8 q6, q8, q9, #8 + vext.8 q11, q8, q9, #2 + vext.8 q7, q8, q9, #10 + vadd.i16 q6, q6, q4 vext.8 q4, q8, q9, #12 - vmla.s16 q10, q11, d1[1] + vext.8 q5, q8, q9, #6 + vadd.i16 q7, q7, q11 + vadd.i16 q4, q4, q8 + vmul.s16 q10, q5, d0[3] + vmla.s16 q10, q6, d1[0] + vmla.s16 q10, q7, d1[1] vmla.s16 q10, q4, d1[2] vext.8 q1, q1, q2, #6 @@ -227,7 +225,7 @@ function wiener_filter_h_8bpc_neon, export=1 mov r5, r8 b 1b 0: - vpop {q4} + vpop {q4-q7} pop {r4-r11,pc} endfunc diff --git a/src/arm/32/looprestoration16.S b/src/arm/32/looprestoration16.S index 11a28bc772..d699617a87 100644 --- a/src/arm/32/looprestoration16.S +++ b/src/arm/32/looprestoration16.S @@ -168,46 +168,41 @@ function wiener_filter_h_16bpc_neon, export=1 vbit q5, q12, q10 4: // Loop horizontally - vext.8 q8, q2, q3, #2 - vext.8 q9, q2, q3, #4 - vext.8 q10, q2, q3, #6 - vmull.s16 q6, d4, d0[0] - vmlal.s16 q6, d16, d0[1] - vmlal.s16 q6, d18, d0[2] - vmlal.s16 q6, d20, d0[3] - vmull.s16 q7, d5, d0[0] - vmlal.s16 q7, d17, d0[1] - vmlal.s16 q7, d19, d0[2] - vmlal.s16 q7, d21, d0[3] + vext.8 q7, q2, q3, #4 vext.8 q8, q2, q3, #8 + vext.8 q6, q2, q3, #2 vext.8 q9, q2, q3, #10 - vext.8 q10, q2, q3, #12 + vadd.i16 q8, q8, q7 + vadd.i16 q9, q9, q6 + vext.8 q6, q2, q3, #12 + vext.8 q7, q2, q3, #6 + vadd.i16 q2, q2, q6 + vmull.s16 q6, d14, d0[3] vmlal.s16 q6, d16, d1[0] vmlal.s16 q6, d18, d1[1] - vmlal.s16 q6, d20, d1[2] + vmlal.s16 q6, d4, d1[2] + vmull.s16 q7, d15, d0[3] vmlal.s16 q7, d17, d1[0] vmlal.s16 q7, d19, d1[1] - vmlal.s16 q7, d21, d1[2] - vext.8 q2, q4, q5, #2 - vext.8 q10, q4, q5, #6 - vmull.s16 q8, d8, d0[0] - vmlal.s16 q8, d4, d0[1] - vmlal.s16 q8, d20, d0[3] - vmull.s16 q9, d9, d0[0] - vmlal.s16 q9, d5, d0[1] - vmlal.s16 q9, d21, d0[3] - vext.8 q2, q4, q5, #4 + vmlal.s16 q7, d5, d1[2] + + vext.8 q8, q4, q5, #4 vext.8 q10, q4, q5, #8 - vmlal.s16 q8, d4, d0[2] - vmlal.s16 q8, d20, d1[0] - vmlal.s16 q9, d5, d0[2] - vmlal.s16 q9, d21, d1[0] + vext.8 q9, q4, q5, #2 vext.8 q2, q4, q5, #10 - vext.8 q10, q4, q5, #12 + vadd.i16 q10, q10, q8 + vadd.i16 q2, q2, q9 + vext.8 q8, q4, q5, #12 + vext.8 q9, q4, q5, #6 + vadd.i16 q4, q4, q8 + vmull.s16 q8, d18, d0[3] + vmlal.s16 q8, d20, d1[0] vmlal.s16 q8, d4, d1[1] - vmlal.s16 q8, d20, d1[2] + vmlal.s16 q8, d8, d1[2] + vmull.s16 q9, d19, d0[3] + vmlal.s16 q9, d21, d1[0] vmlal.s16 q9, d5, d1[1] - vmlal.s16 q9, d21, d1[2] + vmlal.s16 q9, d9, d1[2] vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 vadd.i32 q6, q6, q14 diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index fe40ae9875..fdf4e34147 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -156,29 +156,29 @@ function 
wiener_filter_h_8bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 - ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 - mul v6.8h, v2.8h, v0.h[0] - mla v6.8h, v16.8h, v0.h[1] - mla v6.8h, v17.8h, v0.h[2] - mla v6.8h, v18.8h, v0.h[3] + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + mul v6.8h, v18.8h, v0.h[3] mla v6.8h, v19.8h, v0.h[4] mla v6.8h, v20.8h, v0.h[5] mla v6.8h, v21.8h, v0.h[6] - ext v22.16b, v4.16b, v5.16b, #2 ext v23.16b, v4.16b, v5.16b, #4 - ext v24.16b, v4.16b, v5.16b, #6 ext v25.16b, v4.16b, v5.16b, #8 + ext v22.16b, v4.16b, v5.16b, #2 ext v26.16b, v4.16b, v5.16b, #10 ext v27.16b, v4.16b, v5.16b, #12 - mul v7.8h, v4.8h, v0.h[0] - mla v7.8h, v22.8h, v0.h[1] - mla v7.8h, v23.8h, v0.h[2] - mla v7.8h, v24.8h, v0.h[3] + ext v24.16b, v4.16b, v5.16b, #6 + add v25.8h, v25.8h, v23.8h + add v26.8h, v26.8h, v22.8h + add v27.8h, v27.8h, v4.8h + mul v7.8h, v24.8h, v0.h[3] mla v7.8h, v25.8h, v0.h[4] mla v7.8h, v26.8h, v0.h[5] mla v7.8h, v27.8h, v0.h[6] diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 3e01a3543e..3d43a848d6 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -164,44 +164,38 @@ function wiener_filter_h_16bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - ext v16.16b, v2.16b, v3.16b, #2 ext v17.16b, v2.16b, v3.16b, #4 - ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 - smull v6.4s, v2.4h, v0.h[0] - smlal v6.4s, v16.4h, v0.h[1] - smlal v6.4s, v17.4h, v0.h[2] - smlal v6.4s, v18.4h, v0.h[3] + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] smlal v6.4s, v19.4h, v0.h[4] smlal v6.4s, v20.4h, v0.h[5] smlal v6.4s, v21.4h, v0.h[6] - smull2 v7.4s, v2.8h, v0.h[0] - smlal2 v7.4s, v16.8h, v0.h[1] - smlal2 v7.4s, v17.8h, v0.h[2] - smlal2 v7.4s, v18.8h, v0.h[3] + smull2 v7.4s, v18.8h, v0.h[3] smlal2 v7.4s, v19.8h, v0.h[4] smlal2 v7.4s, v20.8h, v0.h[5] smlal2 v7.4s, v21.8h, v0.h[6] - ext v19.16b, v4.16b, v5.16b, #2 ext v20.16b, v4.16b, v5.16b, #4 - ext v21.16b, v4.16b, v5.16b, #6 ext v22.16b, v4.16b, v5.16b, #8 + ext v19.16b, v4.16b, v5.16b, #2 ext v23.16b, v4.16b, v5.16b, #10 ext v24.16b, v4.16b, v5.16b, #12 - smull v16.4s, v4.4h, v0.h[0] - smlal v16.4s, v19.4h, v0.h[1] - smlal v16.4s, v20.4h, v0.h[2] - smlal v16.4s, v21.4h, v0.h[3] + ext v21.16b, v4.16b, v5.16b, #6 + add v22.8h, v22.8h, v20.8h + add v23.8h, v23.8h, v19.8h + add v24.8h, v24.8h, v4.8h + smull v16.4s, v21.4h, v0.h[3] smlal v16.4s, v22.4h, v0.h[4] smlal v16.4s, v23.4h, v0.h[5] smlal v16.4s, v24.4h, v0.h[6] - smull2 v17.4s, v4.8h, v0.h[0] - smlal2 v17.4s, v19.8h, v0.h[1] - smlal2 v17.4s, v20.8h, v0.h[2] - smlal2 v17.4s, v21.8h, v0.h[3] + smull2 v17.4s, v21.8h, v0.h[3] smlal2 v17.4s, v22.8h, v0.h[4] smlal2 v17.4s, v23.8h, v0.h[5] smlal2 v17.4s, v24.8h, v0.h[6] From 5a2403583962d3c98fbfe4a9e224e32d1d9f315c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: 
Fri, 15 Jan 2021 14:22:56 +0200 Subject: [PATCH 112/155] arm: looprestoration: Exploit wiener filter symmetry in the vert filter Only doing this for 8bpc; for higher bitdepths, adding the input coefficients can overflow a signed 16 bit element. Before: Cortex A53 A72 A73 wiener_7tap_8bpc_neon: 142985.0 94400.8 89959.3 After: wiener_7tap_8bpc_neon: 136614.4 88828.3 86997.0 --- src/arm/32/looprestoration.S | 31 +++++++++++++++---------------- src/arm/64/looprestoration.S | 25 +++++++++++-------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/src/arm/32/looprestoration.S b/src/arm/32/looprestoration.S index 69836e293b..be5c658d6d 100644 --- a/src/arm/32/looprestoration.S +++ b/src/arm/32/looprestoration.S @@ -235,8 +235,9 @@ endfunc // ptrdiff_t mid_stride); function wiener_filter_v_8bpc_neon, export=1 push {r4-r7,lr} - ldrd r4, r5, [sp, #20] - ldrd r6, r7, [sp, #28] + vpush {q4-q6} + ldrd r4, r5, [sp, #68] + ldrd r6, r7, [sp, #76] mov lr, r4 vld1.16 {q0}, [r5, :128] @@ -280,20 +281,17 @@ function wiener_filter_v_8bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - vmull.s16 q2, d16, d0[0] - vmlal.s16 q2, d18, d0[1] - vmlal.s16 q2, d20, d0[2] - vmlal.s16 q2, d22, d0[3] - vmlal.s16 q2, d24, d1[0] - vmlal.s16 q2, d26, d1[1] - vmlal.s16 q2, d28, d1[2] - vmull.s16 q3, d17, d0[0] - vmlal.s16 q3, d19, d0[1] - vmlal.s16 q3, d21, d0[2] - vmlal.s16 q3, d23, d0[3] - vmlal.s16 q3, d25, d1[0] - vmlal.s16 q3, d27, d1[1] - vmlal.s16 q3, d29, d1[2] + vadd.i16 q4, q10, q12 + vadd.i16 q5, q9, q13 + vadd.i16 q6, q8, q14 + vmull.s16 q2, d22, d0[3] + vmlal.s16 q2, d8, d1[0] + vmlal.s16 q2, d10, d1[1] + vmlal.s16 q2, d12, d1[2] + vmull.s16 q3, d23, d0[3] + vmlal.s16 q3, d9, d1[0] + vmlal.s16 q3, d11, d1[1] + vmlal.s16 q3, d13, d1[2] vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 @@ -402,6 +400,7 @@ function wiener_filter_v_8bpc_neon, export=1 b 1b 0: + vpop {q4-q6} pop {r4-r7,pc} .purgem filter endfunc diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index fdf4e34147..a72c8b9553 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -272,20 +272,17 @@ function wiener_filter_v_8bpc_neon, export=1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
- smull v2.4s, v16.4h, v0.h[0] - smlal v2.4s, v17.4h, v0.h[1] - smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v0.h[3] - smlal v2.4s, v20.4h, v0.h[4] - smlal v2.4s, v21.4h, v0.h[5] - smlal v2.4s, v22.4h, v0.h[6] - smull2 v3.4s, v16.8h, v0.h[0] - smlal2 v3.4s, v17.8h, v0.h[1] - smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v0.h[3] - smlal2 v3.4s, v20.8h, v0.h[4] - smlal2 v3.4s, v21.8h, v0.h[5] - smlal2 v3.4s, v22.8h, v0.h[6] + add v25.8h, v18.8h, v20.8h + add v26.8h, v17.8h, v21.8h + add v27.8h, v16.8h, v22.8h + smull v2.4s, v19.4h, v0.h[3] + smlal v2.4s, v25.4h, v0.h[4] + smlal v2.4s, v26.4h, v0.h[5] + smlal v2.4s, v27.4h, v0.h[6] + smull2 v3.4s, v19.8h, v0.h[3] + smlal2 v3.4s, v25.8h, v0.h[4] + smlal2 v3.4s, v26.8h, v0.h[5] + smlal2 v3.4s, v27.8h, v0.h[6] sqrshrun v2.4h, v2.4s, #11 sqrshrun2 v2.8h, v3.4s, #11 sqxtun v2.8b, v2.8h From d0ebfc6240888e0cb9a1714bb693d6e0fda1610c Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Tue, 26 Jan 2021 11:26:01 -0500 Subject: [PATCH 113/155] arm64: cdef 8bpc: Accumulate sum in bytes Use mla (8-bit -> 8-bit) instead of smlal (8-bit -> 16-bit). Before: Cortex A53 A72 A73 cdef_filter_4x4_8bpc_neon: 389.7 264.0 261.7 cdef_filter_4x8_8bpc_neon: 687.2 476.2 465.5 cdef_filter_8x8_8bpc_neon: 1152.9 752.1 789.5 After: cdef_filter_4x4_8bpc_neon: 385.2 263.4 259.2 cdef_filter_4x8_8bpc_neon: 677.5 473.8 459.8 cdef_filter_8x8_8bpc_neon: 1134.4 744.6 774.6 --- src/arm/64/cdef.S | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/src/arm/64/cdef.S b/src/arm/64/cdef.S index 6104470a63..fd86dae4be 100644 --- a/src/arm/64/cdef.S +++ b/src/arm/64/cdef.S @@ -363,10 +363,8 @@ find_dir 8 neg v20.16b, v21.16b // -imin() bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() - smlal v1.8h, v18.8b, v19.8b // sum += taps[k] * constrain() - smlal v1.8h, v22.8b, v19.8b // sum += taps[k] * constrain() - smlal2 v2.8h, v18.16b, v19.16b // sum += taps[k] * constrain() - smlal2 v2.8h, v22.16b, v19.16b // sum += taps[k] * constrain() + mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() + mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, @@ -418,8 +416,11 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon ld1 {v0.s}[3], [x14] // px .endif - movi v1.8h, #0 // sum - movi v2.8h, #0 // sum + // We need 9-bits or two 8-bit accululators to fit the sum. + // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. + // Start sum at -1 instead of 0 to help handle rounding later. + movi v1.16b, #255 // sum + movi v2.16b, #0 // sum .if \min mov v3.16b, v0.16b // min mov v4.16b, v0.16b // max @@ -468,16 +469,16 @@ function cdef_filter\w\suffix\()_edged_8bpc_neon .endif b.ne 2b - sshr v5.8h, v1.8h, #15 // -(sum < 0) - sshr v6.8h, v2.8h, #15 // -(sum < 0) - add v1.8h, v1.8h, v5.8h // sum - (sum < 0) - add v2.8h, v2.8h, v6.8h // sum - (sum < 0) - srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 - srshr v2.8h, v2.8h, #4 // (8 + sum - (sum < 0)) >> 4 - uaddw v1.8h, v1.8h, v0.8b // px + (8 + sum ...) >> 4 - uaddw2 v2.8h, v2.8h, v0.16b // px + (8 + sum ...) >> 4 - sqxtun v0.8b, v1.8h - sqxtun2 v0.16b, v2.8h + // Perform halving adds since the value won't fit otherwise. + // To handle the offset for negative values, use both halving w/ and w/o rounding. 
+ srhadd v5.16b, v1.16b, v2.16b // sum >> 1 + shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 + sshr v1.16b, v5.16b, #7 // sum < 0 + bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 + + srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 + + usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4 .if \min umin v0.16b, v0.16b, v4.16b umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) From f4a6e77e2f7c8c93c47dcd1636b033191ef2af03 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Thu, 4 Feb 2021 18:23:21 -0500 Subject: [PATCH 114/155] arm64: loopfilter16: Remove extra immediate move --- src/arm/64/loopfilter16.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/arm/64/loopfilter16.S b/src/arm/64/loopfilter16.S index e703020555..8c9f98b135 100644 --- a/src/arm/64/loopfilter16.S +++ b/src/arm/64/loopfilter16.S @@ -150,10 +150,9 @@ function lpf_8_wd\wd\()_neon movi v6.8h, #4 add v2.8h, v2.8h, v4.8h smin v2.8h, v2.8h, v3.8h // f = iclip_diff() - movi v7.8h, #3 smax v2.8h, v2.8h, v9.8h // f = iclip_diff() sqadd v4.8h, v6.8h, v2.8h // f + 4 - sqadd v5.8h, v7.8h, v2.8h // f + 3 + sqadd v5.8h, v5.8h, v2.8h // f + 3 smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) sshr v4.8h, v4.8h, #3 // f1 From 5d74afc08baadccbf442460ec0be85b233516071 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Thu, 4 Feb 2021 18:52:24 -0500 Subject: [PATCH 115/155] arm64: loopfilter: Avoid leaving 8-bits Avoid moving between 8 and 16-bit vectors where possible. --- src/arm/64/loopfilter.S | 49 +++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/src/arm/64/loopfilter.S b/src/arm/64/loopfilter.S index b26d954555..1d426f1692 100644 --- a/src/arm/64/loopfilter.S +++ b/src/arm/64/loopfilter.S @@ -132,12 +132,11 @@ function lpf_16_wd\wd\()_neon .endif b.eq 1f // skip wd == 4 case .endif - - usubl v2.8h, v22.8b, v25.8b // p1 - q1 - usubl2 v3.8h, v22.16b, v25.16b + movi v3.16b, #128 + eor v2.16b, v22.16b, v3.16b // p1 - 128 + eor v3.16b, v25.16b, v3.16b // q1 - 128 cmhi v0.16b, v0.16b, v12.16b // hev - sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1) - sqxtn2 v2.16b, v3.8h + sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) usubl v2.8h, v24.8b, v23.8b @@ -155,35 +154,23 @@ function lpf_16_wd\wd\()_neon sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) sshr v4.16b, v4.16b, #3 // f1 sshr v5.16b, v5.16b, #3 // f2 - uxtl v2.8h, v23.8b // p0 - uxtl2 v3.8h, v23.16b - uxtl v6.8h, v24.8b // q0 - uxtl2 v7.8h, v24.16b - saddw v2.8h, v2.8h, v5.8b - saddw2 v3.8h, v3.8h, v5.16b - ssubw v6.8h, v6.8h, v4.8b - ssubw2 v7.8h, v7.8h, v4.16b + mov v2.16b, v23.16b // p0 + mov v3.16b, v24.16b // q0 + neg v6.16b, v4.16b // -f1 srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 - sqxtun v2.8b, v2.8h // out p0 - sqxtun2 v2.16b, v3.8h - sqxtun v6.8b, v6.8h // out q0 - sqxtun2 v6.16b, v7.8h + // p0 + f2, q0 - f1 + usqadd v2.16b, v5.16b // out p0 + usqadd v3.16b, v6.16b // out q0 + neg v6.16b, v4.16b // -((f1 + 1) >> 1) bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) - uxtl v2.8h, v22.8b // p1 - uxtl2 v3.8h, v22.16b - bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) - uxtl v6.8h, v25.8b // q1 - uxtl2 v7.8h, v25.16b - saddw v2.8h, v2.8h, v4.8b - saddw2 v3.8h, v3.8h, v4.16b - ssubw v6.8h, v6.8h, v4.8b - ssubw2 v7.8h, v7.8h, v4.16b - sqxtun v2.8b, v2.8h // out p1 - sqxtun2 v2.16b, v3.8h - sqxtun v6.8b, 
v6.8h // out q1 - sqxtun2 v6.16b, v7.8h + bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4) + mov v2.16b, v22.16b // p1 + mov v3.16b, v25.16b // q1 + // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1) + usqadd v2.16b, v4.16b // out p1 + usqadd v3.16b, v6.16b // out q1 bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) - bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) + bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 From 8a853fe596db40766151ee8b4a0077bf02f98235 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Thu, 4 Feb 2021 16:26:58 -0500 Subject: [PATCH 116/155] arm64: warped motion: Various optimizations - Reorder loads of filters to benifit in order cores. - Use full 128-bit vectors to transpose 8x8 bytes. zip1 is called in the first stage which will hurt performance on some older big cores. - Rework horz stage for 8 bit mode: * Use smull instead of mul * Replace existing narrow and long instructions * Replace mov after calling with right shift Before: Cortex A55 A53 A72 A73 warp_8x8_8bpc_neon: 1683.2 1860.6 1065.0 1102.6 warp_8x8t_8bpc_neon: 1673.2 1846.4 1057.0 1098.4 warp_8x8_16bpc_neon: 1870.7 2031.7 1147.3 1220.7 warp_8x8t_16bpc_neon: 1848.0 2006.2 1121.6 1188.0 After: warp_8x8_8bpc_neon: 1267.2 1446.2 807.0 871.5 warp_8x8t_8bpc_neon: 1245.4 1422.0 810.2 868.4 warp_8x8_16bpc_neon: 1769.8 1929.3 1132.0 1238.2 warp_8x8t_16bpc_neon: 1747.3 1904.1 1101.5 1207.9 Cortex-A55 Before: warp_8x8_8bpc_neon: 1683.2 warp_8x8t_8bpc_neon: 1673.2 warp_8x8_16bpc_neon: 1870.7 warp_8x8t_16bpc_neon: 1848.0 After: warp_8x8_8bpc_neon: 1267.2 warp_8x8t_8bpc_neon: 1245.4 warp_8x8_16bpc_neon: 1769.8 warp_8x8t_16bpc_neon: 1747.3 --- src/arm/64/mc.S | 107 ++++++++++++++++++++-------------------------- src/arm/64/mc16.S | 12 +----- src/arm/64/util.S | 63 +++++++++++++++------------ 3 files changed, 85 insertions(+), 97 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 32ed6e901a..6ebcc42a6f 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -2916,8 +2916,8 @@ filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 .macro load_filter_row dst, src, inc asr w13, \src, #10 - ldr \dst, [x11, w13, sxtw #3] add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon @@ -2926,57 +2926,44 @@ function warp_filter_horz_neon ld1 {v16.8b, v17.8b}, [x2], x3 load_filter_row d0, w12, w7 - uxtl v16.8h, v16.8b load_filter_row d1, w12, w7 - uxtl v17.8h, v17.8b load_filter_row d2, w12, w7 - sxtl v0.8h, v0.8b load_filter_row d3, w12, w7 - sxtl v1.8h, v1.8b load_filter_row d4, w12, w7 - sxtl v2.8h, v2.8b load_filter_row d5, w12, w7 - sxtl v3.8h, v3.8b load_filter_row d6, w12, w7 - sxtl v4.8h, v4.8b + // subtract by 128 to allow using smull + eor v16.8b, v16.8b, v22.8b + eor v17.8b, v17.8b, v22.8b load_filter_row d7, w12, w7 - sxtl v5.8h, v5.8b - ext v18.16b, v16.16b, v17.16b, #2*1 - mul v23.8h, v16.8h, v0.8h - sxtl v6.8h, v6.8b - ext v19.16b, v16.16b, v17.16b, #2*2 - mul v18.8h, v18.8h, v1.8h - sxtl v7.8h, v7.8b - ext v20.16b, v16.16b, v17.16b, #2*3 - mul v19.8h, v19.8h, v2.8h - ext v21.16b, v16.16b, v17.16b, #2*4 - saddlp v23.4s, v23.8h - mul v20.8h, v20.8h, v3.8h - ext v22.16b, v16.16b, v17.16b, #2*5 - saddlp v18.4s, v18.8h - mul v21.8h, v21.8h, v4.8h - saddlp v19.4s, v19.8h - mul v22.8h, v22.8h, v5.8h - saddlp v20.4s, v20.8h - saddlp v21.4s, v21.8h - saddlp v22.4s, v22.8h - addp v18.4s, v23.4s, v18.4s - ext v23.16b, v16.16b, v17.16b, #2*6 - addp v19.4s, v19.4s, v20.4s - mul v23.8h, v23.8h, v6.8h - ext v20.16b, v16.16b, v17.16b, 
#2*7 - mul v20.8h, v20.8h, v7.8h - saddlp v23.4s, v23.8h - addp v21.4s, v21.4s, v22.4s - saddlp v20.4s, v20.8h - addp v20.4s, v23.4s, v20.4s - addp v18.4s, v18.4s, v19.4s - addp v20.4s, v21.4s, v20.4s - add w5, w5, w8 + ext v18.8b, v16.8b, v17.8b, #1 + ext v19.8b, v16.8b, v17.8b, #2 + smull v0.8h, v0.8b, v16.8b + smull v1.8h, v1.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #3 + ext v20.8b, v16.8b, v17.8b, #4 + smull v2.8h, v2.8b, v19.8b + smull v3.8h, v3.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #5 + ext v19.8b, v16.8b, v17.8b, #6 + smull v4.8h, v4.8b, v20.8b + smull v5.8h, v5.8b, v18.8b + ext v18.8b, v16.8b, v17.8b, #7 + smull v6.8h, v6.8b, v19.8b + smull v7.8h, v7.8b, v18.8b + + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + + addp v0.8h, v0.8h, v4.8h - rshrn v16.4h, v18.4s, #3 - rshrn2 v16.8h, v20.4s, #3 + add w5, w5, w8 ret endfunc @@ -3002,25 +2989,32 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 lsl x1, x1, #1 .endif + movi v22.8b, #128 +.ifb \t + movi v23.8h, #128 +.else + movi v23.8h, #8, lsl #8 +.endif + bl warp_filter_horz_neon - mov v24.16b, v16.16b + srshr v24.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v25.16b, v16.16b + srshr v25.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v26.16b, v16.16b + srshr v26.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v27.16b, v16.16b + srshr v27.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v28.16b, v16.16b + srshr v28.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v29.16b, v16.16b + srshr v29.8h, v0.8h, #3 bl warp_filter_horz_neon - mov v30.16b, v16.16b + srshr v30.8h, v0.8h, #3 1: add w14, w6, #512 bl warp_filter_horz_neon - mov v31.16b, v16.16b + srshr v31.8h, v0.8h, #3 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 @@ -3030,15 +3024,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 - transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 - sxtl v0.8h, v0.8b - sxtl v1.8h, v1.8b - sxtl v2.8h, v2.8b - sxtl v3.8h, v3.8b - sxtl v4.8h, v4.8b - sxtl v5.8h, v5.8b - sxtl v6.8h, v6.8b - sxtl v7.8h, v7.8b + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. @@ -3066,6 +3052,7 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 sqrshrn2 v16.8h, v17.4s, #\shift mov v27.16b, v28.16b mov v28.16b, v29.16b + add v16.8h, v16.8h, v23.8h .ifb \t sqxtun v16.8b, v16.8h .endif diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S index 7a8460e404..cec82a3e04 100644 --- a/src/arm/64/mc16.S +++ b/src/arm/64/mc16.S @@ -3188,8 +3188,8 @@ filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 .macro load_filter_row dst, src, inc asr w13, \src, #10 - ldr \dst, [x11, w13, sxtw #3] add \src, \src, \inc + ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon @@ -3343,15 +3343,7 @@ function warp_affine_8x8\t\()_16bpc_neon, export=1 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 - transpose_8x8b v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 - sxtl v0.8h, v0.8b - sxtl v1.8h, v1.8b - sxtl v2.8h, v2.8b - sxtl v3.8h, v3.8b - sxtl v4.8h, v4.8b - sxtl v5.8h, v5.8b - sxtl v6.8h, v6.8b - sxtl v7.8h, v7.8b + transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
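
The heart of the reworked horizontal stage above is the "subtract by 128" trick: XORing each unsigned pixel with 128 moves it into the signed 8-bit range, so the per-tap products fit the 8x8 -> 16 bit smull/smull2 forms instead of needing uxtl widening first. Because every later stage is linear, the bias only ever shows up as a data-independent constant, which is why the single add of v23 after the final narrowing shift (128 for the pixel-output variant, 0x800 for the intermediate variant) is enough to undo it. The standalone C check below is not part of the patch; it is a minimal sketch of that identity, using a hypothetical 8-tap filter whose taps sum to 128, which is what the fixed compensation constants in the patch rely on.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Hypothetical 8-tap filter; its taps sum to 128, as the fixed
         * +128 / +0x800 compensation in the patch assumes. */
        const int taps[8] = { -2, 6, -12, 104, 40, -10, 4, -2 };
        const uint8_t px[8] = { 0, 17, 255, 128, 63, 200, 1, 90 };

        int sum_widened = 0; /* what the old uxtl + mul/mla code computed     */
        int sum_biased = 0;  /* what the new eor-with-128 + smull code computes */
        for (int i = 0; i < 8; i++) {
            sum_widened += taps[i] * px[i];
            sum_biased  += taps[i] * (int8_t)(px[i] - 128);
        }

        /* The bias contributes exactly -128 * sum(taps) = -16384, a constant
         * independent of the pixel data, so it survives the later linear
         * stages and can be added back once at the very end. */
        assert(sum_biased == sum_widened - 128 * 128);
        printf("widened %d, biased %d\n", sum_widened, sum_biased);
        return 0;
    }

The arm32 version of this change later in the series restores the same constants after its vertical pass, for the same reason.
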
diff --git a/src/arm/64/util.S b/src/arm/64/util.S index fc0e0d04f1..6a65523d80 100644 --- a/src/arm/64/util.S +++ b/src/arm/64/util.S @@ -59,33 +59,42 @@ #endif .endm -.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 - trn1 \t8\().8b, \r0\().8b, \r1\().8b - trn2 \t9\().8b, \r0\().8b, \r1\().8b - trn1 \r1\().8b, \r2\().8b, \r3\().8b - trn2 \r3\().8b, \r2\().8b, \r3\().8b - trn1 \r0\().8b, \r4\().8b, \r5\().8b - trn2 \r5\().8b, \r4\().8b, \r5\().8b - trn1 \r2\().8b, \r6\().8b, \r7\().8b - trn2 \r7\().8b, \r6\().8b, \r7\().8b - - trn1 \r4\().4h, \r0\().4h, \r2\().4h - trn2 \r2\().4h, \r0\().4h, \r2\().4h - trn1 \r6\().4h, \r5\().4h, \r7\().4h - trn2 \r7\().4h, \r5\().4h, \r7\().4h - trn1 \r5\().4h, \t9\().4h, \r3\().4h - trn2 \t9\().4h, \t9\().4h, \r3\().4h - trn1 \r3\().4h, \t8\().4h, \r1\().4h - trn2 \t8\().4h, \t8\().4h, \r1\().4h - - trn1 \r0\().2s, \r3\().2s, \r4\().2s - trn2 \r4\().2s, \r3\().2s, \r4\().2s - trn1 \r1\().2s, \r5\().2s, \r6\().2s - trn2 \r5\().2s, \r5\().2s, \r6\().2s - trn2 \r6\().2s, \t8\().2s, \r2\().2s - trn1 \r2\().2s, \t8\().2s, \r2\().2s - trn1 \r3\().2s, \t9\().2s, \r7\().2s - trn2 \r7\().2s, \t9\().2s, \r7\().2s +.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl + // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 + zip1 \r0\().16b, \r0\().16b, \r1\().16b + // c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7 + zip1 \r2\().16b, \r2\().16b, \r3\().16b + // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 + zip1 \r4\().16b, \r4\().16b, \r5\().16b + // g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7 + zip1 \r6\().16b, \r6\().16b, \r7\().16b + + // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 + trn1 \r1\().8h, \r0\().8h, \r2\().8h + // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 + trn2 \r3\().8h, \r0\().8h, \r2\().8h + // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 + trn1 \r5\().8h, \r4\().8h, \r6\().8h + // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 + trn2 \r7\().8h, \r4\().8h, \r6\().8h + + // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 + trn1 \r0\().4s, \r1\().4s, \r5\().4s + // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 + trn2 \r2\().4s, \r1\().4s, \r5\().4s + // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 + trn1 \r1\().4s, \r3\().4s, \r7\().4s + // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 + trn2 \r3\().4s, \r3\().4s, \r7\().4s + + \xtl\()2 \r4\().8h, \r0\().16b + \xtl \r0\().8h, \r0\().8b + \xtl\()2 \r6\().8h, \r2\().16b + \xtl \r2\().8h, \r2\().8b + \xtl\()2 \r5\().8h, \r1\().16b + \xtl \r1\().8h, \r1\().8b + \xtl\()2 \r7\().8h, \r3\().16b + \xtl \r3\().8h, \r3\().8b .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 From f3e8cefde156bed72d0cab9f75b2dd830dfd1dbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 5 Feb 2021 10:56:47 +0200 Subject: [PATCH 117/155] arm32: loopfilter16: Remove an extra immediate move --- src/arm/32/loopfilter16.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/arm/32/loopfilter16.S b/src/arm/32/loopfilter16.S index e673075068..d7daf21f1a 100644 --- a/src/arm/32/loopfilter16.S +++ b/src/arm/32/loopfilter16.S @@ -141,13 +141,12 @@ function lpf_4_wd\wd\()_neon vmov.i16 d6, #3 vbic d0, d1, d0 // (fm && wd >= 4 && !hev) vmul.i16 d2, d2, d6 - vmov.i16 d6, #4 + vmov.i16 d7, #4 vadd.i16 d2, d2, d4 vmin.s16 d2, d2, d3 // f = iclip_diff() - vmov.i16 d7, #3 vmax.s16 d2, d2, d9 // f = iclip_diff() - vqadd.s16 d4, d6, d2 // f + 4 - vqadd.s16 d5, d7, d2 // f + 3 + vqadd.s16 d4, d7, d2 // f + 4 + vqadd.s16 d5, d6, d2 // 
f + 3 vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) vshr.s16 d4, d4, #3 // f1 From f8dba4bb9963d5d03288c956cdb859c7d54b0e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 2 Feb 2021 14:38:06 +0200 Subject: [PATCH 118/155] arm: Move the sub_sp and sub_sp_aligned macros to the shared util file The arm32 version is less generic and has a bit more caveats, but still belongs as a shared utility in a header. --- src/arm/32/itx.S | 47 ------------------------------------------- src/arm/32/util.S | 50 ++++++++++++++++++++++++++++++++++++++++++++++ src/arm/64/itx.S | 23 --------------------- src/arm/64/itx16.S | 23 --------------------- src/arm/64/util.S | 23 +++++++++++++++++++++ 5 files changed, 73 insertions(+), 93 deletions(-) diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S index 867eb194df..1e4938c74f 100644 --- a/src/arm/32/itx.S +++ b/src/arm/32/itx.S @@ -1480,53 +1480,6 @@ function inv_txfm_add_vert_4x16_neon pop {pc} endfunc -.macro sub_sp_align space -#if CONFIG_THUMB - mov r7, sp - and r7, r7, #15 -#else - and r7, sp, #15 -#endif - sub sp, sp, r7 - // Now the stack is aligned, store the amount of adjustment back - // on the stack, as we don't want to waste a register as frame - // pointer. - str r7, [sp, #-16]! -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. - .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub r7, sp, #4096 - ldr r12, [r7] - sub r7, r7, #(\space - 4096) - mov sp, r7 -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - -.macro add_sp_align space -.if \space >= 4096 - add sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - add sp, sp, #(\space)%4096 -.endif - ldr r7, [sp], #16 - // Add back the original stack adjustment - add sp, sp, r7 -.endm - function inv_txfm_add_16x16_neon sub_sp_align 512 ldrh r11, [r10], #2 diff --git a/src/arm/32/util.S b/src/arm/32/util.S index 6af0158e09..065749a7b2 100644 --- a/src/arm/32/util.S +++ b/src/arm/32/util.S @@ -69,6 +69,56 @@ #endif .endm +// This macro clobbers r7 (and r12 on windows) and stores data at the +// bottom of the stack; sp+16 is the start of the space allocated that +// the caller can use. +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + .macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \q0, \q2 vtrn.32 \q1, \q3 diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index 245af0e786..a83b0fd663 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -3002,29 +3002,6 @@ function inv_txfm_add_vert_dct_8x64_neon br x14 endfunc -.macro sub_sp space -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. - .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub x16, sp, #4096 - ldr xzr, [x16] - sub sp, x16, #(\space - 4096) -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index 266f57e36e..2cdb583f66 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -3240,29 +3240,6 @@ function inv_txfm_add_vert_dct_8x64_neon br x14 endfunc -.macro sub_sp space -#ifdef _WIN32 -.if \space > 8192 - // Here, we'd need to touch two (or more) pages while decrementing - // the stack pointer. - .error "sub_sp_align doesn't support values over 8K at the moment" -.elseif \space > 4096 - sub x16, sp, #4096 - ldr xzr, [x16] - sub sp, x16, #(\space - 4096) -.else - sub sp, sp, #\space -.endif -#else -.if \space >= 4096 - sub sp, sp, #(\space)/4096*4096 -.endif -.if (\space % 4096) != 0 - sub sp, sp, #(\space)%4096 -.endif -#endif -.endm - function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 diff --git a/src/arm/64/util.S b/src/arm/64/util.S index 6a65523d80..9013fd4b1e 100644 --- a/src/arm/64/util.S +++ b/src/arm/64/util.S @@ -59,6 +59,29 @@ #endif .endm +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + .macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 zip1 \r0\().16b, \r0\().16b, \r1\().16b From 2cf4b8e0bf79390fccff60c6ae065d6321064d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 8 Feb 2021 13:53:16 +0200 Subject: [PATCH 119/155] arm32: mc: Optimize warp by doing horz filtering in 8 bit Additionally reschedule instructions for loading, to reduce stalls on in order cores. This applies the changes from a3b8157edc3b8a055190ae33497666dec2df81d4 on the arm32 version. 
Before: Cortex A7 A8 A9 A53 A72 A73 warp_8x8_8bpc_neon: 3659.3 1746.0 1931.9 2128.8 1173.7 1188.9 warp_8x8t_8bpc_neon: 3650.8 1724.6 1919.8 2105.0 1147.7 1206.9 warp_8x8_16bpc_neon: 4039.4 2111.9 2337.1 2462.5 1334.6 1396.5 warp_8x8t_16bpc_neon: 3973.9 2137.1 2299.6 2413.2 1282.8 1369.6 After: warp_8x8_8bpc_neon: 2920.8 1269.8 1410.3 1767.3 860.2 1004.8 warp_8x8t_8bpc_neon: 2904.9 1283.9 1397.5 1743.7 863.6 1024.7 warp_8x8_16bpc_neon: 3895.5 2060.7 2339.8 2376.6 1331.1 1394.0 warp_8x8t_16bpc_neon: 3822.7 2026.7 2298.7 2325.4 1278.1 1360.8 --- src/arm/32/mc.S | 118 +++++++++++++++++++++------------------------- src/arm/32/mc16.S | 2 +- 2 files changed, 56 insertions(+), 64 deletions(-) diff --git a/src/arm/32/mc.S b/src/arm/32/mc.S index d4a90b99c4..223b31279c 100644 --- a/src/arm/32/mc.S +++ b/src/arm/32/mc.S @@ -2966,8 +2966,8 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 .endm .macro load_filter_coef dst, src, inc - vld1.8 {\dst}, [r12, :64] add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] .endm .macro load_filter_row dst, src, inc @@ -2978,71 +2978,56 @@ filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 function warp_filter_horz_neon load_filter_ptr r5 // filter 0 vld1.16 {q7}, [r2], r3 + vmov.i8 q6, #128 load_filter_coef d0, r5, r7 // filter 0 - vmovl.u8 q6, d14 // original pixels - load_filter_row d2, r5, r7 // filter 1 - vmovl.u8 q7, d15 // original pixels - load_filter_row d4, r5, r7 // filter 2 - vmovl.s8 q0, d0 // filter 0 - vext.8 q3, q6, q7, #2*1 // filter 1 pixels + load_filter_row d1, r5, r7 // filter 1 + load_filter_row d2, r5, r7 // filter 2 load_filter_ptr r5 // filter 3 - vmovl.s8 q1, d2 // filter 1 - vmul.i16 q5, q6, q0 // filter 0 output - load_filter_coef d0, r5, r7 // filter 3 - vmovl.s8 q2, d4 // filter 2 + veor q7, q7, q6 // subtract by 128 to allow using vmull + load_filter_coef d3, r5, r7 // filter 3 + vext.8 d12, d14, d15, #1 // filter 1 pixels + vext.8 d13, d14, d15, #2 // filter 2 pixels load_filter_ptr r5 // filter 4 - vext.8 q4, q6, q7, #2*2 // filter 2 pixels - vmul.i16 q3, q3, q1 // filter 1 output - load_filter_coef d2, r5, r7 // filter 4 - vmul.i16 q4, q4, q2 // filter 2 output - vext.8 q2, q6, q7, #2*3 // filter 3 pixels - vmovl.s8 q0, d0 // filter 3 - vpaddl.s16 q5, q5 // pixel 0 (4x32) - vpaddl.s16 q3, q3 // pixel 1 (4x32) - vmul.i16 q0, q2, q0 // filter 3 output + vmull.s8 q2, d14, d0 // filter 0 output + vmull.s8 q3, d12, d1 // filter 1 output + load_filter_coef d0, r5, r7 // filter 4 load_filter_ptr r5 // filter 5 - vext.8 q2, q6, q7, #2*4 // filter 4 pixels - vmovl.s8 q1, d2 // filter 4 - vpaddl.s16 q4, q4 // pixel 2 (4x32) - vpadd.s32 d10, d10, d11 // pixel 0 (2x32) - vpadd.s32 d11, d6, d7 // pixel 1 (2x32) - load_filter_coef d6, r5, r7 // filter 5 - vmul.i16 q1, q2, q1 // filter 4 output - vpadd.s32 d8, d8, d9 // pixel 2 (2x32) + vext.8 d12, d14, d15, #3 // filter 3 pixels + vmull.s8 q4, d13, d2 // filter 2 output + vext.8 d13, d14, d15, #4 // filter 4 pixels + vpadd.i16 d4, d4, d5 // pixel 0 (4x16) + vpadd.i16 d5, d6, d7 // pixel 1 (4x16) + load_filter_coef d1, r5, r7 // filter 5 load_filter_ptr r5 // filter 6 - vpaddl.s16 q0, q0 // pixel 3 (4x32) - vpadd.s32 d10, d10, d11 // pixel 0,1 - vext.8 q2, q6, q7, #2*5 // filter 5 pixels - vmovl.s8 q3, d6 // filter 5 - vpaddl.s16 q1, q1 // pixel 4 (4x32) - vpadd.s32 d9, d0, d1 // pixel 3 (2x32) + vmull.s8 q5, d12, d3 // filter 3 output + vext.8 d12, d14, d15, #5 // filter 5 pixels + vmull.s8 q3, d13, d0 // filter 4 output load_filter_coef d0, r5, r7 // filter 6 - vmul.i16 q2, q2, 
q3 // filter 5 output - vpadd.s32 d11, d8, d9 // pixel 2,3 + vext.8 d13, d14, d15, #6 // filter 6 pixels load_filter_ptr r5 // filter 7 - vpaddl.s16 q2, q2 // pixel 5 (4x32) - vpadd.s32 d8, d2, d3 // pixel 4 (2x32) - vext.8 q3, q6, q7, #2*6 // filter 6 pixels - vmovl.s8 q0, d0 // filter 6 - vpadd.s32 d9, d4, d5 // pixel 5 (2x32) - load_filter_coef d4, r5, r7 // filter 7 - vpadd.s32 d8, d8, d9 // pixel 4,5 - vext.8 q1, q6, q7, #2*7 // filter 7 pixels - vmovl.s8 q2, d4 // filter 7 - vmul.i16 q3, q3, q0 // filter 6 output - vmul.i16 q1, q1, q2 // filter 7 output + vpadd.i16 d8, d8, d9 // pixel 2 (4x16) + vpadd.i16 d9, d10, d11 // pixel 3 (4x16) + vmull.s8 q5, d12, d1 // filter 5 output + load_filter_coef d1, r5, r7 // filter 7 + vext.8 d14, d14, d15, #7 // filter 7 pixels + vpadd.i16 d6, d6, d7 // pixel 4 (4x16) + vpadd.i16 d10, d10, d11 // pixel 5 (4x16) + vmull.s8 q6, d13, d0 // filter 6 output + vmull.s8 q7, d14, d1 // filter 7 output + sub r5, r5, r7, lsl #3 - vpaddl.s16 q3, q3 // pixel 6 (4x32) - vpaddl.s16 q1, q1 // pixel 7 (4x32) - vpadd.s32 d6, d6, d7 // pixel 6 (2x32) - vpadd.s32 d2, d2, d3 // pixel 7 (2x32) - vpadd.s32 d9, d6, d2 // pixel 6,7 - add r5, r5, r8 + vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16) + vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16) + vpadd.i16 d12, d12, d13 // pixel 6 (4x16) + vpadd.i16 d14, d14, d15 // pixel 7 (4x16) + vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16) + vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16) + vpadd.i16 d4, d4, d5 // pixel 0-3 + vpadd.i16 d5, d6, d10 // pixel 4-7 - vrshrn.s32 d10, q5, #3 - vrshrn.s32 d11, q4, #3 + add r5, r5, r8 bx lr endfunc @@ -3074,23 +3059,23 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 add r6, r6, #512 bl warp_filter_horz_neon - vmov q8, q5 + vrshr.s16 q8, q2, #3 bl warp_filter_horz_neon - vmov q9, q5 + vrshr.s16 q9, q2, #3 bl warp_filter_horz_neon - vmov q10, q5 + vrshr.s16 q10, q2, #3 bl warp_filter_horz_neon - vmov q11, q5 + vrshr.s16 q11, q2, #3 bl warp_filter_horz_neon - vmov q12, q5 + vrshr.s16 q12, q2, #3 bl warp_filter_horz_neon - vmov q13, q5 + vrshr.s16 q13, q2, #3 bl warp_filter_horz_neon - vmov q14, q5 + vrshr.s16 q14, q2, #3 1: bl warp_filter_horz_neon - vmov q15, q5 + vrshr.s16 q15, q2, #3 load_filter_row d8, r6, r9 load_filter_row d9, r6, r9 @@ -3133,12 +3118,19 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1 vmlal.s16 q1, d29, d5 vmlal.s16 q1, d31, d7 +.ifb \t + vmov.i16 q7, #128 +.else + vmov.i16 q7, #0x800 +.endif + vmov q8, q9 vmov q9, q10 vqrshrn.s32 d0, q0, #\shift vmov q10, q11 vqrshrn.s32 d1, q1, #\shift vmov q11, q12 + vadd.i16 q0, q0, q7 vmov q12, q13 .ifb \t vqmovun.s16 d0, q0 diff --git a/src/arm/32/mc16.S b/src/arm/32/mc16.S index eb7b3b549e..8d4f03263e 100644 --- a/src/arm/32/mc16.S +++ b/src/arm/32/mc16.S @@ -3154,8 +3154,8 @@ filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 .endm .macro load_filter_coef dst, src, inc - vld1.8 {\dst}, [r12, :64] add \src, \src, \inc + vld1.8 {\dst}, [r12, :64] .endm .macro load_filter_row dst, src, inc From e90785c7e55255f891cbf0a4a35a612f39e3e288 Mon Sep 17 00:00:00 2001 From: Kyle Siefring Date: Mon, 8 Feb 2021 18:32:26 -0500 Subject: [PATCH 120/155] arm64: mc: Improve first tap for inorder cores Change order of multiply accumulates to allow inorder cores to forward the results. 
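
The change is purely one of instruction scheduling: the filtering of the first input row is factored out into a shared L(\type\()_8tap_filter_8_first) block whose ext and mla instructions are regrouped so that consecutive multiply-accumulates into the same accumulator follow each other directly, letting in-order cores use their accumulator forwarding path; the arithmetic is identical to the inline sequence it replaces. As a reference for what that block computes, here is a scalar sketch in C (not the project's reference code; the tap values are placeholders): an 8-tap horizontal filter over 8-bit samples, kept in 16 bits with the rounding shift by 2 that the block ends with.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar sketch of the first-row horizontal filter: 8 taps applied at
     * each of 8 output positions, then a rounding shift by 2, mirroring the
     * mul/mla chain that ends in "srshr v16.8h, v16.8h, #2". */
    static void filter8_first_row(const uint8_t *src, const int16_t coef[8],
                                  int16_t *dst, int w) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int i = 0; i < 8; i++)
                sum += coef[i] * src[x + i]; /* v0.h[i] times the i-th ext'd vector */
            dst[x] = (int16_t)((sum + 2) >> 2); /* rounding shift, as srshr #2 */
        }
    }

    int main(void) {
        /* Placeholder taps; the real subpel filters are selected at run time. */
        const int16_t coef[8] = { -1, 3, -10, 70, 70, -10, 3, -1 };
        const uint8_t src[15] = { 1, 2, 3, 4, 5, 6, 7, 8,
                                  9, 10, 11, 12, 13, 14, 15 };
        int16_t out[8];
        filter8_first_row(src, coef, out, 8);
        for (int x = 0; x < 8; x++)
            printf("%d ", out[x]);
        printf("\n");
        return 0;
    }

Since the same sequence had previously been inlined in two places, sharing it through a bl to the new block also keeps the reordering from growing the code.
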
--- src/arm/64/mc.S | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 6ebcc42a6f..ef7f23b226 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -2180,16 +2180,7 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - ld1 {v28.8b, v29.8b}, [\src], \s_strd - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b - mul v24.8h, v28.8h, v0.h[0] -.irpc i, 1234567 - ext v26.16b, v28.16b, v29.16b, #(2*\i) - mla v24.8h, v26.8h, v0.h[\i] -.endr - srshr v16.8h, v24.8h, #2 - + bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b @@ -2267,16 +2258,7 @@ L(\type\()_8tap_filter_4): lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 - ld1 {v28.8b, v29.8b}, [\src], \s_strd - uxtl v28.8h, v28.8b - uxtl v29.8h, v29.8b - mul v24.8h, v28.8h, v0.h[0] -.irpc i, 1234567 - ext v26.16b, v28.16b, v29.16b, #(2*\i) - mla v24.8h, v26.8h, v0.h[\i] -.endr - srshr v16.8h, v24.8h, #2 - + bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b @@ -2363,6 +2345,28 @@ L(\type\()_8tap_filter_4): 0: br x15 +L(\type\()_8tap_filter_8_first): + ld1 {v28.8b, v29.8b}, [\src], \s_strd + uxtl v28.8h, v28.8b + uxtl v29.8h, v29.8b + mul v16.8h, v28.8h, v0.h[0] + ext v24.16b, v28.16b, v29.16b, #(2*1) + ext v25.16b, v28.16b, v29.16b, #(2*2) + ext v26.16b, v28.16b, v29.16b, #(2*3) + ext v27.16b, v28.16b, v29.16b, #(2*4) + mla v16.8h, v24.8h, v0.h[1] + mla v16.8h, v25.8h, v0.h[2] + mla v16.8h, v26.8h, v0.h[3] + mla v16.8h, v27.8h, v0.h[4] + ext v24.16b, v28.16b, v29.16b, #(2*5) + ext v25.16b, v28.16b, v29.16b, #(2*6) + ext v26.16b, v28.16b, v29.16b, #(2*7) + mla v16.8h, v24.8h, v0.h[5] + mla v16.8h, v25.8h, v0.h[6] + mla v16.8h, v26.8h, v0.h[7] + srshr v16.8h, v16.8h, #2 + ret + L(\type\()_8tap_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd From 471ddfe6e16f1eba2615ba0f2e72ac510f11ea99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 28 Jan 2021 13:16:33 +0200 Subject: [PATCH 121/155] arm64: looprestoration: Rewrite the wiener functions Make them operate in a more cache friendly manner, interleaving horizontal and vertical filtering (reducing the amount of stack used from 51 KB to 4 KB), similar to what was done for x86 in 78d27b7d1c923f632bc266470436e7f46a940d70. This also adds separate 5tap versions of the filters and unrolls the vertical filter a bit more (which maybe could have been done without doing the rewrite). This does, however, increase the compiled code size by around 3.5 KB. 
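
The shape of the rewrite is easier to see in scalar form. The sketch below is illustrative only (made-up names, simplified rounding; it ignores the lpf rows, the LR_HAVE_* edge flags, right-edge padding and the separate 5-tap path): each output row triggers one horizontal pass into a small ring of seven 384-entry rows, the same rows the t0..t6 pointers rotate through in the assembly, and the 7-tap vertical filter reads only from that ring, so the intermediate storage stays a few kilobytes regardless of the stripe height.

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_W 384   /* matches the 384-entry row stride of the scratch */
    #define VTAPS 7

    static int clampi(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    /* Horizontal pass for one row into 16-bit intermediates (schematic: the
     * real code adds bias terms to keep values in range and handles the
     * left/right edges according to the LR flags). */
    static void hfilter_row(int16_t *dst, const uint8_t *src, int w,
                            const int16_t fh[7]) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int t = 0; t < 7; t++)
                sum += fh[t] * src[clampi(x + t - 3, 0, w - 1)];
            dst[x] = (int16_t)((sum + 4) >> 3);
        }
    }

    /* Interleaved wiener loop: only VTAPS horizontally filtered rows are live
     * at any time, rotating through a small ring, instead of a full w x h
     * intermediate buffer. */
    static void wiener_sketch(uint8_t *dst, const uint8_t *src, int stride,
                              int w, int h, const int16_t fh[7],
                              const int16_t fv[7]) {
        int16_t ring[VTAPS][MAX_W];

        /* Prime the window with rows -3..3, replicating at the top edge. */
        for (int r = 0; r < VTAPS; r++)
            hfilter_row(ring[r], src + clampi(r - 3, 0, h - 1) * stride, w, fh);

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int sum = 0;
                for (int t = 0; t < VTAPS; t++)          /* rows y-3 .. y+3 */
                    sum += fv[t] * ring[(y + t) % VTAPS][x];
                dst[y * stride + x] = (uint8_t)clampi((sum + 1024) >> 11, 0, 255);
            }
            /* The row that just left the window (y-3) is overwritten with the
             * next one needed (y+4): the combined hv step of the rewrite. */
            hfilter_row(ring[y % VTAPS], src + clampi(y + 4, 0, h - 1) * stride,
                        w, fh);
        }
    }

    int main(void) {
        enum { W = 16, H = 8 };
        /* Placeholder symmetric filters; both sum to 128. */
        static const int16_t fh[7] = { 1, -5, 19, 98, 19, -5, 1 };
        static const int16_t fv[7] = { 1, -5, 19, 98, 19, -5, 1 };
        static uint8_t src[H * MAX_W], dst[H * MAX_W];
        for (int i = 0; i < H * MAX_W; i++) src[i] = (uint8_t)(i * 7);
        wiener_sketch(dst, src, MAX_W, W, H, fh, fv);
        printf("%d\n", dst[3 * MAX_W + 3]);
        return 0;
    }

The hv helpers in the patch implement this pairing directly, producing one horizontally filtered row into t0 while filtering t6..t0 vertically, then rotating the row pointers.
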
Before: Cortex A53 A72 A73 wiener_5tap_8bpc_neon: 136855.6 91446.2 87363.6 wiener_7tap_8bpc_neon: 136861.6 91454.9 87374.5 wiener_5tap_10bpc_neon: 167685.3 114720.3 116522.1 wiener_5tap_12bpc_neon: 167677.5 114724.7 116511.9 wiener_7tap_10bpc_neon: 167681.6 114738.5 116567.0 wiener_7tap_12bpc_neon: 167673.8 114720.8 116515.4 After: wiener_5tap_8bpc_neon: 87102.1 60460.6 66803.8 wiener_7tap_8bpc_neon: 110831.7 78489.0 82015.9 wiener_5tap_10bpc_neon: 109999.2 90259.0 89238.0 wiener_5tap_12bpc_neon: 109978.3 90255.7 89220.7 wiener_7tap_10bpc_neon: 137877.6 107578.5 103435.6 wiener_7tap_12bpc_neon: 137868.8 107568.9 103390.4 --- src/arm/64/looprestoration.S | 1140 +++++++++++++++++++++-------- src/arm/64/looprestoration16.S | 1251 ++++++++++++++++++++++++-------- 2 files changed, 1805 insertions(+), 586 deletions(-) diff --git a/src/arm/64/looprestoration.S b/src/arm/64/looprestoration.S index a72c8b9553..35e71b8860 100644 --- a/src/arm/64/looprestoration.S +++ b/src/arm/64/looprestoration.S @@ -33,124 +33,208 @@ const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst -// void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], -// const pixel *src, ptrdiff_t stride, -// const int16_t fh[8], intptr_t w, -// int h, enum LrEdgeFlags edges); -function wiener_filter_h_8bpc_neon, export=1 - mov w8, w5 - ld1 {v0.8h}, [x4] - mov w9, #(1 << 14) - (1 << 2) - dup v30.8h, w9 +// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter7_8bpc_neon, export=1 + ldr w8, [sp] + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 movi v31.8h, #8, lsl #8 - // Calculate mid_stride - add w10, w5, #7 - bic w10, w10, #7 - lsl w10, w10, #1 - // Set up pointers for reading/writing alternate rows - add x12, x0, x10 - lsl w10, w10, #1 - add x13, x2, x3 - lsl x3, x3, #1 + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + mov x13, x14 // t2 + subs w6, w6, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_7) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter7_hv_8bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter7_hv_8bpc_neon +L(v1_7): + bl wiener_filter7_v_8bpc_neon + + mov sp, x29 + ldp x29, x30, [sp], #16 + ret + +L(no_top_7): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_8bpc_neon +L(v2_7): + bl wiener_filter7_v_8bpc_neon + b L(v1_7) +endfunc - // Subtract the aligned width from mid_stride - add w11, w5, #7 - bic w11, w11, #7 - sub x10, x10, w11, uxtw #1 - // Subtract the number of pixels read from the source stride - add w11, w11, #8 - sub x3, x3, w11, uxtw +function wiener_filter7_h_8bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f // LR_HAVE_LEFT - cbnz x1, 0f + cbnz x2, 0f // left == NULL - sub x2, x2, #3 - sub x13, x13, #3 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. 
- // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x3, x3, #3 - - -1: // Loop vertically - ld1 {v3.16b}, [x2], #16 - ld1 {v5.16b}, [x13], #16 + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x1, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.s}[3], [x1], #4 - // Move x2/x13 back to account for the last 3 bytes we loaded earlier, + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. - sub x2, x2, #3 - sub x13, x13, #3 - ld1 {v4.s}[3], [x1], #4 - ext v3.16b, v2.16b, v3.16b, #13 - ext v5.16b, v4.16b, v5.16b, #13 + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 b 2f -0: + +1: + ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 3x the first byte at the front. - dup v2.16b, v3.b[0] - dup v4.16b, v5.b[0] - // Move x2 back to account for the last 3 bytes we loaded before, + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. - sub x2, x2, #3 - sub x13, x13, #3 - ext v3.16b, v2.16b, v3.16b, #13 - ext v5.16b, v4.16b, v5.16b, #13 + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 2: - uxtl v2.8h, v3.8b - uxtl2 v3.8h, v3.16b - uxtl v4.8h, v5.8b - uxtl2 v5.8h, v5.16b + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b - tst w7, #2 // LR_HAVE_RIGHT + tst w8, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that byte to pad with - // here since we can find it pretty easily from here. - sub w9, w5, #14 - ldr b28, [x2, w9, sxtw] - ldr b29, [x13, w9, sxtw] - // Fill v28/v29 with the right padding pixel - dup v28.8h, v28.h[0] - dup v29.8h, v29.h[0] + 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 - b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid - // 1 <= w < 11, w+3 pixels valid in v2-v3. For w=9 or w=10, + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. - // Insert padding in v2/3.h[w+3] onwards; fuse the +3 (*2) into the + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x4, right_ext_mask, -6 - sub x4, x4, w5, uxtw #1 - ld1 {v26.16b, v27.16b}, [x4] + movrel x7, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] - bit v2.16b, v28.16b, v26.16b - bit v3.16b, v28.16b, v27.16b - bit v4.16b, v29.16b, v26.16b - bit v5.16b, v29.16b, v27.16b + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance @@ -165,234 +249,724 @@ function wiener_filter_h_8bpc_neon, export=1 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 mul v6.8h, v18.8h, v0.h[3] mla v6.8h, v19.8h, v0.h[4] mla v6.8h, v20.8h, v0.h[5] mla v6.8h, v21.8h, v0.h[6] - ext v23.16b, v4.16b, v5.16b, #4 - ext v25.16b, v4.16b, v5.16b, #8 - ext v22.16b, v4.16b, v5.16b, #2 - ext v26.16b, v4.16b, v5.16b, #10 - ext v27.16b, v4.16b, v5.16b, #12 - ext v24.16b, v4.16b, v5.16b, #6 - add v25.8h, v25.8h, v23.8h - add v26.8h, v26.8h, v22.8h - add v27.8h, v27.8h, v4.8h - mul v7.8h, v24.8h, v0.h[3] - mla v7.8h, v25.8h, v0.h[4] - mla v7.8h, v26.8h, v0.h[5] - mla v7.8h, v27.8h, v0.h[6] - - shl v18.8h, v18.8h, #7 - shl v24.8h, v24.8h, #7 - sub v18.8h, v18.8h, v30.8h - sub v24.8h, v24.8h, v30.8h - sqadd v6.8h, v6.8h, v18.8h - sqadd v7.8h, v7.8h, v24.8h + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h sshr v6.8h, v6.8h, #3 sshr v7.8h, v7.8h, #3 add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h - subs w5, w5, #8 + subs w5, w5, #16 - st1 {v6.8h}, [x0], #16 - st1 {v7.8h}, [x12], #16 + st1 {v6.8h, v7.8h}, [x14], #32 - b.le 9f - tst w7, #2 // LR_HAVE_RIGHT - mov v2.16b, v3.16b - mov v4.16b, v5.16b - ld1 {v3.8b}, [x2], #8 - ld1 {v5.8b}, [x13], #8 - uxtl v3.8h, v3.8b - uxtl v5.8h, v5.8b + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. -9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x10 - add x12, x12, x10 - add x2, x2, x3 - add x13, x13, x3 - mov w5, w8 - b 1b 0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 ret endfunc -// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, -// const int16_t *mid, int w, int h, -// const int16_t fv[8], enum LrEdgeFlags edges, -// ptrdiff_t mid_stride); -function wiener_filter_v_8bpc_neon, export=1 - mov w8, w4 - ld1 {v0.8h}, [x5] - - // Calculate the number of rows to move back when looping vertically - mov w11, w4 - tst w6, #4 // LR_HAVE_TOP - b.eq 0f - sub x2, x2, x7, lsl #1 - add w11, w11, #2 +function wiener_filter7_v_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! 
+ stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x5, [sp, #48] +1: + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + + ld1 {v18.8h, v19.8h}, [x10], #32 + add v24.8h, v24.8h, v20.8h + ld1 {v26.8h, v27.8h}, [x14], #32 + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v28.8h, v26.8h, v18.8h + ld1 {v22.8h, v23.8h}, [x12], #32 + + add v16.8h, v26.8h, v16.8h + add v25.8h, v25.8h, v21.8h + + smull v2.4s, v22.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v28.4h, v1.h[5] + smlal v2.4s, v16.4h, v1.h[6] + add v29.8h, v27.8h, v19.8h + smull2 v3.4s, v22.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v28.8h, v1.h[5] + smlal2 v3.4s, v16.8h, v1.h[6] + add v17.8h, v27.8h, v17.8h + smull v4.4s, v23.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v29.4h, v1.h[5] + smlal v4.4s, v17.4h, v1.h[6] + smull2 v5.4s, v23.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v29.8h, v1.h[5] + smlal2 v5.4s, v17.8h, v1.h[6] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w5, w5, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x5, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x5, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #3 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x3, x3, #3 + ext v3.16b, v2.16b, v3.16b, #13 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. 
+ movrel x7, right_ext_mask, -6 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + shl v22.8h, v18.8h, #7 + mul v6.8h, v18.8h, v0.h[3] + mla v6.8h, v19.8h, v0.h[4] + mla v6.8h, v20.8h, v0.h[5] + mla v6.8h, v21.8h, v0.h[6] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + shl v23.8h, v18.8h, #7 + mul v7.8h, v18.8h, v0.h[3] + mla v7.8h, v19.8h, v0.h[4] + mla v7.8h, v20.8h, v0.h[5] + mla v7.8h, v21.8h, v0.h[6] + + ld1 {v20.8h, v21.8h}, [x11], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v26.8h, v27.8h}, [x13], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v18.8h, v19.8h}, [x10], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v28.8h, v29.8h}, [x14], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + ld1 {v16.8h, v17.8h}, [x9], #32 + add v26.8h, v20.8h, v26.8h + + ld1 {v24.8h, v25.8h}, [x12], #32 + add v28.8h, v18.8h, v28.8h + + add v16.8h, v16.8h, v6.8h + add v27.8h, v21.8h, v27.8h + + smull v18.4s, v24.4h, v1.h[3] + smlal v18.4s, v26.4h, v1.h[4] + smlal v18.4s, v28.4h, v1.h[5] + smlal v18.4s, v16.4h, v1.h[6] + add v29.8h, v19.8h, v29.8h + smull2 v19.4s, v24.8h, v1.h[3] + smlal2 v19.4s, v26.8h, v1.h[4] + smlal2 v19.4s, v28.8h, v1.h[5] + smlal2 v19.4s, v16.8h, v1.h[6] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v25.4h, v1.h[3] + smlal v20.4s, v27.4h, v1.h[4] + smlal v20.4s, v29.4h, v1.h[5] + smlal v20.4s, v17.4h, v1.h[6] + smull2 v21.4s, v25.8h, v1.h[3] + smlal2 v21.4s, v27.8h, v1.h[4] + smlal2 v21.4s, v29.8h, v1.h[5] + smlal2 v21.4s, v17.8h, v1.h[6] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h + subs w5, w5, #16 + + st1 {v18.16b}, [x0], #16 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + 0: - tst w6, #8 // LR_HAVE_BOTTOM + ldp x3, x5, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter5_8bpc_neon, export=1 + ldr w8, [sp] + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + mov w17, #(1 << 14) - (1 << 2) + dup v30.8h, w17 + movi v31.8h, #8, lsl #8 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_5) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter5_hv_8bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter5_hv_8bpc_neon +L(end_5): + + mov sp, x29 + ldp x29, x30, [sp], #16 + ret + +L(no_top_5): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_8bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_8bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_8bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_8bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT b.eq 1f - add w11, w11, #2 + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f -1: // Start of horizontal loop; start one vertical filter slice. - // Load rows into v16-v19 and pad properly. - tst w6, #4 // LR_HAVE_TOP - ld1 {v16.8h}, [x2], x7 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.8h}, [x2], x7 - mov v17.16b, v16.16b - ld1 {v19.8h}, [x2], x7 - b 3f -2: // !LR_HAVE_TOP - mov v17.16b, v16.16b - mov v18.16b, v16.16b - mov v19.16b, v16.16b - -3: - cmp w4, #4 - b.lt 5f - // Start filtering normally; fill in v20-v22 with unique rows. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - ld1 {v22.8h}, [x2], x7 - -4: -.macro filter compare - subs w4, w4, #1 +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f + +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 3x the first byte at the front. 
+ dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. - add v25.8h, v18.8h, v20.8h - add v26.8h, v17.8h, v21.8h - add v27.8h, v16.8h, v22.8h - smull v2.4s, v19.4h, v0.h[3] - smlal v2.4s, v25.4h, v0.h[4] - smlal v2.4s, v26.4h, v0.h[5] - smlal v2.4s, v27.4h, v0.h[6] - smull2 v3.4s, v19.8h, v0.h[3] - smlal2 v3.4s, v25.8h, v0.h[4] - smlal2 v3.4s, v26.8h, v0.h[5] - smlal2 v3.4s, v27.8h, v0.h[6] - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - sqxtun v2.8b, v2.8h - st1 {v2.8b}, [x0], x1 -.if \compare - cmp w4, #4 -.else - b.le 9f -.endif - mov v16.16b, v17.16b - mov v17.16b, v18.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - mov v20.16b, v21.16b - mov v21.16b, v22.16b -.endm - filter 1 - b.lt 7f - ld1 {v22.8h}, [x2], x7 - b 4b - -5: // Less than 4 rows in total; not all of v20-v21 are filled yet. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 6f - // LR_HAVE_BOTTOM - cmp w4, #2 - // We load at least 2 rows in all cases. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - b.gt 53f // 3 rows in total - b.eq 52f // 2 rows in total -51: // 1 row in total, v19 already loaded, load edge into v20-v22. - mov v22.16b, v21.16b - b 8f -52: // 2 rows in total, v19 already loaded, load v20 with content data - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - mov v23.16b, v22.16b - b 8f -53: - // 3 rows in total, v19 already loaded, load v20 and v21 with content - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f - -6: - // !LR_HAVE_BOTTOM - cmp w4, #2 - b.gt 63f // 3 rows in total - b.eq 62f // 2 rows in total -61: // 1 row in total, v19 already loaded, pad that into v20-v22. - mov v20.16b, v19.16b - mov v21.16b, v19.16b - mov v22.16b, v19.16b - b 8f -62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. - ld1 {v20.8h}, [x2], x7 - mov v21.16b, v20.16b - mov v22.16b, v20.16b - mov v23.16b, v20.16b - b 8f -63: - // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - b 8f - -7: - // All registers up to v21 are filled already, 3 valid rows left. - // < 4 valid rows left; fill in padding and filter the last - // few rows. 
- tst w6, #8 // LR_HAVE_BOTTOM - b.eq 71f - // LR_HAVE_BOTTOM; load 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f -71: - // !LR_HAVE_BOTTOM, pad 3 rows - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - -8: // At this point, all registers up to v22-v24 are loaded with - // edge/padding (depending on how many rows are left). - filter 0 // This branches to 9f when done - mov v22.16b, v23.16b - mov v23.16b, v24.16b - b 8b - -9: // End of one vertical slice. - subs w3, w3, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + b.le 0f - // Move pointers back up to the top and loop horizontally. - msub x0, x1, x8, x0 - msub x2, x7, x11, x2 - add x0, x0, #8 - add x2, x2, #16 - mov w4, w8 - b 1b + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. 0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 + ret +endfunc + +function wiener_filter5_v_8bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x5, [sp, #32] +1: + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + ld1 {v16.8h, v17.8h}, [x11], #32 + + add v24.8h, v22.8h, v18.8h + ld1 {v20.8h, v21.8h}, [x13], #32 + add v16.8h, v22.8h, v16.8h + add v25.8h, v23.8h, v19.8h + + smull v2.4s, v20.4h, v1.h[3] + smlal v2.4s, v24.4h, v1.h[4] + smlal v2.4s, v16.4h, v1.h[5] + add v17.8h, v23.8h, v17.8h + smull2 v3.4s, v20.8h, v1.h[3] + smlal2 v3.4s, v24.8h, v1.h[4] + smlal2 v3.4s, v16.8h, v1.h[5] + smull v4.4s, v21.4h, v1.h[3] + smlal v4.4s, v25.4h, v1.h[4] + smlal v4.4s, v17.4h, v1.h[5] + smull2 v5.4s, v21.8h, v1.h[3] + smlal2 v5.4s, v25.8h, v1.h[4] + smlal2 v5.4s, v17.8h, v1.h[5] + sqrshrun v2.4h, v2.4s, #11 + sqrshrun2 v2.8h, v3.4s, #11 + sqrshrun v3.4h, v4.4s, #11 + sqrshrun2 v3.8h, v5.4s, #11 + sqxtun v2.8b, v2.8h + sqxtun2 v2.16b, v3.8h + subs w5, w5, #16 + st1 {v2.16b}, [x0], #16 + b.gt 1b + + ldp x0, x5, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + + ret +endfunc + +function wiener_filter5_hv_8bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! 
+ stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x5, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #2 + ld1 {v3.16b}, [x3], #16 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v3.16b}, [x3], #16 + ld1 {v2.s}[3], [x2], #4 + // Move x3 back to account for the last 2 bytes we loaded earlier, + // which we'll shift out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + b 2f +1: + ld1 {v3.16b}, [x3], #16 + // !LR_HAVE_LEFT, fill v2 with the leftmost byte + // and shift v3 to have 2x the first byte at the front. + dup v2.16b, v3.b[0] + // Move x3 back to account for the last 2 bytes we loaded before, + // which we shifted out. + sub x3, x3, #2 + ext v3.16b, v2.16b, v3.16b, #14 + +2: + ld1 {v4.8b}, [x3], #8 + uxtl v2.8h, v3.8b + uxtl2 v3.8h, v3.16b + uxtl v4.8h, v4.8b + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr b28, [x3, w17, sxtw] + sub x7, x7, w5, uxtw #1 + dup v28.8h, v28.h[0] + ld1 {v25.16b, v26.16b, v27.16b}, [x7] + + bit v2.16b, v28.16b, v25.16b + bit v3.16b, v28.16b, v26.16b + bit v4.16b, v28.16b, v27.16b + +4: // Loop horizontally + + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + shl v22.8h, v17.8h, #7 + mul v6.8h, v17.8h, v0.h[3] + mla v6.8h, v18.8h, v0.h[4] + mla v6.8h, v19.8h, v0.h[5] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + shl v23.8h, v17.8h, #7 + mul v7.8h, v17.8h, v0.h[3] + mla v7.8h, v18.8h, v0.h[4] + mla v7.8h, v19.8h, v0.h[5] + + ld1 {v18.8h, v19.8h}, [x12], #32 + + sub v22.8h, v22.8h, v30.8h + sub v23.8h, v23.8h, v30.8h + ld1 {v24.8h, v25.8h}, [x14], #32 + sqadd v6.8h, v6.8h, v22.8h + sqadd v7.8h, v7.8h, v23.8h + ld1 {v16.8h, v17.8h}, [x11], #32 + sshr v6.8h, v6.8h, #3 + sshr v7.8h, v7.8h, #3 + ld1 {v20.8h, v21.8h}, [x13], #32 + add v6.8h, v6.8h, v31.8h + add v7.8h, v7.8h, v31.8h + + add v24.8h, v24.8h, v18.8h + add v16.8h, v16.8h, v6.8h + + smull v18.4s, v20.4h, v1.h[3] + smlal v18.4s, v24.4h, v1.h[4] + smlal v18.4s, v16.4h, v1.h[5] + add v25.8h, v25.8h, v19.8h + smull2 v19.4s, v20.8h, v1.h[3] + smlal2 v19.4s, v24.8h, v1.h[4] + smlal2 v19.4s, v16.8h, v1.h[5] + add v17.8h, v17.8h, v7.8h + smull v20.4s, v21.4h, v1.h[3] + smlal v20.4s, v25.4h, v1.h[4] + smlal v20.4s, v17.4h, v1.h[5] + smull2 v21.4s, v21.8h, v1.h[3] + smlal2 v21.4s, v25.8h, v1.h[4] + smlal2 v21.4s, v17.8h, v1.h[5] + sqrshrun v18.4h, v18.4s, #11 + sqrshrun2 v18.8h, v19.4s, #11 + sqrshrun v19.4h, v20.4s, #11 + sqrshrun2 v19.8h, v21.4s, #11 + st1 {v6.8h, v7.8h}, [x15], #32 + sqxtun v18.8b, v18.8h + sqxtun2 v18.16b, v19.8h 
+ subs w5, w5, #16 + + st1 {v18.16b}, [x0], #16 + + b.le 0f + mov v2.16b, v4.16b + ld1 {v4.16b}, [x3], #16 + tst w8, #2 // LR_HAVE_RIGHT + uxtl v3.8h, v4.8b + uxtl2 v4.8h, v4.16b + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x5, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 + + add x3, x3, x1 + add x0, x0, x1 + ret -.purgem filter endfunc #define SUM_STRIDE (384+16) diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index 3d43a848d6..fb41cf96f1 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -33,132 +33,223 @@ const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst -// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], -// const pixel *src, ptrdiff_t stride, -// const int16_t fh[7], const intptr_t w, -// int h, enum LrEdgeFlags edges, -// const int bitdepth_max); -function wiener_filter_h_16bpc_neon, export=1 - ldr w8, [sp] // bitdepth_max - ld1 {v0.8h}, [x4] - clz w8, w8 +// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges, +// const int bitdepth_max); +function wiener_filter7_16bpc_neon, export=1 + ldr w8, [sp] + ldr w9, [sp, #8] + stp x29, x30, [sp, #-32]! 
+ stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*6 + + dup v28.8h, w9 // bitdepth_max + clz w9, w9 movi v30.4s, #1 - sub w9, w8, #38 // -(bitdepth + 6) - sub w8, w8, #25 // -round_bits_h - neg w9, w9 // bitdepth + 6 - dup v1.4s, w9 - dup v29.4s, w8 // -round_bits_h + sub w10, w9, #38 // -(bitdepth + 6) + sub w11, w9, #11 // round_bits_v + sub w9, w9, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w9 // -round_bits_h + dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 - ushl v30.4s, v30.4s, v1.4s // 1 << (bitdepth + 6) - mov w8, w5 - // Calculate mid_stride - add w10, w5, #7 - bic w10, w10, #7 - lsl w10, w10, #1 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x9 - t6 + // x10 - t5 + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_7) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x9, x14 // t6 + mov x10, x14 // t5 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + mov x13, x14 // t2 + subs w6, w6, #1 // h-- + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + +L(main_7): + add x15, x14, #384*2 // t0 = t1 + 384*2 +L(main_loop_7): + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_7) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v3_7) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter7_hv_16bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter7_hv_16bpc_neon +L(v1_7): + bl wiener_filter7_v_16bpc_neon + + mov sp, x29 + ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + ret - // Set up pointers for reading/writing alternate rows - add x12, x0, x10 - lsl w10, w10, #1 - add x13, x2, x3 - lsl x3, x3, #1 +L(no_top_7): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x9, x14 // t6 + mov x10, x14 // t5 + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v2_7) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter7_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v3_7) + add x15, x15, #384*2*4 // t0 += 384*2*4 + bl wiener_filter7_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_7) +L(v3_7): + bl wiener_filter7_v_16bpc_neon +L(v2_7): + bl 
wiener_filter7_v_16bpc_neon + b L(v1_7) +endfunc - // Subtract the aligned width from mid_stride - add w11, w5, #7 - bic w11, w11, #7 - sub x10, x10, w11, uxtw #1 - // Subtract the number of pixels read from the source stride - add w11, w11, #8 - sub x3, x3, w11, uxtw #1 +function wiener_filter7_h_16bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f // LR_HAVE_LEFT - cbnz x1, 0f + cbnz x2, 0f // left == NULL - sub x2, x2, #6 - sub x13, x13, #6 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. - add x3, x3, #6 - - -1: // Loop vertically - ld1 {v2.8h, v3.8h}, [x2], #32 - ld1 {v4.8h, v5.8h}, [x13], #32 + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x1, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.d}[1], [x1], #8 - // Move x2/x13 back to account for the last 3 pixels we loaded earlier, + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. - sub x2, x2, #6 - sub x13, x13, #6 - ld1 {v6.d}[1], [x1], #8 + sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 - ext v2.16b, v1.16b, v2.16b, #10 - ext v5.16b, v4.16b, v5.16b, #10 - ext v4.16b, v6.16b, v4.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 b 2f -0: - // !LR_HAVE_LEFT, fill v1 with the leftmost pixel - // and shift v2/v3 to have 3x the first pixel at the front. - dup v1.8h, v2.h[0] - dup v6.8h, v4.h[0] - // Move x2 back to account for the last 3 pixels we loaded before, + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. - sub x2, x2, #6 - sub x13, x13, #6 + sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 - ext v2.16b, v1.16b, v2.16b, #10 - ext v5.16b, v4.16b, v5.16b, #10 - ext v4.16b, v6.16b, v4.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 2: + ld1 {v4.8h}, [x3], #16 - tst w7, #2 // LR_HAVE_RIGHT + tst w8, #2 // LR_HAVE_RIGHT b.ne 4f - // If we'll need to pad the right edge, load that pixel to pad with - // here since we can find it pretty easily from here. - sub w9, w5, #14 - ldr h27, [x2, w9, sxtw #1] - ldr h28, [x13, w9, sxtw #1] - // Fill v27/v28 with the right padding pixel - dup v27.8h, v27.h[0] - dup v28.8h, v28.h[0] + 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 - b.ge 4f // If w >= 11, all used input pixels are valid + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid - // 1 <= w < 11, w+3 pixels valid in v2-v3. For w=9 or w=10, + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. - // Insert padding in v2/3.h[w+3] onwards; fuse the +3 (*2) into the + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
- movrel x4, right_ext_mask, -6 - sub x4, x4, w5, uxtw #1 - ld1 {v25.16b, v26.16b}, [x4] + movrel x7, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] - bit v2.16b, v27.16b, v25.16b - bit v3.16b, v27.16b, v26.16b - bit v4.16b, v28.16b, v25.16b - bit v5.16b, v28.16b, v26.16b + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance @@ -174,31 +265,32 @@ function wiener_filter_h_16bpc_neon, export=1 add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h smull v6.4s, v18.4h, v0.h[3] - smlal v6.4s, v19.4h, v0.h[4] - smlal v6.4s, v20.4h, v0.h[5] - smlal v6.4s, v21.4h, v0.h[6] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] smull2 v7.4s, v18.8h, v0.h[3] - smlal2 v7.4s, v19.8h, v0.h[4] - smlal2 v7.4s, v20.8h, v0.h[5] - smlal2 v7.4s, v21.8h, v0.h[6] - - ext v20.16b, v4.16b, v5.16b, #4 - ext v22.16b, v4.16b, v5.16b, #8 - ext v19.16b, v4.16b, v5.16b, #2 - ext v23.16b, v4.16b, v5.16b, #10 - ext v24.16b, v4.16b, v5.16b, #12 - ext v21.16b, v4.16b, v5.16b, #6 - add v22.8h, v22.8h, v20.8h - add v23.8h, v23.8h, v19.8h - add v24.8h, v24.8h, v4.8h - smull v16.4s, v21.4h, v0.h[3] - smlal v16.4s, v22.4h, v0.h[4] - smlal v16.4s, v23.4h, v0.h[5] - smlal v16.4s, v24.4h, v0.h[6] - smull2 v17.4s, v21.8h, v0.h[3] - smlal2 v17.4s, v22.8h, v0.h[4] - smlal2 v17.4s, v23.8h, v0.h[5] - smlal2 v17.4s, v24.8h, v0.h[6] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v16.4s, v18.4h, v0.h[3] + smlal v16.4s, v19.4h, v0.h[2] + smlal v16.4s, v20.4h, v0.h[1] + smlal v16.4s, v21.4h, v0.h[0] + smull2 v17.4s, v18.8h, v0.h[3] + smlal2 v17.4s, v19.8h, v0.h[2] + smlal2 v17.4s, v20.8h, v0.h[1] + smlal2 v17.4s, v21.8h, v0.h[0] mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s @@ -218,214 +310,767 @@ function wiener_filter_h_16bpc_neon, export=1 sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h - subs w5, w5, #8 + subs w5, w5, #16 - st1 {v6.8h}, [x0], #16 - st1 {v7.8h}, [x12], #16 + st1 {v6.8h, v7.8h}, [x14], #32 - b.le 9f - tst w7, #2 // LR_HAVE_RIGHT - mov v2.16b, v3.16b - mov v4.16b, v5.16b - ld1 {v3.8h}, [x2], #16 - ld1 {v5.8h}, [x13], #16 + b.le 0f + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 
-9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x10 - add x12, x12, x10 - add x2, x2, x3 - add x13, x13, x3 - mov w5, w8 - b 1b 0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 ret endfunc -// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, -// const int16_t *mid, int w, int h, -// const int16_t fv[7], enum LrEdgeFlags edges, -// ptrdiff_t mid_stride, const int bitdepth_max); -function wiener_filter_v_16bpc_neon, export=1 - ldr w8, [sp] // bitdepth_max - ld1 {v0.8h}, [x5] - dup v31.8h, w8 - clz w8, w8 - sub w8, w8, #11 // round_bits_v - dup v30.4s, w8 - mov w8, w4 - neg v30.4s, v30.4s // -round_bits_v - - // Calculate the number of rows to move back when looping vertically - mov w11, w4 - tst w6, #4 // LR_HAVE_TOP - b.eq 0f - sub x2, x2, x7, lsl #1 - add w11, w11, #2 +function wiener_filter7_v_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, afterwards. + stp x10, x11, [sp, #-64]! + stp x12, x13, [sp, #16] + stp x14, x14, [sp, #32] + stp x0, x5, [sp, #48] +1: + ld1 {v16.8h, v17.8h}, [x9], #32 + ld1 {v18.8h, v19.8h}, [x10], #32 + ld1 {v20.8h, v21.8h}, [x11], #32 + ld1 {v22.8h, v23.8h}, [x12], #32 + ld1 {v24.8h, v25.8h}, [x13], #32 + ld1 {v6.8h, v7.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[4] + smlal v2.4s, v18.4h, v0.h[5] + smlal v2.4s, v20.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[7] + smlal v2.4s, v24.4h, v0.h[6] + smlal v2.4s, v6.4h, v0.h[5] + smlal v2.4s, v6.4h, v0.h[4] + smull2 v3.4s, v16.8h, v0.h[4] + smlal2 v3.4s, v18.8h, v0.h[5] + smlal2 v3.4s, v20.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[7] + smlal2 v3.4s, v24.8h, v0.h[6] + smlal2 v3.4s, v6.8h, v0.h[5] + smlal2 v3.4s, v6.8h, v0.h[4] + smull v4.4s, v17.4h, v0.h[4] + smlal v4.4s, v19.4h, v0.h[5] + smlal v4.4s, v21.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[7] + smlal v4.4s, v25.4h, v0.h[6] + smlal v4.4s, v7.4h, v0.h[5] + smlal v4.4s, v7.4h, v0.h[4] + smull2 v5.4s, v17.8h, v0.h[4] + smlal2 v5.4s, v19.8h, v0.h[5] + smlal2 v5.4s, v21.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[7] + smlal2 v5.4s, v25.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + smlal2 v5.4s, v7.8h, v0.h[4] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + subs w5, w5, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x5, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #64 + + add x0, x0, x1 + ret +endfunc + +function wiener_filter7_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x9 gets the value + // of x10, etc, and x15==x9, afterwards. + stp x10, x11, [sp, #-80]! + stp x12, x13, [sp, #16] + stp x14, x15, [sp, #32] + stp x10, x0, [sp, #48] + stp x3, x5, [sp, #64] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #6 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f + +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. 
+ sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v4 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x3, x3, #6 + ext v3.16b, v2.16b, v3.16b, #10 + ext v2.16b, v4.16b, v2.16b, #10 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #19 + b.ge 4f // If w >= 19, all used input pixels are valid + + // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. + sub w17, w5, #22 + // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -6 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v17.16b, v2.16b, v3.16b, #4 + ext v19.16b, v2.16b, v3.16b, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v20.16b, v2.16b, v3.16b, #10 + ext v21.16b, v2.16b, v3.16b, #12 + ext v18.16b, v2.16b, v3.16b, #6 + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v2.8h + smull v6.4s, v18.4h, v0.h[3] + smlal v6.4s, v19.4h, v0.h[2] + smlal v6.4s, v20.4h, v0.h[1] + smlal v6.4s, v21.4h, v0.h[0] + smull2 v7.4s, v18.8h, v0.h[3] + smlal2 v7.4s, v19.8h, v0.h[2] + smlal2 v7.4s, v20.8h, v0.h[1] + smlal2 v7.4s, v21.8h, v0.h[0] + + ext v17.16b, v3.16b, v4.16b, #4 + ext v19.16b, v3.16b, v4.16b, #8 + ext v16.16b, v3.16b, v4.16b, #2 + ext v20.16b, v3.16b, v4.16b, #10 + ext v21.16b, v3.16b, v4.16b, #12 + ext v18.16b, v3.16b, v4.16b, #6 + + add v19.8h, v19.8h, v17.8h + add v20.8h, v20.8h, v16.8h + add v21.8h, v21.8h, v3.8h + smull v24.4s, v18.4h, v0.h[3] + smlal v24.4s, v19.4h, v0.h[2] + smlal v24.4s, v20.4h, v0.h[1] + smlal v24.4s, v21.4h, v0.h[0] + smull2 v25.4s, v18.8h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[2] + smlal2 v25.4s, v20.8h, v0.h[1] + smlal2 v25.4s, v21.8h, v0.h[0] + + ld1 {v16.8h, v17.8h}, [x9], #32 + + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x10], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x11], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x12], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + ld1 {v24.8h, v25.8h}, [x13], #32 + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + ld1 {v8.8h, v9.8h}, [x14], #32 + + smull v1.4s, v16.4h, v0.h[4] + smlal v1.4s, v18.4h, v0.h[5] + smlal v1.4s, v20.4h, v0.h[6] + smlal v1.4s, v22.4h, v0.h[7] + smlal v1.4s, v24.4h, v0.h[6] + smlal v1.4s, v8.4h, v0.h[5] + smlal v1.4s, v6.4h, v0.h[4] + smull2 v5.4s, v16.8h, v0.h[4] + smlal2 v5.4s, v18.8h, v0.h[5] + smlal2 v5.4s, v20.8h, v0.h[6] + smlal2 v5.4s, v22.8h, v0.h[7] + smlal2 v5.4s, 
v24.8h, v0.h[6] + smlal2 v5.4s, v8.8h, v0.h[5] + smlal2 v5.4s, v6.8h, v0.h[4] + smull v26.4s, v17.4h, v0.h[4] + smlal v26.4s, v19.4h, v0.h[5] + smlal v26.4s, v21.4h, v0.h[6] + smlal v26.4s, v23.4h, v0.h[7] + smlal v26.4s, v25.4h, v0.h[6] + smlal v26.4s, v9.4h, v0.h[5] + smlal v26.4s, v7.4h, v0.h[4] + smull2 v16.4s, v17.8h, v0.h[4] + smlal2 v16.4s, v19.8h, v0.h[5] + smlal2 v16.4s, v21.8h, v0.h[6] + smlal2 v16.4s, v23.8h, v0.h[7] + smlal2 v16.4s, v25.8h, v0.h[6] + smlal2 v16.4s, v9.8h, v0.h[5] + smlal2 v16.4s, v7.8h, v0.h[4] + srshl v1.4s, v1.4s, v27.4s // -round_bits_v + srshl v5.4s, v5.4s, v27.4s + srshl v26.4s, v26.4s, v27.4s + srshl v16.4s, v16.4s, v27.4s + sqxtun v18.4h, v1.4s + sqxtun2 v18.8h, v5.4s + sqxtun v19.4h, v26.4s + sqxtun2 v19.8h, v16.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v18.8h, v18.8h, v28.8h // bitdepth_max + umin v19.8h, v19.8h, v28.8h + subs w5, w5, #16 + + st1 {v18.8h, v19.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + 0: - tst w6, #8 // LR_HAVE_BOTTOM + ldp x3, x5, [sp, #64] + ldp x15, x0, [sp, #48] + ldp x13, x14, [sp, #32] + ldp x11, x12, [sp, #16] + ldp x9, x10, [sp], #80 + + add x3, x3, x1 + add x0, x0, x1 + + ret +endfunc + +// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, +// const pixel (*left)[4], +// const pixel *lpf, const ptrdiff_t lpf_stride, +// const int w, int h, +// const int16_t filter[2][8], +// const enum LrEdgeFlags edges); +function wiener_filter5_16bpc_neon, export=1 + ldr w8, [sp] + ldr w9, [sp, #8] + stp x29, x30, [sp, #-32]! + stp d8, d9, [sp, #16] + mov x29, sp + ld1 {v0.8h, v1.8h}, [x7] + tst w8, #4 // LR_HAVE_TOP + sub_sp 384*2*4 + + dup v28.8h, w9 // bitdepth_max + clz w9, w9 + movi v30.4s, #1 + sub w10, w9, #38 // -(bitdepth + 6) + sub w11, w9, #11 // round_bits_v + sub w9, w9, #25 // -round_bits_h + neg w10, w10 // bitdepth + 6 + neg w11, w11 // -round_bits_v + dup v2.4s, w10 + dup v29.4s, w9 // -round_bits_h + dup v27.4s, w11 // -round_bits_v + movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 + ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) + + zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 + + // x11 - t4 + // x12 - t3 + // x13 - t2 + // x14 - t1 + // x15 - t0 + mov x14, sp // t1 + b.eq L(no_top_5) + + mov x16, x2 // backup left + mov x2, #0 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x4 // lpf += lpf_stride + mov x11, x14 // t4 + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + add x3, x3, x4, lsl #2 + add x3, x3, x4 // lpf += lpf_stride*5 + mov x12, x14 // t3 + add x14, x14, #384*2 // t1 += 384*2 + mov x2, x16 // left + mov x16, x3 // backup lpf + mov x3, x0 // lpf = p + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + +L(main_5): + mov x15, x11 // t0 = t4 +L(main_loop_5): + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_loop_5) + tst w8, #8 // LR_HAVE_BOTTOM + b.eq L(v2_5) + + mov x3, x16 // restore lpf + mov x2, #0 // left = NULL + sub x4, x4, x1 // lpf_stride - p_stride + bl wiener_filter5_hv_16bpc_neon + add x3, x3, x4 // src += lpf_stride - p_stride + bl wiener_filter5_hv_16bpc_neon +L(end_5): + + mov sp, x29 + 
ldp d8, d9, [sp, #16] + ldp x29, x30, [sp], #32 + ret + +L(no_top_5): + add x3, x3, x4, lsl #2 + add x16, x3, x4, lsl #1 // lpf += lpf_stride*6, backup + mov x3, x0 // lpf = p + + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + mov x11, x14 // t4 + mov x12, x14 // t3 + mov x13, x14 // t2 + b.eq L(v1_5) + add x3, x3, x1 // src += p_stride + add x14, x14, #384*2 // t1 += 384*2 + bl wiener_filter5_h_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x3, x3, x1 // src += p_stride + add x15, x14, #384*2 // t0 = t1 + 384*2 + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.eq L(v2_5) + add x15, x15, #384*2*3 // t0 += 384*2*3 + bl wiener_filter5_hv_16bpc_neon + subs w6, w6, #1 // h-- + b.ne L(main_5) +L(v2_5): + bl wiener_filter5_v_16bpc_neon + add x0, x0, x1 + mov x11, x12 + mov x12, x13 + mov x13, x14 +L(v1_5): + bl wiener_filter5_v_16bpc_neon + b L(end_5) +endfunc + + +function wiener_filter5_h_16bpc_neon + stp x3, x5, [sp, #-32]! + str x14, [sp, #16] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT b.eq 1f - add w11, w11, #2 + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f -1: // Start of horizontal loop; start one vertical filter slice. - // Load rows into v16-v19 and pad properly. - tst w6, #4 // LR_HAVE_TOP - ld1 {v16.8h}, [x2], x7 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.8h}, [x2], x7 - mov v17.16b, v16.16b - ld1 {v19.8h}, [x2], x7 - b 3f -2: // !LR_HAVE_TOP - mov v17.16b, v16.16b - mov v18.16b, v16.16b - mov v19.16b, v16.16b - -3: - cmp w4, #4 - b.lt 5f - // Start filtering normally; fill in v20-v22 with unique rows. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - ld1 {v22.8h}, [x2], x7 - -4: -.macro filter compare - subs w4, w4, #1 +0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f + +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 3x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
- smull v2.4s, v16.4h, v0.h[0] - smlal v2.4s, v17.4h, v0.h[1] - smlal v2.4s, v18.4h, v0.h[2] - smlal v2.4s, v19.4h, v0.h[3] - smlal v2.4s, v20.4h, v0.h[4] - smlal v2.4s, v21.4h, v0.h[5] - smlal v2.4s, v22.4h, v0.h[6] - smull2 v3.4s, v16.8h, v0.h[0] - smlal2 v3.4s, v17.8h, v0.h[1] - smlal2 v3.4s, v18.8h, v0.h[2] - smlal2 v3.4s, v19.8h, v0.h[3] - smlal2 v3.4s, v20.8h, v0.h[4] - smlal2 v3.4s, v21.8h, v0.h[5] - smlal2 v3.4s, v22.8h, v0.h[6] - srshl v2.4s, v2.4s, v30.4s // round_bits_v - srshl v3.4s, v3.4s, v30.4s - sqxtun v2.4h, v2.4s - sqxtun2 v2.8h, v3.4s - umin v2.8h, v2.8h, v31.8h // bitdepth_max - st1 {v2.8h}, [x0], x1 -.if \compare - cmp w4, #4 -.else - b.le 9f -.endif - mov v16.16b, v17.16b - mov v17.16b, v18.16b - mov v18.16b, v19.16b - mov v19.16b, v20.16b - mov v20.16b, v21.16b - mov v21.16b, v22.16b -.endm - filter 1 - b.lt 7f - ld1 {v22.8h}, [x2], x7 - b 4b - -5: // Less than 4 rows in total; not all of v20-v21 are filled yet. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 6f - // LR_HAVE_BOTTOM - cmp w4, #2 - // We load at least 2 rows in all cases. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - b.gt 53f // 3 rows in total - b.eq 52f // 2 rows in total -51: // 1 row in total, v19 already loaded, load edge into v20-v22. - mov v22.16b, v21.16b - b 8f -52: // 2 rows in total, v19 already loaded, load v20 with content data - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - mov v23.16b, v22.16b - b 8f -53: - // 3 rows in total, v19 already loaded, load v20 and v21 with content - // and 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f - -6: - // !LR_HAVE_BOTTOM - cmp w4, #2 - b.gt 63f // 3 rows in total - b.eq 62f // 2 rows in total -61: // 1 row in total, v19 already loaded, pad that into v20-v22. - mov v20.16b, v19.16b - mov v21.16b, v19.16b - mov v22.16b, v19.16b - b 8f -62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23. - ld1 {v20.8h}, [x2], x7 - mov v21.16b, v20.16b - mov v22.16b, v20.16b - mov v23.16b, v20.16b - b 8f -63: - // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24. - ld1 {v20.8h}, [x2], x7 - ld1 {v21.8h}, [x2], x7 - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - b 8f - -7: - // All registers up to v21 are filled already, 3 valid rows left. - // < 4 valid rows left; fill in padding and filter the last - // few rows. - tst w6, #8 // LR_HAVE_BOTTOM - b.eq 71f - // LR_HAVE_BOTTOM; load 2 rows of edge. - ld1 {v22.8h}, [x2], x7 - ld1 {v23.8h}, [x2], x7 - mov v24.16b, v23.16b - b 8f -71: - // !LR_HAVE_BOTTOM, pad 3 rows - mov v22.16b, v21.16b - mov v23.16b, v21.16b - mov v24.16b, v21.16b - -8: // At this point, all registers up to v22-v24 are loaded with - // edge/padding (depending on how many rows are left). - filter 0 // This branches to 9f when done - mov v22.16b, v23.16b - mov v23.16b, v24.16b - b 8b - -9: // End of one vertical slice. 
- subs w3, w3, #8 + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v16.4s, v17.4h, v0.h[3] + smlal v16.4s, v18.4h, v0.h[2] + smlal v16.4s, v19.4h, v0.h[1] + smull2 v17.4s, v17.8h, v0.h[3] + smlal2 v17.4s, v18.8h, v0.h[2] + smlal2 v17.4s, v19.8h, v0.h[1] + + mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v16.4s, v16.4s, v29.4s + srshl v17.4s, v17.4s, v29.4s + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v16.4s + sqxtun2 v7.8h, v17.4s + umin v6.8h, v6.8h, v24.8h + umin v7.8h, v7.8h, v24.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + subs w5, w5, #16 + + st1 {v6.8h, v7.8h}, [x14], #32 + b.le 0f - // Move pointers back up to the top and loop horizontally. - msub x0, x1, x8, x0 - msub x2, x7, x11, x2 - add x0, x0, #16 - add x2, x2, #16 - mov w4, w8 - b 1b + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldr x14, [sp, #16] + ldp x3, x5, [sp], #32 + ret +endfunc + +function wiener_filter5_v_16bpc_neon + stp x11, x12, [sp, #-48]! + stp x13, x14, [sp, #16] + stp x0, x5, [sp, #32] +1: + ld1 {v16.8h, v17.8h}, [x11], #32 + ld1 {v18.8h, v19.8h}, [x12], #32 + ld1 {v20.8h, v21.8h}, [x13], #32 + ld1 {v22.8h, v23.8h}, [x14], #32 + + smull v2.4s, v16.4h, v0.h[5] + smlal v2.4s, v18.4h, v0.h[6] + smlal v2.4s, v20.4h, v0.h[7] + smlal v2.4s, v22.4h, v0.h[6] + smlal v2.4s, v22.4h, v0.h[5] + smull2 v3.4s, v16.8h, v0.h[5] + smlal2 v3.4s, v18.8h, v0.h[6] + smlal2 v3.4s, v20.8h, v0.h[7] + smlal2 v3.4s, v22.8h, v0.h[6] + smlal2 v3.4s, v22.8h, v0.h[5] + smull v4.4s, v17.4h, v0.h[5] + smlal v4.4s, v19.4h, v0.h[6] + smlal v4.4s, v21.4h, v0.h[7] + smlal v4.4s, v23.4h, v0.h[6] + smlal v4.4s, v23.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v23.8h, v0.h[5] + srshl v2.4s, v2.4s, v27.4s // -round_bits_v + srshl v3.4s, v3.4s, v27.4s + srshl v4.4s, v4.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v2.4h, v2.4s + sqxtun2 v2.8h, v3.4s + sqxtun v3.4h, v4.4s + sqxtun2 v3.8h, v5.4s + umin v2.8h, v2.8h, v28.8h // bitdepth_max + umin v3.8h, v3.8h, v28.8h + + subs w5, w5, #16 + st1 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + ldp x0, x5, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #48 + + ret +endfunc + +function wiener_filter5_hv_16bpc_neon + // Backing up/restoring registers shifted, so that x11 gets the value + // of x12, etc, and x15==x11, afterwards. + stp x12, x13, [sp, #-64]! 
+ stp x14, x15, [sp, #16] + stp x12, x0, [sp, #32] + stp x3, x5, [sp, #48] + + // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL + tst w8, #1 // LR_HAVE_LEFT + b.eq 1f + // LR_HAVE_LEFT + cbnz x2, 0f + // left == NULL + sub x3, x3, #4 + ld1 {v2.8h, v3.8h}, [x3], #32 + b 2f 0: + // LR_HAVE_LEFT, left != NULL + ld1 {v2.8h, v3.8h}, [x3], #32 + ld1 {v4.d}[1], [x2], #8 + // Move x3 back to account for the last 2 pixels we loaded earlier, + // which we'll shift out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + b 2f +1: + ld1 {v2.8h, v3.8h}, [x3], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v3 to have 2x the first pixel at the front. + dup v4.8h, v2.h[0] + // Move x3 back to account for the last 2 pixels we loaded before, + // which we shifted out. + sub x3, x3, #4 + ext v3.16b, v2.16b, v3.16b, #12 + ext v2.16b, v4.16b, v2.16b, #12 + +2: + ld1 {v4.8h}, [x3], #16 + + tst w8, #2 // LR_HAVE_RIGHT + b.ne 4f + +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w5, #18 + b.ge 4f // If w >= 18, all used input pixels are valid + + // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie + // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. + sub w17, w5, #23 + // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the + // buffer pointer. + movrel x7, right_ext_mask, -4 + ldr h26, [x3, w17, sxtw #1] + sub x7, x7, w5, uxtw #1 + dup v26.8h, v26.h[0] + ld1 {v23.16b, v24.16b, v25.16b}, [x7] + + bit v2.16b, v26.16b, v23.16b + bit v3.16b, v26.16b, v24.16b + bit v4.16b, v26.16b, v25.16b + +4: // Loop horizontally + ext v16.16b, v2.16b, v3.16b, #2 + ext v18.16b, v2.16b, v3.16b, #6 + ext v19.16b, v2.16b, v3.16b, #8 + ext v17.16b, v2.16b, v3.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v2.8h + smull v6.4s, v17.4h, v0.h[3] + smlal v6.4s, v18.4h, v0.h[2] + smlal v6.4s, v19.4h, v0.h[1] + smull2 v7.4s, v17.8h, v0.h[3] + smlal2 v7.4s, v18.8h, v0.h[2] + smlal2 v7.4s, v19.8h, v0.h[1] + + ext v16.16b, v3.16b, v4.16b, #2 + ext v18.16b, v3.16b, v4.16b, #6 + ext v19.16b, v3.16b, v4.16b, #8 + ext v17.16b, v3.16b, v4.16b, #4 + add v18.8h, v18.8h, v16.8h + add v19.8h, v19.8h, v3.8h + smull v24.4s, v17.4h, v0.h[3] + smlal v24.4s, v18.4h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[1] + smull2 v25.4s, v17.8h, v0.h[3] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal2 v25.4s, v19.8h, v0.h[1] + + ld1 {v16.8h, v17.8h}, [x11], #32 + mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 + add v6.4s, v6.4s, v30.4s + add v7.4s, v7.4s, v30.4s + add v24.4s, v24.4s, v30.4s + add v25.4s, v25.4s, v30.4s + ld1 {v18.8h, v19.8h}, [x12], #32 + srshl v6.4s, v6.4s, v29.4s + srshl v7.4s, v7.4s, v29.4s + srshl v24.4s, v24.4s, v29.4s + srshl v25.4s, v25.4s, v29.4s + ld1 {v20.8h, v21.8h}, [x13], #32 + sqxtun v6.4h, v6.4s + sqxtun2 v6.8h, v7.4s + sqxtun v7.4h, v24.4s + sqxtun2 v7.8h, v25.4s + ld1 {v22.8h, v23.8h}, [x14], #32 + umin v6.8h, v6.8h, v26.8h + umin v7.8h, v7.8h, v26.8h + sub v6.8h, v6.8h, v31.8h + sub v7.8h, v7.8h, v31.8h + + smull v8.4s, v16.4h, v0.h[5] + smlal v8.4s, v18.4h, v0.h[6] + smlal v8.4s, v20.4h, v0.h[7] + smlal v8.4s, v22.4h, v0.h[6] + smlal v8.4s, v6.4h, v0.h[5] + smull2 v9.4s, v16.8h, v0.h[5] + smlal2 v9.4s, v18.8h, v0.h[6] + smlal2 v9.4s, v20.8h, v0.h[7] + smlal2 
v9.4s, v22.8h, v0.h[6] + smlal2 v9.4s, v6.8h, v0.h[5] + smull v1.4s, v17.4h, v0.h[5] + smlal v1.4s, v19.4h, v0.h[6] + smlal v1.4s, v21.4h, v0.h[7] + smlal v1.4s, v23.4h, v0.h[6] + smlal v1.4s, v7.4h, v0.h[5] + smull2 v5.4s, v17.8h, v0.h[5] + smlal2 v5.4s, v19.8h, v0.h[6] + smlal2 v5.4s, v21.8h, v0.h[7] + smlal2 v5.4s, v23.8h, v0.h[6] + smlal2 v5.4s, v7.8h, v0.h[5] + srshl v8.4s, v8.4s, v27.4s // -round_bits_v + srshl v9.4s, v9.4s, v27.4s + srshl v1.4s, v1.4s, v27.4s + srshl v5.4s, v5.4s, v27.4s + sqxtun v8.4h, v8.4s + sqxtun2 v8.8h, v9.4s + sqxtun v9.4h, v1.4s + sqxtun2 v9.8h, v5.4s + st1 {v6.8h, v7.8h}, [x15], #32 + umin v8.8h, v8.8h, v28.8h // bitdepth_max + umin v9.8h, v9.8h, v28.8h + + subs w5, w5, #16 + + st1 {v8.8h, v9.8h}, [x0], #32 + + b.le 0f + mov v2.16b, v4.16b + tst w8, #2 // LR_HAVE_RIGHT + ld1 {v3.8h, v4.8h}, [x3], #32 + b.ne 4b // If we don't need to pad, just keep filtering. + b 3b // If we need to pad, check how many pixels we have left. + +0: + ldp x3, x5, [sp, #48] + ldp x15, x0, [sp, #32] + ldp x13, x14, [sp, #16] + ldp x11, x12, [sp], #64 + + add x3, x3, x1 + add x0, x0, x1 + ret -.purgem filter endfunc #define SUM_STRIDE (384+16) From afee165f88657251bc91eb6e620d4f3f908c58dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 9 Feb 2021 13:09:34 +0200 Subject: [PATCH 122/155] arm64: itx16: Use usqadd to avoid separate clamping of negative values Before: Cortex A53 A72 A73 inv_txfm_add_4x4_dct_dct_0_10bpc_neon: 40.7 23.0 24.0 inv_txfm_add_4x4_dct_dct_1_10bpc_neon: 116.0 71.5 78.2 inv_txfm_add_8x8_dct_dct_0_10bpc_neon: 85.7 50.7 53.8 inv_txfm_add_8x8_dct_dct_1_10bpc_neon: 287.0 203.5 215.2 inv_txfm_add_16x16_dct_dct_0_10bpc_neon: 255.7 129.1 140.4 inv_txfm_add_16x16_dct_dct_1_10bpc_neon: 1401.4 1026.7 1039.2 inv_txfm_add_16x16_dct_dct_2_10bpc_neon: 1913.2 1407.3 1479.6 After: inv_txfm_add_4x4_dct_dct_0_10bpc_neon: 38.7 21.5 22.2 inv_txfm_add_4x4_dct_dct_1_10bpc_neon: 116.0 71.3 77.2 inv_txfm_add_8x8_dct_dct_0_10bpc_neon: 76.7 44.7 43.5 inv_txfm_add_8x8_dct_dct_1_10bpc_neon: 278.0 203.0 203.9 inv_txfm_add_16x16_dct_dct_0_10bpc_neon: 236.9 106.2 116.2 inv_txfm_add_16x16_dct_dct_1_10bpc_neon: 1368.7 999.7 1008.4 inv_txfm_add_16x16_dct_dct_2_10bpc_neon: 1880.5 1381.2 1459.4 --- src/arm/64/itx16.S | 263 ++++++++++++++++++--------------------------- 1 file changed, 104 insertions(+), 159 deletions(-) diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index 2cdb583f66..46851567d2 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -124,7 +124,7 @@ endconst .endif .endm -.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 .endif @@ -132,10 +132,7 @@ endconst srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc - sqadd \adddst, \adddst, \addsrc -.endif -.ifnb \max - smax \max, \max, v6.8h + usqadd \adddst, \addsrc .endif .ifnb \min smin \min, \min, v7.8h @@ -146,63 +143,57 @@ endconst .endm .macro load_add_store_8x16 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src - load_add_store v3.8h, v17.8h, , , , , , \dst, \src - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src - load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src - load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src - 
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src - load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src - load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src - load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src - load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src - load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src - load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src - load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src - load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src - load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src - load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src - load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src - load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src - load_add_store , , , , , v31.8h, v30.8h, \dst, \src - load_add_store , , , , , , v31.8h, \dst, \src + load_add_store v2.8h, v16.8h, , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , \dst, \src + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src + load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src + load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src + load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store , , , , v27.8h, v26.8h, \dst, \src + load_add_store , , , , , v27.8h, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits - load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits - load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits - load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits - load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits - load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits - load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits - load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits - load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits - load_add_store , , , , , v23.8h, v22.8h, \dst, \src, 
\shiftbits - load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits + load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits - load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits - load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits - load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits - load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits - load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits - load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits - load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits - load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits + load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits + load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits + load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits + load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits + load_add_store , , , , , v5.8h, \dst, \src, \shiftbits .endm -.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src .ifnb \load ld1 {\load}[0], [\src], x1 .endif @@ -216,14 +207,11 @@ endconst ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc - sqadd \adddst, \adddst, \addsrc + usqadd \adddst, \addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif -.ifnb \max - smax \max, \max, v6.8h -.endif .ifnb \min smin \min, \min, v7.8h .endif @@ -233,37 +221,33 @@ endconst .endm .macro load_add_store_4x16 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src - load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src - load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src - load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src - load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src - load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src - load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src - load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src - load_add_store4 , 
, , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src - load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src - load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src - load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src - load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src - load_add_store4 , , , , , , , , v30.d, \dst, \src + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src + load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src + load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src + load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src + load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src + load_add_store4 , , , , , , , v23.d, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff - load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src - load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src - load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src - load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src - load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src - load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src - load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src - load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src - load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src - load_add_store4 , , , , , , , , v22.d, \dst, \src + load_add_store4 v0.d, v17, v16, , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src + load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src + load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src + load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src + load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src + load_add_store4 , , , , , , , v3.d, \dst, \src .endm .macro idct_dc w, h, shift @@ -291,7 +275,6 @@ endconst .endm function idct_dc_w4_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.d}[0], [x0], x1 @@ -299,11 +282,9 @@ function idct_dc_w4_neon ld1 {v1.d}[0], [x0], x1 subs w4, w4, #4 ld1 {v1.d}[1], [x0], x1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h sub x0, x0, x1, lsl #2 - sqadd v1.8h, v1.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h + usqadd v1.8h, v16.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h @@ -315,23 +296,18 @@ function idct_dc_w4_neon endfunc function idct_dc_w8_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h}, [x0], x1 subs w4, w4, #4 ld1 {v1.8h}, [x0], x1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h ld1 {v2.8h}, [x0], x1 - sqadd v1.8h, v1.8h, v16.8h + usqadd v1.8h, v16.8h ld1 {v3.8h}, [x0], x1 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h + usqadd v2.8h, v16.8h + 
usqadd v3.8h, v16.8h sub x0, x0, x1, lsl #2 - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h st1 {v0.8h}, [x0], x1 @@ -345,21 +321,16 @@ function idct_dc_w8_neon endfunc function idct_dc_w16_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h}, [x0], x1 subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x0], x1 - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v16.8h + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h sub x0, x0, x1, lsl #1 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -371,19 +342,14 @@ function idct_dc_w16_neon endfunc function idct_dc_w32_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w4, w4, #1 - sqadd v0.8h, v0.8h, v16.8h - sqadd v1.8h, v1.8h, v16.8h - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h + usqadd v0.8h, v16.8h + usqadd v1.8h, v16.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -394,30 +360,21 @@ function idct_dc_w32_neon endfunc function idct_dc_w64_neon - movi v30.8h, #0 mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x1, x1, #64 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 subs w4, w4, #1 - sqadd v0.8h, v0.8h, v16.8h + usqadd v0.8h, v16.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] - sqadd v1.8h, v1.8h, v16.8h + usqadd v1.8h, v16.8h sub x0, x0, #64 - sqadd v2.8h, v2.8h, v16.8h - sqadd v3.8h, v3.8h, v16.8h - sqadd v4.8h, v4.8h, v16.8h - sqadd v5.8h, v5.8h, v16.8h - sqadd v6.8h, v6.8h, v16.8h - sqadd v7.8h, v7.8h, v16.8h - smax v0.8h, v0.8h, v30.8h - smax v1.8h, v1.8h, v30.8h - smax v2.8h, v2.8h, v30.8h - smax v3.8h, v3.8h, v30.8h - smax v4.8h, v4.8h, v30.8h - smax v5.8h, v5.8h, v30.8h - smax v6.8h, v6.8h, v30.8h - smax v7.8h, v7.8h, v30.8h + usqadd v2.8h, v16.8h + usqadd v3.8h, v16.8h + usqadd v4.8h, v16.8h + usqadd v5.8h, v16.8h + usqadd v6.8h, v16.8h + usqadd v7.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h @@ -575,16 +532,14 @@ function inv_txfm_add_4x4_neon L(itx_4x4_end): mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x0, x0, x1, lsl #2 - sqadd v16.8h, v16.8h, v0.8h - sqadd v18.8h, v18.8h, v1.8h - smax v16.8h, v16.8h, v30.8h - smax v18.8h, v18.8h, v30.8h - smin v16.8h, v16.8h, v31.8h - st1 {v16.d}[0], [x0], x1 - smin v18.8h, v18.8h, v31.8h - st1 {v16.d}[1], [x0], x1 - st1 {v18.d}[0], [x0], x1 - st1 {v18.d}[1], [x0], x1 + usqadd v0.8h, v16.8h + usqadd v1.8h, v18.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 br x15 endfunc @@ -2219,7 +2174,6 @@ function inv_txfm_add_vert_dct_8x32_neon neg x9, x8 mov x10, x6 - movi v0.8h, #0 mvni v1.8h, #0xfc, lsl #8 // 0x3ff .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride @@ -2231,27 +2185,23 @@ function inv_txfm_add_vert_dct_8x32_neon ld1 {v4.8h}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 - sqadd v5.8h, v5.8h, v2.8h + usqadd v2.8h, v5.8h srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 - smax v2.8h, v5.8h, v0.8h ld1 {v5.8h}, 
[x7], \stride - sqadd v6.8h, v6.8h, v3.8h + usqadd v3.8h, v6.8h smin v2.8h, v2.8h, v1.8h srshr v7.8h, v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8h}, [x6], x1 ld1 {v2.8h}, [x10], x1 - smax v3.8h, v6.8h, v0.8h - sqadd v7.8h, v7.8h, v4.8h + usqadd v4.8h, v7.8h smin v3.8h, v3.8h, v1.8h srshr v5.8h, v5.8h, #4 st1 {v3.8h}, [x6], x1 - smax v4.8h, v7.8h, v0.8h - sqadd v5.8h, v5.8h, v2.8h + usqadd v2.8h, v5.8h smin v4.8h, v4.8h, v1.8h st1 {v4.8h}, [x6], x1 - smax v2.8h, v5.8h, v0.8h smin v2.8h, v2.8h, v1.8h st1 {v2.8h}, [x6], x1 .endm @@ -3195,7 +3145,6 @@ function inv_txfm_add_vert_dct_8x64_neon ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 - movi v6.8h, #0 mvni v7.8h, #0xfc, lsl #8 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8h}, [x6], x1 @@ -3211,18 +3160,14 @@ function inv_txfm_add_vert_dct_8x64_neon srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr \src0, \src0, #4 - sqadd v0.8h, v0.8h, v4.8h + usqadd v0.8h, v4.8h srshr \src2, \src2, #4 - sqadd v1.8h, v1.8h, \src0 - sqadd v2.8h, v2.8h, v5.8h - smax v0.8h, v0.8h, v6.8h - sqadd v3.8h, v3.8h, \src2 - smax v1.8h, v1.8h, v6.8h + usqadd v1.8h, \src0 + usqadd v2.8h, v5.8h smin v0.8h, v0.8h, v7.8h - smax v2.8h, v2.8h, v6.8h + usqadd v3.8h, \src2 smin v1.8h, v1.8h, v7.8h st1 {v0.8h}, [x6], x1 - smax v3.8h, v3.8h, v6.8h smin v2.8h, v2.8h, v7.8h st1 {v1.8h}, [x9], x10 smin v3.8h, v3.8h, v7.8h From 4b60133f4dfe605dbfa36fbfd9c522b6a3285df9 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Wed, 10 Feb 2021 01:51:41 +0100 Subject: [PATCH 123/155] x86: Rewrite SGR AVX2 asm The previous implementation did multiple passes in the horizontal and vertical directions, with the intermediate values being stored in buffers on the stack. This caused bad cache thrashing. By interleaving the all the different passes in combination with a ring buffer for storing only a few rows at a time the performance is improved by a significant amount. Also slightly speed up neighbor calculations by packing the a and b values into a single 32-bit unsigned integer which allows calculations on both values simultaneously. 
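
As a rough sketch of that packing idea (illustration only; the helper names
and the exact field layout here are hypothetical and not taken from the
assembly below), two 16-bit running sums can share one 32-bit word so that a
single addition advances both, as long as neither field's total overflows
into the other:

    #include <stdint.h>

    /* Hypothetical helpers for illustration; the real code works on SIMD
     * registers and uses its own layout. */
    static inline uint32_t pack_ab(uint16_t a, uint16_t b)
    {
        return ((uint32_t)a << 16) | b; /* a in the high half, b in the low half */
    }

    /* One 32-bit add sums both fields at once. Valid while each field's
     * running total stays below 1 << 16, so no carry spills from b into a. */
    static inline uint32_t sum3(uint32_t l, uint32_t c, uint32_t r)
    {
        return l + c + r;
    }

    static inline uint16_t unpack_a(uint32_t ab) { return (uint16_t)(ab >> 16); }
    static inline uint16_t unpack_b(uint32_t ab) { return (uint16_t)(ab & 0xffff); }

The vectorized code applies the same trick inside SIMD lanes, so each vector
addition advances the a and b neighbor sums together instead of in two
separate passes.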
--- src/x86/looprestoration.asm | 2407 +++++++++++++++++++++++------------ 1 file changed, 1568 insertions(+), 839 deletions(-) diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm index 8ebe230db9..44aaaf49c7 100644 --- a/src/x86/looprestoration.asm +++ b/src/x86/looprestoration.asm @@ -30,15 +30,37 @@ SECTION_RODATA 32 -wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 -wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 -wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -pb_right_ext_mask: times 32 db 0xff - times 32 db 0 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +sgr_r_ext: times 16 db 1 + times 16 db 9 + +; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of +; cache but eliminates some shifts in the inner sgr loop which is overall a win +sgr_x_by_x: dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 + dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 + dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 + dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + +sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 + db 9, -1, 10, -1, 11, -1, 12, -1 pb_3: times 4 db 3 pb_m5: times 4 db -5 @@ -47,11 +69,11 @@ pw_256: times 2 dw 256 pw_2056: times 2 dw 2056 pw_m16380: times 2 dw -16380 pw_5_6: dw 5, 6 -pd_1024: dd 1024 -pd_0xf0080029: dd 0xf0080029 +pd_25: dd 25 +pd_34816: dd 34816 +pd_m4096: dd -4096 pd_0xf00801c7: dd 0xf00801c7 - -cextern sgr_x_by_x +pd_0xf00800a4: dd 0xf00800a4 SECTION .text @@ -63,7 +85,7 @@ SECTION .text %endrep %endmacro -DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; wiener ring buffer pointers +DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers INIT_YMM avx2 cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ @@ -79,7 +101,7 @@ cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ vbroadcasti128 m8, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 - vbroadcasti128 m9, [wiener_shufD] + vbroadcasti128 m9, [sgr_shuf+6] add lpfq, wq vpbroadcastd m10, [pw_m16380] lea t1, [rsp+wq*2+16] @@ -425,7 +447,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ vbroadcasti128 m7, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 - vbroadcasti128 m8, [wiener_shufD] + vbroadcasti128 m8, [sgr_shuf+6] add lpfq, wq 
vpbroadcastd m9, [pw_m16380] vpbroadcastd m10, [pw_2056] @@ -705,835 +727,1542 @@ ALIGN function_align jl .v_loop ret -cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov xlimd, edgem - movifnidn wd, wm - mov hd, hm - mov edged, xlimd - and xlimd, 2 ; have_right - jz .no_right - add wd, 2+15 - and wd, ~15 -.no_right: - lea r10, [pb_right_ext_mask+32] - xor xlimd, 2 ; 2*!have_right - pxor m1, m1 - add srcq, wq - lea sumq, [sumq+wq*2-2] - lea sumsqq, [sumsqq+wq*4-4] - neg wq -.loop_y: - mov xq, wq - - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - vpbroadcastw xm0, [leftq+2] - add leftq, 4 - jmp .expand_x -.no_left: - vpbroadcastb xm0, [srcq+xq] - jmp .expand_x -.load_left_from_main: - vpbroadcastw xm0, [srcq+xq-2] -.expand_x: - punpckhbw xm0, xm1 - - ; when we reach this, xm0 contains left two px in highest words - cmp xd, -16 - jle .loop_x -.partial_load_and_extend: - vpbroadcastb m3, [srcq-1] - pmovzxbw m2, [srcq+xq] - movu m4, [r10+xq*2] - punpcklbw m3, m1 - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: - psrldq xm2, xm0, 14 - vpbroadcastw m2, xm2 - jmp .loop_x_noload - -.loop_x: - pmovzxbw m2, [srcq+xq] -.loop_x_noload: - vinserti128 m0, xm2, 1 - palignr m3, m2, m0, 12 - palignr m4, m2, m0, 14 - - punpcklwd m5, m3, m2 - punpckhwd m6, m3, m2 - paddw m3, m4 - punpcklwd m0, m4, m1 - punpckhwd m4, m1 - pmaddwd m5, m5 - pmaddwd m6, m6 - pmaddwd m0, m0 - pmaddwd m4, m4 - paddw m3, m2 - paddd m5, m0 - vextracti128 xm0, m2, 1 - paddd m6, m4 - movu [sumq+xq*2], m3 - movu [sumsqq+xq*4+ 0], xm5 - movu [sumsqq+xq*4+16], xm6 - vextracti128 [sumsqq+xq*4+32], m5, 1 - vextracti128 [sumsqq+xq*4+48], m6, 1 - add xq, 16 - - ; if x <= -16 we can reload more pixels - ; else if x < 0 we reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -16 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - add srcq, strideq - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 - and ylimd, 2 ; have_bottom - sub ylimd, 2 ; -2 if have_bottom=0, else 0 -.loop_x: - lea yd, [hq+ylimq+2] - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+32] - movu m6, [sum_ptrq+(384+16)*2*1] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - mova m7, m6 - mova m8, m6 - jmp .loop_y_noload -.load_top: - movu m0, [sumsq_ptrq-(384+16)*4*1] ; l2sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l2sq [right] - movu m2, [sumsq_ptrq-(384+16)*4*0] ; l1sq [left] - movu m3, [sumsq_ptrq-(384+16)*4*0+32] ; l1sq [right] - movu m6, [sum_ptrq-(384+16)*2*1] ; l2 - movu m7, [sum_ptrq-(384+16)*2*0] ; l1 -.loop_y: - movu m4, [sumsq_ptrq+(384+16)*4*1] ; l0sq [left] - movu m5, [sumsq_ptrq+(384+16)*4*1+32] ; l0sq [right] - movu m8, [sum_ptrq+(384+16)*2*1] ; l0 -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m6, m7 - paddd m0, m4 - paddd m1, m5 - paddw m6, m8 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+32], m1 - movu [sum_ptrq], m6 - - ; shift position down by one - mova m0, m2 - mova m1, m3 - mova m2, m4 - mova m3, m5 - mova m6, m7 - mova m7, m8 - add 
sumsq_ptrq, (384+16)*4 - add sum_ptrq, (384+16)*2 - dec yd - jg .loop_y - cmp yd, ylimd - jg .loop_y_noload - add xd, 16 - cmp xd, wd - jl .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 - lea r5, [sgr_x_by_x-0xf03] -%ifidn sd, sm - movd xm6, sd - vpbroadcastd m6, xm6 -%else - vpbroadcastd m6, sm -%endif - vpbroadcastd m8, [pd_0xf00801c7] - vpbroadcastd m9, [pw_256] - pcmpeqb m7, m7 - psrld m10, m9, 13 ; pd_2048 - DEFINE_ARGS a, b, w, h, x - -.loop_y: - mov xq, -2 -.loop_x: - pmovzxwd m0, [bq+xq*2] - pmovzxwd m1, [bq+xq*2+(384+16)*2] - movu m2, [aq+xq*4] - movu m3, [aq+xq*4+(384+16)*4] - pslld m4, m2, 3 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - pmaddwd m0, m8 - pmaddwd m1, m8 - psubd m2, m4 ; p = aa * 9 - bb * bb - psubd m3, m5 - pmulld m2, m6 - pmulld m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - mova m5, m7 - vpgatherdd m4, [r5+m2], m5 ; xx - mova m5, m7 - vpgatherdd m2, [r5+m3], m5 - psrld m4, 24 - psrld m2, 24 - pmulld m0, m4 - pmulld m1, m2 - packssdw m4, m2 - psubw m4, m9, m4 - vpermq m4, m4, q3120 - paddd m0, m10 - paddd m1, m10 - psrld m0, 12 - psrld m1, 12 - movu [bq+xq*2], xm4 - vextracti128 [bq+xq*2+(384+16)*2], m4, 1 - movu [aq+xq*4], m0 - movu [aq+xq*4+(384+16)*4], m1 - add xd, 8 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m15, [pw_16] - xor xd, xd -.loop_x: - lea tmp_ptrq, [tq+xq*2] - lea src_ptrq, [srcq+xq*1] - lea a_ptrq, [aq+xq*4+(384+16)*4] - lea b_ptrq, [bq+xq*2+(384+16)*2] - movu m0, [aq+xq*4-(384+16)*4-4] - movu m2, [aq+xq*4-(384+16)*4+4] - mova m1, [aq+xq*4-(384+16)*4] ; a:top [first half] - paddd m0, m2 ; a:tl+tr [first half] - movu m2, [aq+xq*4-(384+16)*4-4+32] - movu m4, [aq+xq*4-(384+16)*4+4+32] - mova m3, [aq+xq*4-(384+16)*4+32] ; a:top [second half] - paddd m2, m4 ; a:tl+tr [second half] - movu m4, [aq+xq*4-4] - movu m5, [aq+xq*4+4] - paddd m1, [aq+xq*4] ; a:top+ctr [first half] - paddd m4, m5 ; a:l+r [first half] - movu m5, [aq+xq*4+32-4] - movu m6, [aq+xq*4+32+4] - paddd m3, [aq+xq*4+32] ; a:top+ctr [second half] - paddd m5, m6 ; a:l+r [second half] - - movu m6, [bq+xq*2-(384+16)*2-2] - movu m8, [bq+xq*2-(384+16)*2+2] - mova m7, [bq+xq*2-(384+16)*2] ; b:top - paddw m6, m8 ; b:tl+tr - movu m8, [bq+xq*2-2] - movu m9, [bq+xq*2+2] - paddw m7, [bq+xq*2] ; b:top+ctr - paddw m8, m9 ; b:l+r - mov yd, hd -.loop_y: - movu m9, [b_ptrq-2] - movu m10, [b_ptrq+2] - paddw m7, [b_ptrq] ; b:top+ctr+bottom - paddw m9, m10 ; b:bl+br - paddw m10, m7, m8 ; b:top+ctr+bottom+l+r - paddw m6, m9 ; b:tl+tr+bl+br - psubw m7, [b_ptrq-(384+16)*2*2] ; b:ctr+bottom - paddw m10, m6 - psllw m10, 2 - psubw m10, m6 ; aa - pmovzxbw m12, [src_ptrq] - punpcklwd m6, m10, m15 - punpckhwd m10, m15 - punpcklwd m13, m12, m15 - punpckhwd m12, m15 - pmaddwd m6, m13 ; aa*src[x]+256 [first half] - pmaddwd m10, m12 ; aa*src[x]+256 [second half] - - movu m11, [a_ptrq-4] - movu m12, [a_ptrq+4] - paddd m1, [a_ptrq] ; a:top+ctr+bottom [first half] - paddd m11, m12 ; a:bl+br [first half] - movu m12, [a_ptrq+32-4] - movu m13, [a_ptrq+32+4] - paddd m3, [a_ptrq+32] ; a:top+ctr+bottom [second half] - paddd m12, m13 ; a:bl+br [second half] - paddd m13, m1, m4 ; a:top+ctr+bottom+l+r [first half] - paddd m14, m3, m5 
; a:top+ctr+bottom+l+r [second half] - paddd m0, m11 ; a:tl+tr+bl+br [first half] - paddd m2, m12 ; a:tl+tr+bl+br [second half] - paddd m13, m0 - paddd m14, m2 - pslld m13, 2 - pslld m14, 2 - psubd m13, m0 ; bb [first half] - psubd m14, m2 ; bb [second half] - vperm2i128 m0, m13, m14, 0x31 - vinserti128 m13, xm14, 1 - psubd m1, [a_ptrq-(384+16)*4*2] ; a:ctr+bottom [first half] - psubd m3, [a_ptrq-(384+16)*4*2+32] ; a:ctr+bottom [second half] - - paddd m6, m13 - paddd m10, m0 - psrad m6, 9 - psrad m10, 9 - packssdw m6, m10 - mova [tmp_ptrq], m6 - - ; shift to next row - mova m0, m4 - mova m2, m5 - mova m4, m11 - mova m5, m12 - mova m6, m8 - mova m8, m9 - - add a_ptrq, (384+16)*4 - add b_ptrq, (384+16)*2 - add tmp_ptrq, 384*2 - add src_ptrq, strideq - dec yd - jg .loop_y - add xd, 16 - cmp xd, wd - jl .loop_x - RET - -INIT_YMM avx2 -cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt -%ifidn wtd, wtm - shl wtd, 4 - movd xm5, wtd - vpbroadcastw m5, xm5 -%else - vpbroadcastw m5, wtm - mov hd, hm - psllw m5, 4 -%endif - DEFINE_ARGS dst, stride, t, w, h, idx -.loop_y: - xor idxd, idxd -.loop_x: - mova m0, [tq+idxq*2+ 0] - mova m1, [tq+idxq*2+32] - pmovzxbw m2, [dstq+idxq+ 0] - pmovzxbw m3, [dstq+idxq+16] - psllw m4, m2, 4 - psubw m0, m4 - psllw m4, m3, 4 - psubw m1, m4 - pmulhrsw m0, m5 - pmulhrsw m1, m5 - paddw m0, m2 - paddw m1, m3 - packuswb m0, m1 - vpermq m0, m0, q3120 - mova [dstq+idxq], m0 - add idxd, 32 - cmp idxd, wd - jl .loop_x - add tq, 384*2 - add dstq, strideq - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov edged, edgem - movifnidn wd, wm - mov hd, hm - test edgeb, 2 ; have_right - jz .no_right - xor xlimd, xlimd - add wd, 2+15 - and wd, ~15 - jmp .right_done -.no_right: - mov xlimd, 3 - sub wd, 1 -.right_done: - lea r10, [pb_right_ext_mask+32] - pxor m1, m1 - lea srcq, [srcq+wq+1] - lea sumq, [sumq+wq*2-2] - lea sumsqq, [sumsqq+wq*4-4] - neg wq -.loop_y: - mov xq, wq - - ; load left - test edgeb, 1 ; have_left - jz .no_left - test leftq, leftq - jz .load_left_from_main - vpbroadcastd xm2, [leftq] - movd xm0, [srcq+xq-1] - add leftq, 4 - palignr xm0, xm2, 1 - jmp .expand_x -.no_left: - vpbroadcastb xm0, [srcq+xq-1] - jmp .expand_x -.load_left_from_main: - vpbroadcastd xm0, [srcq+xq-4] -.expand_x: - punpckhbw xm0, xm1 - - ; when we reach this, xm0 contains left two px in highest words - cmp xd, -16 - jle .loop_x - test xd, xd - jge .right_extend -.partial_load_and_extend: - vpbroadcastb m3, [srcq-1] - pmovzxbw m2, [srcq+xq] - movu m4, [r10+xq*2] - punpcklbw m3, m1 - pand m2, m4 - pandn m4, m3 - por m2, m4 - jmp .loop_x_noload -.right_extend: - psrldq xm2, xm0, 14 - vpbroadcastw m2, xm2 - jmp .loop_x_noload - -.loop_x: - pmovzxbw m2, [srcq+xq] -.loop_x_noload: - vinserti128 m0, xm2, 1 - palignr m3, m2, m0, 8 - palignr m4, m2, m0, 10 - palignr m5, m2, m0, 12 - palignr m6, m2, m0, 14 - - paddw m0, m3, m2 - punpcklwd m7, m3, m2 - punpckhwd m3, m2 - paddw m0, m4 - punpcklwd m8, m4, m5 - punpckhwd m4, m5 - paddw m0, m5 - punpcklwd m9, m6, m1 - punpckhwd m5, m6, m1 - paddw m0, m6 - pmaddwd m7, m7 - pmaddwd m3, m3 - pmaddwd m8, m8 - pmaddwd m4, m4 - pmaddwd m9, m9 - pmaddwd m5, m5 - paddd m7, m8 - paddd m3, m4 - paddd m7, m9 - paddd m3, m5 - movu [sumq+xq*2], m0 - movu [sumsqq+xq*4+ 0], xm7 - movu [sumsqq+xq*4+16], xm3 - vextracti128 [sumsqq+xq*4+32], m7, 1 - vextracti128 [sumsqq+xq*4+48], m3, 1 - - vextracti128 xm0, m2, 1 - add xq, 16 - - ; if x <= -16 we can reload more pixels - ; else if x < 0 we 
reload and extend (this implies have_right=0) - ; else if x < xlimd we extend from previous load (this implies have_right=0) - ; else we are done - - cmp xd, -16 - jle .loop_x - test xd, xd - jl .partial_load_and_extend - cmp xd, xlimd - jl .right_extend - - add srcq, strideq - add sumsqq, (384+16)*4 - add sumq, (384+16)*2 - dec hd - jg .loop_y - RET - -INIT_YMM avx2 -cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim - movifnidn edged, edgem - mov xq, -2 - rorx ylimd, edged, 2 - and ylimd, 2 ; have_bottom - sub ylimd, 3 ; -3 if have_bottom=0, else -1 -.loop_x: - lea yd, [hq+ylimq+2] - lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edgeb, 4 ; have_top - jnz .load_top - movu m0, [sumsq_ptrq+(384+16)*4*1] - movu m1, [sumsq_ptrq+(384+16)*4*1+32] - movu m10, [sum_ptrq+(384+16)*2*1] - mova m2, m0 - mova m3, m1 - mova m4, m0 - mova m5, m1 - mova m6, m0 - mova m7, m1 - mova m11, m10 - mova m12, m10 - mova m13, m10 - jmp .loop_y_second_load -.load_top: - movu m0, [sumsq_ptrq-(384+16)*4*1] ; l3/4sq [left] - movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right] - movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] - movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right] - movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 - movu m12, [sum_ptrq-(384+16)*2*0] ; l2 - mova m2, m0 - mova m3, m1 - mova m11, m10 -.loop_y: - movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] - movu m7, [sumsq_ptrq+(384+16)*4*1+32] ; l1sq [right] - movu m13, [sum_ptrq+(384+16)*2*1] ; l1 -.loop_y_second_load: - test yd, yd - jle .emulate_second_load - movu m8, [sumsq_ptrq+(384+16)*4*2] ; l0sq [left] - movu m9, [sumsq_ptrq+(384+16)*4*2+32] ; l0sq [right] - movu m14, [sum_ptrq+(384+16)*2*2] ; l0 -.loop_y_noload: - paddd m0, m2 - paddd m1, m3 - paddw m10, m11 - paddd m0, m4 - paddd m1, m5 - paddw m10, m12 - paddd m0, m6 - paddd m1, m7 - paddw m10, m13 - paddd m0, m8 - paddd m1, m9 - paddw m10, m14 - movu [sumsq_ptrq+ 0], m0 - movu [sumsq_ptrq+32], m1 - movu [sum_ptrq], m10 - - ; shift position down by one - mova m0, m4 - mova m1, m5 - mova m2, m6 - mova m3, m7 - mova m4, m8 - mova m5, m9 - mova m10, m12 - mova m11, m13 - mova m12, m14 - add sumsq_ptrq, (384+16)*4*2 - add sum_ptrq, (384+16)*2*2 - sub yd, 2 - jge .loop_y - ; l1 = l0 - mova m6, m8 - mova m7, m9 - mova m13, m14 - cmp yd, ylimd - jg .loop_y_noload - add xd, 16 - cmp xd, wd - jl .loop_x - RET -.emulate_second_load: - mova m8, m6 - mova m9, m7 - mova m14, m13 - jmp .loop_y_noload - -INIT_YMM avx2 -cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s - sub aq, (384+16-1)*4 - sub bq, (384+16-1)*2 - add hd, 2 - lea r5, [sgr_x_by_x-0xf03] -%ifidn sd, sm - movd xm6, sd - vpbroadcastd m6, xm6 -%else - vpbroadcastd m6, sm -%endif - vpbroadcastd m8, [pd_0xf0080029] - vpbroadcastd m9, [pw_256] - pcmpeqb m7, m7 - psrld m10, m9, 15 ; pd_512 - DEFINE_ARGS a, b, w, h, x -.loop_y: - mov xq, -2 -.loop_x: - pmovzxwd m0, [bq+xq*2+ 0] - pmovzxwd m1, [bq+xq*2+16] - movu m2, [aq+xq*4+ 0] - movu m3, [aq+xq*4+32] - pslld m4, m2, 3 ; aa * 8 - pslld m5, m3, 3 - paddd m2, m4 ; aa * 9 - paddd m3, m5 - paddd m4, m4 ; aa * 16 - paddd m5, m5 - paddd m2, m4 ; aa * 25 - paddd m3, m5 - pmaddwd m4, m0, m0 - pmaddwd m5, m1, m1 - psubd m2, m4 ; p = aa * 25 - bb * bb - psubd m3, m5 - pmulld m2, m6 - pmulld m3, m6 - paddusw m2, m8 - paddusw m3, m8 - psrld m2, 20 ; z - psrld m3, 20 - mova m5, m7 - vpgatherdd m4, [r5+m2], m5 ; xx - mova m5, m7 - vpgatherdd m2, [r5+m3], m5 - psrld m4, 24 - psrld m2, 24 - packssdw m3, m4, m2 - pmullw m4, m8 - pmullw m2, 
m8 - psubw m3, m9, m3 - vpermq m3, m3, q3120 - pmaddwd m0, m4 - pmaddwd m1, m2 - paddd m0, m10 - paddd m1, m10 - psrld m0, 10 - psrld m1, 10 - movu [bq+xq*2], m3 - movu [aq+xq*4+ 0], m0 - movu [aq+xq*4+32], m1 - add xd, 16 - cmp xd, wd - jl .loop_x - add aq, (384+16)*4*2 - add bq, (384+16)*2*2 - sub hd, 2 - jg .loop_y +cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r12-sgr_x_by_x-256*4 + lea r12, [sgr_x_by_x+256*4] + mov paramsq, paramsmp + mov wd, wm + mov edged, r8m + mov hd, r6m + vbroadcasti128 m8, [base+sgr_shuf+0] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+8] + lea t1, [rsp+wq*2+20] + vbroadcasti128 m10, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m11, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [paramsq+0] ; s0 + neg wq + vpbroadcastd m13, [base+pd_0xf00800a4] + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m15, [base+pd_m4096] + lea r10, [lpfq+lpf_strideq*4] + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add r10, lpf_strideq + mov [rsp+8*0], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, dst_strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + test hd, hd + jz .odd_height + call .h + add lpfq, dst_strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .h_top + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea r10, [r10+lpf_strideq*2] + mov [rsp+8*0], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd xm2, r10d + mova m0, [sgr_r_ext] + vpbroadcastb m2, xm2 + psubb m0, m2 + pminub m0, [pb_0to31] + pshufb m5, m0 + ret +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m3, m5, m8 + pmullw m4, m3, m3 + pshufb m2, m5, m9 + paddw m0, m3, m2 + shufps m3, m2, q2121 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + punpcklwd m3, m4, m6 + paddd m1, m3 + punpckhwd m4, m6 + paddd m2, m4 + pshufb m4, m5, m10 + paddw m0, m4 + pshufb m5, m11 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 
+ pmaddwd m4, m4 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+400*0] + paddd m1, [t1+r10*2+400*2] + paddd m2, [t1+r10*2+400*4] +.h_loop_end: + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*0], m0 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10*2+400*0] + mova m1, [t1+r10*2+400*2] + mova m2, [t1+r10*2+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+400*0], m0 + mova [t2+r10*2+400*2], m1 + mova [t2+r10*2+400*4], m2 + add r10, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m1, m5, m8 + pmullw m4, m1, m1 + pshufb m3, m5, m9 + paddw m0, m1, m3 + shufps m1, m3, q2121 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + punpcklwd m1, m4, m6 + paddd m2, m1 + punpckhwd m4, m6 + paddd m3, m4 + pshufb m1, m5, m10 + paddw m0, m1 + pshufb m5, m11 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t1+r10*2+400*0] + paddd m2, m4 ; h sumsq + paddd m3, m5 + paddd m4, m2, [t1+r10*2+400*2] + paddd m5, m3, [t1+r10*2+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10*2+400*0] ; hv sum + paddd m4, [t2+r10*2+400*2] ; hv sumsq + paddd m5, [t2+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + mova [t0+r10*2+400*2], m2 + mova [t0+r10*2+400*4], m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires + vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. + mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but + vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. 
+ add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+400*0], m1 + paddw m1, m0 + mova [t1+r10*2+400*2], m4 + paddd m4, m2 + mova [t1+r10*2+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m0, [t1+r10*2+400*0] + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddw m1, m0, [t2+r10*2+400*0] + paddd m4, m2, [t2+r10*2+400*2] + paddd m5, m3, [t2+r10*2+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + ; a = 4096 - (ab & 4095) = -(ab | ~4095), so by + ; using OR instead of AND for the masking we get + ; the subtraction for free (with a negated result) + por m0, m15, m2 ; -a + psrld m2, 12 ; b + por m1, m15, m3 + psrld m3, 12 + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*4+32], m1 + mova [t3+r10*4+400*8+32], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 + paddd m3, m1 + por m0, m15, m2 + psrld m2, 12 + por m1, m15, m3 + psrld m3, 12 + paddd m4, m0, [t3+r10*4+400*4+ 0] ; -a + paddd m5, m1, [t3+r10*4+400*4+32] + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*4+32], m1 + paddd m0, m2, [t3+r10*4+400*8+ 0] ; b + paddd m1, m3, [t3+r10*4+400*8+32] + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*8+32], m3 + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; -a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; a * src + b + (1 << 8) + psubd m1, m5 + psrld m0, 9 + psrld m1, 9 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; -a * src + pmaddwd m5, m3, [t3+r10*4+400*4+32] + mova m0, [t3+r10*4+400*8+ 0] ; b + mova m1, [t3+r10*4+400*8+32] + packssdw m2, m3 + psubd m0, m4 ; a 
* src + b + (1 << 7) + psubd m1, m5 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, dst_strideq + ret -INIT_YMM avx2 -cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ - tmp_ptr, src_ptr, a_ptr, b_ptr, x, y - movifnidn wd, wm - mov hd, hm - vpbroadcastd m9, [pw_5_6] - vpbroadcastd m12, [pw_256] - psrlw m11, m12, 1 ; pw_128 - psrlw m10, m12, 8 ; pw_1 - xor xd, xd -.loop_x: - lea tmp_ptrq, [tq+xq*2] - lea src_ptrq, [srcq+xq*1] - lea a_ptrq, [aq+xq*4+(384+16)*4] - lea b_ptrq, [bq+xq*2+(384+16)*2] - movu m0, [aq+xq*4-(384+16)*4-4] - mova m1, [aq+xq*4-(384+16)*4] - movu m2, [aq+xq*4-(384+16)*4+4] - movu m3, [aq+xq*4-(384+16)*4-4+32] - mova m4, [aq+xq*4-(384+16)*4+32] - movu m5, [aq+xq*4-(384+16)*4+4+32] - paddd m0, m2 - paddd m3, m5 - paddd m0, m1 - paddd m3, m4 - pslld m2, m0, 2 - pslld m5, m3, 2 - paddd m2, m0 - paddd m5, m3 - paddd m0, m2, m1 ; prev_odd_b [first half] - paddd m1, m5, m4 ; prev_odd_b [second half] - movu m3, [bq+xq*2-(384+16)*2-2] - mova m4, [bq+xq*2-(384+16)*2] - movu m5, [bq+xq*2-(384+16)*2+2] - paddw m3, m5 - punpcklwd m5, m3, m4 - punpckhwd m3, m4 - pmaddwd m5, m9 - pmaddwd m3, m9 - packssdw m2, m5, m3 ; prev_odd_a - mov yd, hd -.loop_y: - movu m3, [a_ptrq-4] - mova m4, [a_ptrq] - movu m5, [a_ptrq+4] - movu m6, [a_ptrq+32-4] - mova m7, [a_ptrq+32] - movu m8, [a_ptrq+32+4] - paddd m3, m5 - paddd m6, m8 - paddd m3, m4 - paddd m6, m7 - pslld m5, m3, 2 - pslld m8, m6, 2 - paddd m5, m3 - paddd m8, m6 - paddd m3, m5, m4 ; cur_odd_b [first half] - paddd m4, m8, m7 ; cur_odd_b [second half] - movu m5, [b_ptrq-2] - mova m6, [b_ptrq] - movu m7, [b_ptrq+2] - paddw m5, m7 - punpcklwd m7, m5, m6 - punpckhwd m5, m6 - pmaddwd m7, m9 - pmaddwd m5, m9 - packssdw m5, m7, m5 ; cur_odd_a - - paddd m0, m3 ; cur_even_b [first half] - paddd m1, m4 ; cur_even_b [second half] - paddw m2, m5 ; cur_even_a - - pmovzxbw m6, [src_ptrq] - vperm2i128 m8, m0, m1, 0x31 - vinserti128 m0, xm1, 1 - punpcklwd m7, m6, m10 - punpckhwd m6, m10 - punpcklwd m1, m2, m12 - punpckhwd m2, m12 - pmaddwd m7, m1 - pmaddwd m6, m2 - paddd m7, m0 - paddd m6, m8 - psrad m7, 9 - psrad m6, 9 - - pmovzxbw m8, [src_ptrq+strideq] - punpcklwd m0, m8, m10 - punpckhwd m8, m10 - punpcklwd m1, m5, m11 - punpckhwd m2, m5, m11 - pmaddwd m0, m1 - pmaddwd m8, m2 - vinserti128 m2, m3, xm4, 1 - vperm2i128 m1, m3, m4, 0x31 - paddd m0, m2 - paddd m8, m1 - psrad m0, 8 - psrad m8, 8 - - packssdw m7, m6 - packssdw m0, m8 - mova [tmp_ptrq+384*2*0], m7 - mova [tmp_ptrq+384*2*1], m0 - - mova m0, m3 - mova m1, m4 - mova m2, m5 - add a_ptrq, (384+16)*4*2 - add b_ptrq, (384+16)*2*2 - add tmp_ptrq, 384*2*2 - lea src_ptrq, [src_ptrq+strideq*2] - sub yd, 2 - jg .loop_y - add xd, 16 - cmp xd, wd - jl .loop_x +cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r14-sgr_x_by_x-256*4 + mov paramsq, paramsmp + mov edged, r8m + mov wd, wm + mov hd, r6m + lea r14, [sgr_x_by_x+256*4] + vbroadcasti128 m8, [base+sgr_shuf+2] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+4] + lea t1, [rsp+wq*2+20] + vbroadcasti128 m10, [base+sgr_shuf+6] + add dstq, wq + vpbroadcastd m11, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [base+pd_0xf00801c7] + neg wq + vpbroadcastw m7, [paramsq+10] ; w1 + pxor m6, m6 + vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) 
+ psllw m7, 4 + vpbroadcastd m14, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea t4, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + add t4, lpf_strideq + mov [rsp+8*0], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 400*4 + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, dst_strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp+8*0] + call .hv_bottom + call .n + add lpfq, [rsp+8*1] + call .hv_bottom +.end: + call .n RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+lpf_strideq*4] + mov lpfq, dstq + mov [rsp+8*1], lpf_strideq + lea t4, [t4+lpf_strideq*2] + mov [rsp+8*0], t4 + call .h + lea t0, [t1+400*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -17 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right +.h_have_right: + pshufb m0, m5, m8 + pmullw m2, m0, m0 + pshufb m4, m5, m9 + paddw m0, m4 + pshufb m5, m10 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + mova [t1+r10*2+400*0], m0 + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -17 + jl .hv_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right +.hv_have_right: + pshufb m0, m5, m8 + pmullw m3, m0, m0 + pshufb m1, m5, m9 + paddw m0, m1 + pshufb m5, m10 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t2+r10*2+400*0] + paddw m1, [t1+r10*2+400*0] ; hv sum + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + paddd m4, m2 ; h sumsq + paddd m5, m3 + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddd m2, [t1+r10*2+400*2] ; hv sumsq + paddd m3, [t1+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + mova [t0+r10*2+400*2], m4 + pslld m4, m2, 3 + mova [t0+r10*2+400*4], m5 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 
+ pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m1, [t1+r10*2+400*0] + paddw m1, m1 + paddw m1, [t2+r10*2+400*0] ; hv sum + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddd m2, m2 + paddd m3, m3 + paddd m2, [t2+r10*2+400*2] ; hv sumsq + paddd m3, [t2+r10*2+400*4] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 400*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+400*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + add r10, 8 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. 
+ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m4, [t3+r10*4+ 0] + paddd m4, [t3+r10*4+ 8] + paddd m5, m4, [t3+r10*4+ 4] + paddd m5, m5 ; ab[+1] 222 + mova m2, [t3+r10*4+400*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+400*4+32] + paddd m1, m3, [t5+r10*4+32] + mova [t3+r10*4+400*4+ 0], m5 + paddd m5, m5 + psubd m5, m4 ; ab[+1] 343 + mova [t5+r10*4+ 0], m5 + paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 + mova m4, [t3+r10*4+32] + paddd m4, [t3+r10*4+40] + paddd m5, m4, [t3+r10*4+36] + paddd m5, m5 + mova [t3+r10*4+400*4+32], m5 + paddd m5, m5 + psubd m5, m4 + mova [t5+r10*4+32], m5 + por m4, m14, m0 + psrld m0, 12 + paddd m3, m5 + por m5, m14, m2 + psrld m2, 12 + paddd m4, m5 ; -a + por m5, m14, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + por m2, m14, m3 + psrld m3, 12 + paddd m5, m2 + pmovzxbd m2, [dstq+r10+0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; -a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; a * src + b + (1 << 8) + psubd m1, m5 + psrld m0, 9 + psrld m1, 9 + packssdw m0, m1 + psllw m1, m2, 4 + psubw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, dst_strideq + ret -INIT_YMM avx2 -cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt - movifnidn wd, wm - movifnidn hd, hm - vpbroadcastd m0, wtm - vpbroadcastd m10, [pd_1024] - DEFINE_ARGS dst, stride, t1, t2, w, h, idx -.loop_y: - xor idxd, idxd -.loop_x: - mova m1, [t1q+idxq*2+ 0] - mova m2, [t1q+idxq*2+32] - mova m3, [t2q+idxq*2+ 0] - mova m4, [t2q+idxq*2+32] - pmovzxbw m5, [dstq+idxq+ 0] - pmovzxbw m6, [dstq+idxq+16] - psllw m7, m5, 4 - psllw m8, m6, 4 - psubw m1, m7 - psubw m2, m8 - psubw m3, m7 - psubw m4, m8 - punpcklwd m9, m1, m3 - punpckhwd m1, m3 - punpcklwd m3, m2, m4 - punpckhwd m2, m4 - pmaddwd m9, m0 - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m2, m0 - paddd m9, m10 - paddd m1, m10 - paddd m3, m10 - paddd m2, m10 - psrad m9, 11 - psrad m1, 11 - psrad m3, 11 - psrad m2, 11 - packssdw m1, m9, m1 - packssdw m2, m3, m2 - paddw m1, m5 - paddw m2, m6 - packuswb m1, m2 - vpermq m1, m1, q3120 - mova [dstq+idxq], m1 - add idxd, 32 - cmp idxd, wd - jl .loop_x - add dstq, strideq - add t1q, 384 * 2 - add t2q, 384 * 2 - dec hd - jg .loop_y +cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h +%define base r12-sgr_x_by_x-256*4 + lea r12, [sgr_x_by_x+256*4] + mov paramsq, paramsmp + mov wd, wm + mov edged, r8m + mov hd, r6m + vbroadcasti128 m9, [base+sgr_shuf+0] + add lpfq, wq + vbroadcasti128 m10, [base+sgr_shuf+8] + lea t1, [rsp+wq*2+12] + vbroadcasti128 m11, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m12, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+400*24+8] + vpbroadcastd m15, [paramsq+8] ; w0 w1 + neg wq + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + psllw m15, 2 ; to reuse existing pd_m4096 register for rounding + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, lpf_strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + add r10, lpf_strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, dst_strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, dst_strideq + call 
.hv0 + test hd, hd + jz .odd_height + add lpfq, dst_strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, lpf_strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+lpf_strideq*4] + mov lpfq, dstq + lea r10, [r10+lpf_strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+400* 0] + mova m1, [t1+r10*2+400* 2] + mova m2, [t1+r10*2+400* 4] + paddw m0, m0 + mova m3, [t1+r10*2+400* 6] + paddd m1, m1 + mova m4, [t1+r10*2+400* 8] + paddd m2, m2 + mova m5, [t1+r10*2+400*10] + mova [t2+r10*2+400* 0], m0 + mova [t2+r10*2+400* 2], m1 + mova [t2+r10*2+400* 4], m2 + mova [t2+r10*2+400* 6], m3 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + add r10, 16 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right +.h_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m0, m6, m4, q2121 + pmullw m3, m0, m0 + pshufb m2, m5, m11 + paddw m0, m2 + pshufb m5, m12 + paddw m0, m5 ; sum3 + punpcklwd m1, m2, m5 + pmaddwd m1, m1 + punpckhwd m2, m5 + pmaddwd m2, m2 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m3, m7 + paddd m1, m4 ; sumsq3 + punpckhwd m3, m7 + paddd m2, m3 + mova [t1+r10*2+400* 6], m0 + mova [t1+r10*2+400* 8], m1 + mova [t1+r10*2+400*10], m2 + paddw m8, m0 ; sum5 + paddd m5, m1 ; sumsq5 + paddd m6, m2 + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu xm5, [lpfq+r10-2] +.hv0_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -18 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right +.hv0_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m1, m6, m4, q2121 + pmullw m0, m1, m1 + pshufb m3, m5, m11 + paddw m1, m3 + pshufb m5, m12 + paddw m1, m5 ; sum3 + punpcklwd m2, m3, m5 + pmaddwd m2, m2 + punpckhwd m3, m5 + pmaddwd m3, m3 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m0, m7 + paddd m2, m4 ; sumsq3 + punpckhwd m0, m7 + paddd m3, m0 + paddw 
m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row + mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*4+400*0+40], m6 + paddw m8, [t1+r10*2+400* 0] + paddd m5, [t1+r10*2+400* 2] + paddd m6, [t1+r10*2+400* 4] + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + paddw m0, m1, [t1+r10*2+400* 6] + paddd m4, m2, [t1+r10*2+400* 8] + paddd m5, m3, [t1+r10*2+400*10] + mova [t1+r10*2+400* 6], m1 + mova [t1+r10*2+400* 8], m2 + mova [t1+r10*2+400*10], m3 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m5, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m5 + pand m1, m5 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu xm5, [lpfq+r10-2] +.hv1_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -18 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right +.hv1_have_right: + pshufb m6, m5, m9 + pshufb m3, m5, m10 + paddw m8, m6, m3 + shufps m2, m6, m3, q2121 + pmullw m1, m2, m2 + pshufb m0, m5, m11 + paddw m2, m0 + pshufb m5, m12 + paddw m2, m5 ; sum3 + punpcklwd m4, m5, m0 + pmaddwd m4, m4 + punpckhwd m5, m0 + pmaddwd m5, m5 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + punpcklwd m3, m1, m7 + paddd m4, m3 ; sumsq3 + punpckhwd m1, m7 + paddd m5, m1 + paddw m1, m2, [t2+r10*2+400* 6] + mova [t2+r10*2+400* 6], m2 + paddw m8, m2 ; sum5 + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + paddd m4, m0 ; sumsq5 + paddd m5, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m6, m2, 3 + pslld m7, m3, 3 + paddd m6, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m7, m3 + pmaddwd m3, m1, m1 + psubd m6, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m7, m3 + pmulld m6, m14 ; p3 * s1 + pmulld m7, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m6, m2 + paddusw m7, m2 + psrad m3, m6, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m6 + psrad m6, m7, 20 + vpgatherdd m3, [r12+m6*4], m7 + vpbroadcastd 
m6, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m7, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m7 + pand m7, m1 + por m0, m2 ; a3 | (b3 << 12) + por m7, m3 + paddw m1, m8, [t2+r10*2+400*0] + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m8 + mova [t2+r10*2+400*2], m4 + mova [t2+r10*2+400*4], m5 + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm7 + vextracti128 [t3+r10*4+400*8+56], m7, 1 + vpbroadcastd m4, [base+pd_25] + pxor m7, m7 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] + vpbroadcastd m6, [base+pd_34816] + vpbroadcastd m8, [base+pd_m4096] +.v0_loop: + mova m0, [t1+r10*2+400* 6] + mova m4, [t1+r10*2+400* 8] + mova m5, [t1+r10*2+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m8 + pand m1, m8 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova m2, [t1+r10*2+400*0] + mova m3, [t1+r10*2+400*2] + mova m4, [t1+r10*2+400*4] + mova [t3+r10*4+400*8+ 8], m2 + mova [t3+r10*4+400*0+ 8], m3 + mova [t3+r10*4+400*0+40], m4 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m4, m4 + mova [t1+r10*2+400*0], m2 + mova [t1+r10*2+400*2], m3 + mova [t1+r10*2+400*4], m4 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m4, [t1+r10*2+400* 6] + mova m5, [t1+r10*2+400* 8] + mova m6, [t1+r10*2+400*10] + paddw m1, m4, [t2+r10*2+400* 6] + paddd m2, m5, [t2+r10*2+400* 8] + paddd m3, m6, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m4 + mova [t2+r10*2+400* 8], m5 + mova 
[t2+r10*2+400*10], m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m8, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m8 + pand m8, m1 + por m0, m2 ; a3 | (b3 << 12) + por m8, m3 + mova m4, [t3+r10*4+400*8+ 8] + mova m5, [t3+r10*4+400*0+ 8] + mova m6, [t3+r10*4+400*0+40] + paddw m1, m4, [t2+r10*2+400*0] + paddd m2, m5, [t2+r10*2+400*2] + paddd m3, m6, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m4 + mova [t2+r10*2+400*2], m5 + mova [t2+r10*2+400*4], m6 + vpbroadcastd m4, [base+pd_25] + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm8 + vextracti128 [t3+r10*4+400*8+56], m8, 1 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + vpbroadcastd m6, [base+pd_34816] + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+400*0+4] + paddd m1, m0, [t3+r10*4+400*0+0] + mova m4, [t3+r10*4+400*4+0] + paddd m1, [t3+r10*4+400*0+8] + mova m5, [t3+r10*4+400*8+0] + paddd m4, [t3+r10*4+400*4+8] + paddd m5, [t3+r10*4+400*8+8] + paddd m2, m4, [t3+r10*4+400*4+4] + paddd m3, m5, [t3+r10*4+400*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m4 ; ab3[-1] 343 + mova [t3+r10*4+400*20], m3 + por m0, m6, m1 ; a5 565 + mova [t3+r10*4+400*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+400*12], m0 + paddd m3, m3 + mova [t3+r10*4+400*16], m1 + psubd m3, m5 ; ab3[ 0] 343 + mova [t3+r10*4+400*28], m3 + add r10, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+4] + paddd m4, m0, [t3+r10*4+0] + paddd m4, [t3+r10*4+8] + paddd m0, m4 + pslld m4, 2 + paddd m4, m0 + por m0, m6, m4 + psrld m4, 12 + paddd m2, m0, [t3+r10*4+400*12] ; -a5 + mova [t3+r10*4+400*12], m0 + paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) + mova [t3+r10*4+400*16], m4 + mova m3, [t3+r10*4+400*4+0] + paddd m3, [t3+r10*4+400*4+8] + paddd m5, m3, [t3+r10*4+400*4+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, 
[t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*24], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + por m3, m6, m1 + psrld m1, 12 + por m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4 ; -a5 * src + pmaddwd m3, m4 ; -a3 * src + pslld m4, 13 + psubd m0, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 8) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m4, m6 + paddd m0, m4 + psrad m0, 13 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n0_loop + add dstq, dst_strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t3+r10*4+400*8+0] + paddd m3, [t3+r10*4+400*8+8] + paddd m5, m3, [t3+r10*4+400*8+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*28], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + por m3, m6, m1 + psrld m1, 12 + por m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src + mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) + pmaddwd m3, m4 ; -a3 * src + pslld m4, 12 + psubd m0, m4 + paddd m4, m4 + psubd m1, m4 + psubd m0, m2 ; a5 * src + b5 + (1 << 7) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m4, m6 + paddd m0, m4 + psrad m0, 13 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n1_loop + add dstq, dst_strideq + ret %endif ; ARCH_X86_64 From 09f8b5fc18656675775370bff1cc6343b33d8030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 12 Feb 2021 14:21:56 +0200 Subject: [PATCH 124/155] arm64: looprestoration16: Fix parameter reading from the stack on darwin On darwin, 32 bit parameters that aren't passed in registers but on the stack, are packed tightly instead of each of them occupying an 8 byte slot. --- src/arm/64/looprestoration16.S | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/arm/64/looprestoration16.S b/src/arm/64/looprestoration16.S index fb41cf96f1..69efa4ecc3 100644 --- a/src/arm/64/looprestoration16.S +++ b/src/arm/64/looprestoration16.S @@ -53,7 +53,11 @@ endconst // const int bitdepth_max); function wiener_filter7_16bpc_neon, export=1 ldr w8, [sp] +#ifdef __APPLE__ + ldr w9, [sp, #4] +#else ldr w9, [sp, #8] +#endif stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp @@ -596,10 +600,15 @@ endfunc // const pixel *lpf, const ptrdiff_t lpf_stride, // const int w, int h, // const int16_t filter[2][8], -// const enum LrEdgeFlags edges); +// const enum LrEdgeFlags edges, +// const int bitdepth_max); function wiener_filter5_16bpc_neon, export=1 ldr w8, [sp] +#ifdef __APPLE__ + ldr w9, [sp, #4] +#else ldr w9, [sp, #8] +#endif stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp From 6daac9ec5a4bc31d3b0b41005310d2e4b9d377bc Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sat, 16 Jan 2021 09:06:09 -0500 Subject: [PATCH 125/155] Add bpc suffix to cdef functions --- src/x86/cdef_avx2.asm | 8 ++++---- src/x86/cdef_sse.asm | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/x86/cdef_avx2.asm b/src/x86/cdef_avx2.asm index 685a1274a1..f274a1d631 100644 --- a/src/x86/cdef_avx2.asm +++ b/src/x86/cdef_avx2.asm @@ -39,7 +39,7 @@ %endmacro %macro CDEF_FILTER_JMP_TABLE 1 -JMP_TABLE cdef_filter_%1, \ +JMP_TABLE cdef_filter_%1_8bpc, \ d6k0, d6k1, d7k0, d7k1, \ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ @@ -94,7 +94,7 @@ SECTION .text %macro PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] mov dird, r6m - lea tableq, [cdef_filter_%1x%2_jmptable] + lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] lea dirq, [tableq+dirq*2*4] %if %1 == 4 %if %2 == 4 @@ -397,7 +397,7 @@ SECTION .text %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ +cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \ pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem @@ -1592,7 +1592,7 @@ CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 -cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 +cglobal cdef_dir_8bpc, 3, 4, 15, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] diff --git a/src/x86/cdef_sse.asm b/src/x86/cdef_sse.asm index 9335e727bc..4c335aba21 100644 --- a/src/x86/cdef_sse.asm +++ b/src/x86/cdef_sse.asm @@ -249,13 +249,13 @@ SECTION .text %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 -cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \ - dst, stride, left, top, pri, sec, edge, stride3, dst4 +cglobal cdef_filter_%1x%2_8bpc, 4, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, pri, sec, edge, stride3, dst4 %define px rsp+3*16+2*32 %define base 0 %else -cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ - dst, stride, left, edge, stride3 +cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ + dst, stride, left, edge, stride3 %define topq r2 %define dst4q r2 LEA r5, tap_table From 010a908e034b28c45819b597f539716bce827827 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Sat, 16 Jan 2021 11:13:41 -0500 Subject: [PATCH 126/155] x86: cdef: Add SIMD implementation of cdef_dir for 16bpc Relative speed-ups over C code (compared with gcc-9.3.0): C ASM cdef_dir_16bpc_avx2: 534.2 72.5 7.36x cdef_dir_16bpc_ssse3: 534.2 104.8 5.10x cdef_dir_16bpc_ssse3 (x86-32): 854.1 116.2 7.35x --- src/x86/cdef16_avx2.asm | 62 +++++++++++++++---------- src/x86/cdef16_sse.asm | 100 +++++++++++++++++++++++----------------- src/x86/cdef_avx512.asm | 10 ++-- 3 files changed, 100 insertions(+), 72 deletions(-) diff --git a/src/x86/cdef16_avx2.asm b/src/x86/cdef16_avx2.asm index 9491d81431..27b64cf261 100644 --- a/src/x86/cdef16_avx2.asm +++ b/src/x86/cdef16_avx2.asm @@ -2,12 +2,26 @@ ; Copyright (c) 2021, Nathan Egge ; All rights reserved. ; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" @@ -20,31 +34,31 @@ cextern cdef_dir_8bpc_avx2 INIT_YMM avx2 cglobal cdef_dir_16bpc, 4, 4, 3, 32 + 8*8, src, ss, var, bdmax - popcnt bdmaxd, bdmaxd - movzx bdmaxq, bdmaxw - sub bdmaxq, 8 - movq xm2, bdmaxq + popcnt bdmaxd, bdmaxd + movzx bdmaxq, bdmaxw + sub bdmaxq, 8 + movq xm2, bdmaxq DEFINE_ARGS src, ss, var, ss3 - lea ss3q, [ssq*3] - mova xm0, [srcq + ssq*0] - mova xm1, [srcq + ssq*1] + lea ss3q, [ssq*3] + mova xm0, [srcq + ssq*0] + mova xm1, [srcq + ssq*1] vinserti128 m0, [srcq + ssq*2], 1 vinserti128 m1, [srcq + ss3q], 1 - psraw m0, xm2 - psraw m1, xm2 - vpackuswb m0, m1 + psraw m0, xm2 + psraw m1, xm2 + vpackuswb m0, m1 mova [rsp + 32 + 0*8], m0 - lea srcq, [srcq + ssq*4] - mova xm0, [srcq + ssq*0] - mova xm1, [srcq + ssq*1] + lea srcq, [srcq + ssq*4] + mova xm0, [srcq + ssq*0] + mova xm1, [srcq + ssq*1] vinserti128 m0, [srcq + ssq*2], 1 vinserti128 m1, [srcq + ss3q], 1 - psraw m0, xm2 - psraw m1, xm2 - vpackuswb m0, m1 + psraw m0, xm2 + psraw m1, xm2 + vpackuswb m0, m1 mova [rsp + 32 + 4*8], m0 - lea srcq, [rsp + 32] ; WIN64 shadow space - mov ssq, 8 + lea srcq, [rsp + 32] ; WIN64 shadow space + mov ssq, 8 call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) RET diff --git a/src/x86/cdef16_sse.asm b/src/x86/cdef16_sse.asm index ced7621498..f89c8c4088 100644 --- a/src/x86/cdef16_sse.asm +++ b/src/x86/cdef16_sse.asm @@ -2,12 +2,26 @@ ; Copyright (c) 2021, Nathan Egge ; All rights reserved. ; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" @@ -24,56 +38,56 @@ cextern cdef_dir_8bpc_ssse3 INIT_XMM ssse3 cglobal cdef_dir_16bpc, 2, 4, 4, 32 + 8*8, src, ss, var, bdmax - bsr bdmaxd, bdmaxm + bsr bdmaxd, bdmaxm %if ARCH_X86_64 movzx bdmaxq, bdmaxw - sub bdmaxq, 7 - movq m4, bdmaxq + sub bdmaxq, 7 + movq m4, bdmaxq %else - push r4 - sub bdmaxd, 9 - LEA r4, pq_dir_shr - movq m4, [r4 + bdmaxd*4] - pop r4 + push r4 + sub bdmaxd, 9 + LEA r4, pq_dir_shr + movq m4, [r4 + bdmaxd*4] + pop r4 %endif DEFINE_ARGS src, ss, var, ss3 - lea ss3q, [ssq*3] - mova m0, [srcq + ssq*0] - mova m1, [srcq + ssq*1] - mova m2, [srcq + ssq*2] - mova m3, [srcq + ss3q] - psraw m0, m4 - psraw m1, m4 - psraw m2, m4 - psraw m3, m4 - packuswb m0, m1 - packuswb m2, m3 + lea ss3q, [ssq*3] + mova m0, [srcq + ssq*0] + mova m1, [srcq + ssq*1] + mova m2, [srcq + ssq*2] + mova m3, [srcq + ss3q] + psraw m0, m4 + psraw m1, m4 + psraw m2, m4 + psraw m3, m4 + packuswb m0, m1 + packuswb m2, m3 mova [rsp + 32 + 0*8], m0 mova [rsp + 32 + 2*8], m2 - lea srcq, [srcq + ssq*4] - mova m0, [srcq + ssq*0] - mova m1, [srcq + ssq*1] - mova m2, [srcq + ssq*2] - mova m3, [srcq + ss3q] - psraw m0, m4 - psraw m1, m4 - psraw m2, m4 - psraw m3, m4 - packuswb m0, m1 - packuswb m2, m3 + lea srcq, [srcq + ssq*4] + mova m0, [srcq + ssq*0] + mova m1, [srcq + ssq*1] + mova m2, [srcq + ssq*2] + mova m3, [srcq + ss3q] + psraw m0, m4 + psraw m1, m4 + psraw m2, m4 + psraw m3, m4 + packuswb m0, m1 + packuswb m2, m3 mova [rsp + 32 + 4*8], m0 mova [rsp + 32 + 6*8], m2 - lea srcq, [rsp + 32] ; WIN64 shadow space - mov ssq, 8 + lea srcq, [rsp + 32] ; WIN64 shadow space + mov ssq, 8 %if ARCH_X86_64 call mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX) %else movifnidn vard, varm - push eax ; align stack - push vard - push ssd - push srcd + push eax ; align stack + push vard + push ssd + push srcd call mangle(private_prefix %+ _cdef_dir_8bpc) - add esp, 0x10 + add esp, 0x10 %endif RET diff --git a/src/x86/cdef_avx512.asm b/src/x86/cdef_avx512.asm index b1fa1ad16f..af2bb6e66f 100644 --- a/src/x86/cdef_avx512.asm +++ b/src/x86/cdef_avx512.asm @@ -109,7 +109,8 @@ DECLARE_REG_TMP 8, 5 ; 5e 5f 50 51 52 53 54 55 INIT_ZMM avx512icl -cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +cglobal cdef_filter_4x4_8bpc, 4, 8, 13, dst, stride, left, top, \ + pri, sec, dir, damping, edge %define base r7-edge_mask movq xmm0, [dstq+strideq*0] movhps xmm0, [dstq+strideq*1] @@ -269,8 +270,7 @@ DECLARE_REG_TMP 2, 7 ; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 
; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 -cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ - pri, sec, dir, damping, edge +cglobal cdef_filter_4x8_8bpc, 4, 9, 22, dst, stride, left, top, pri, sec, dir, damping, edge %define base r8-edge_mask vpbroadcastd ym21, strided mov r6d, edgem @@ -504,8 +504,8 @@ ALIGN function_align ; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b ; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b -cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ - pri, sec, dir, damping, edge +cglobal cdef_filter_8x8_8bpc, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge %define base r8-edge_mask mov r6d, edgem lea r10, [dstq+strideq*4-2] From 4acf628bc7643cc0168edb409ad8461db4d8c8f5 Mon Sep 17 00:00:00 2001 From: "Nathan E. Egge" Date: Tue, 29 Dec 2020 06:58:33 -0500 Subject: [PATCH 127/155] Add bpc suffix to lr functions --- src/x86/looprestoration.asm | 36 ++++++++++---------- src/x86/looprestoration_sse.asm | 60 ++++++++++++++++----------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/x86/looprestoration.asm b/src/x86/looprestoration.asm index 44aaaf49c7..71e3e0d225 100644 --- a/src/x86/looprestoration.asm +++ b/src/x86/looprestoration.asm @@ -88,8 +88,8 @@ SECTION .text DECLARE_REG_TMP 4, 9, 7, 11, 12, 13, 14 ; ring buffer pointers INIT_YMM avx2 -cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h +cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h mov fltq, fltmp mov edged, r8m mov wd, wm @@ -436,8 +436,8 @@ ALIGN function_align add dstq, dst_strideq ret -cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h +cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h mov fltq, fltmp mov edged, r8m mov wd, wm @@ -554,7 +554,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ jnz .h_have_right cmp r10d, -33 jl .h_have_right - call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .h_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 @@ -613,7 +613,7 @@ ALIGN function_align jnz .hv_have_right cmp r10d, -33 jl .hv_have_right - call mangle(private_prefix %+ _wiener_filter7_avx2).extend_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 @@ -727,8 +727,8 @@ ALIGN function_align jl .v_loop ret -cglobal sgr_filter_5x5, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_5x5_8bpc, 5, 13, 16, 400*24+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h %define base r12-sgr_x_by_x-256*4 lea r12, [sgr_x_by_x+256*4] mov paramsq, paramsmp @@ -1187,8 +1187,8 @@ ALIGN function_align add dstq, dst_strideq ret -cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_3x3_8bpc, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h %define base r14-sgr_x_by_x-256*4 mov paramsq, paramsmp mov edged, r8m @@ -1298,7 +1298,7 @@ cglobal sgr_filter_3x3, 5, 15, 15, -400*28-16, dst, dst_stride, left, lpf, \ jnz .h_have_right cmp r10d, -17 jl .h_have_right - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right + call mangle(private_prefix 
%+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m0, m5, m8 pmullw m2, m0, m0 @@ -1346,7 +1346,7 @@ ALIGN function_align jnz .hv_have_right cmp r10d, -17 jl .hv_have_right - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m5, m8 pmullw m3, m0, m0 @@ -1546,8 +1546,8 @@ ALIGN function_align add dstq, dst_strideq ret -cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, params, h +cglobal sgr_filter_mix_8bpc, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, params, h %define base r12-sgr_x_by_x-256*4 lea r12, [sgr_x_by_x+256*4] mov paramsq, paramsmp @@ -1573,7 +1573,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ call .h_top add lpfq, lpf_strideq mov t2, t1 - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).top_fixup + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup add t1, 400*12 call .h_top lea r10, [lpfq+lpf_strideq*4] @@ -1681,7 +1681,7 @@ cglobal sgr_filter_mix, 5, 13, 16, 400*56+8, dst, dst_stride, left, lpf, \ jnz .h_have_right cmp r10d, -18 jl .h_have_right - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 @@ -1742,7 +1742,7 @@ ALIGN function_align jnz .hv0_have_right cmp r10d, -18 jl .hv0_have_right - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv0_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 @@ -1853,7 +1853,7 @@ ALIGN function_align jnz .hv1_have_right cmp r10d, -18 jl .hv1_have_right - call mangle(private_prefix %+ _sgr_filter_5x5_avx2).extend_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv1_have_right: pshufb m6, m5, m9 pshufb m3, m5, m10 diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm index 5d3ca49211..4b77138d70 100644 --- a/src/x86/looprestoration_sse.asm +++ b/src/x86/looprestoration_sse.asm @@ -97,8 +97,8 @@ SECTION .text %macro WIENER 0 %if ARCH_X86_64 DECLARE_REG_TMP 4, 10, 7, 11, 12, 13, 14 ; ring buffer pointers -cglobal wiener_filter7, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h, x +cglobal wiener_filter7_8bpc, 5, 15, 16, -384*12-16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x %define base 0 mov fltq, fltmp mov edged, r8m @@ -139,7 +139,7 @@ DECLARE_REG_TMP 4, 0, _, 5 %define m11 [stk+96] %define stk_off 112 %endif -cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride +cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride %define base r6-pb_right_ext_mask-21 %define stk esp %define dstq leftq @@ -245,7 +245,7 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride add lpfq, [rsp+gprsize*1] call .hv_bottom .v1: - call mangle(private_prefix %+ _wiener_filter7_ssse3).v + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v RET .no_top: lea t3, [lpfq+lpf_strideq*4] @@ -281,9 +281,9 @@ cglobal wiener_filter7, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, lpf_stride dec hd jnz .main .v3: - call mangle(private_prefix %+ _wiener_filter7_ssse3).v + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v .v2: - call mangle(private_prefix %+ _wiener_filter7_ssse3).v + call 
mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v jmp .v1 .extend_right: movd m2, [lpfq-4] @@ -685,8 +685,8 @@ ALIGN function_align %endif %if ARCH_X86_64 -cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ - lpf_stride, w, edge, flt, h, x +cglobal wiener_filter5_8bpc, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ + lpf_stride, w, edge, flt, h, x mov fltq, fltmp mov edged, r8m mov wd, wm @@ -720,7 +720,7 @@ cglobal wiener_filter5, 5, 13, 16, 384*8+16, dst, dst_stride, left, lpf, \ %define m11 [stk+80] %define stk_off 96 %endif -cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride +cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride %define stk esp %define leftmp [stk+28] %define m8 [base+pw_m16380] @@ -827,14 +827,14 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride dec hd jnz .main .v2: - call mangle(private_prefix %+ _wiener_filter5_ssse3).v + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v add dstq, dst_strideq mov t4, t3 mov t3, t2 mov t2, t1 movifnidn dstmp, dstq .v1: - call mangle(private_prefix %+ _wiener_filter5_ssse3).v + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v jmp .end .h: %define stk esp+4 @@ -873,7 +873,7 @@ cglobal wiener_filter5, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, lpf_stride jnz .h_have_right cmp xd, -17 jl .h_have_right - call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right .h_have_right: %macro %%h5 0 %if cpuflag(ssse3) @@ -991,7 +991,7 @@ ALIGN function_align jnz .hv_have_right cmp xd, -17 jl .hv_have_right - call mangle(private_prefix %+ _wiener_filter7 %+ SUFFIX).extend_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right .hv_have_right: %%h5 mova m2, [t3+xq*2] @@ -1161,7 +1161,7 @@ WIENER %endmacro %if ARCH_X86_64 -cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim +cglobal sgr_box3_h_8bpc, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim mov xlimd, edgem movifnidn xd, xm mov hd, hm @@ -1170,7 +1170,7 @@ cglobal sgr_box3_h, 5, 11, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim add xd, xlimd xor xlimd, 2 ; 2*!have_right %else -cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim +cglobal sgr_box3_h_8bpc, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim %define wq r0m %define xlimd r1m %define hd hmp @@ -1287,10 +1287,10 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim RET %if ARCH_X86_64 -cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim +cglobal sgr_box3_v_8bpc, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim movifnidn edged, edgem %else -cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y +cglobal sgr_box3_v_8bpc, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y %define sumsq_baseq dword [esp+0] %define sum_baseq dword [esp+4] %define ylimd dword [esp+8] @@ -1383,7 +1383,7 @@ cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y jl .loop_x RET -cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s +cglobal sgr_calc_ab1_8bpc, 4, 7, 12, a, b, w, h, s movifnidn sd, sm sub aq, (384+16-1)*4 sub bq, (384+16-1)*2 @@ -1463,8 +1463,8 @@ cglobal sgr_calc_ab1, 4, 7, 12, a, b, w, h, s RET %if ARCH_X86_64 -cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ - tmp_base, src_base, a_base, b_base, x, y +cglobal 
sgr_finish_filter1_8bpc, 5, 13, 16, t, src, stride, a, b, w, h, \ + tmp_base, src_base, a_base, b_base, x, y movifnidn wd, wm mov hd, hm mova m15, [pw_16] @@ -1474,7 +1474,7 @@ cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \ mov b_baseq, bq xor xd, xd %else -cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y +cglobal sgr_finish_filter1_8bpc, 7, 7, 8, -144, t, src, stride, a, b, x, y %define tmp_baseq [esp+8] %define src_baseq [esp+12] %define a_baseq [esp+16] @@ -1688,7 +1688,7 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y jl .loop_x RET -cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt +cglobal sgr_weighted1_8bpc, 4, 7, 8, dst, stride, t, w, h, wt movifnidn hd, hm %if ARCH_X86_32 SETUP_PIC r6, 0 @@ -1726,14 +1726,14 @@ cglobal sgr_weighted1, 4, 7, 8, dst, stride, t, w, h, wt RET %if ARCH_X86_64 -cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim +cglobal sgr_box5_h_8bpc, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xlim mov edged, edgem movifnidn wd, wm mov hd, hm mova m10, [pb_0] mova m11, [pb_0_1] %else -cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge +cglobal sgr_box5_h_8bpc, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge %define edgeb byte edgem %define wd xd %define wq wd @@ -1909,11 +1909,11 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge RET %if ARCH_X86_64 -cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim +cglobal sgr_box5_v_8bpc, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim movifnidn edged, edgem mov ylimd, edged %else -cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr +cglobal sgr_box5_v_8bpc, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr %define wm [esp+0] %define hm [esp+4] %define edgem [esp+8] @@ -2127,7 +2127,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr jmp .sum_loop_y_noload %endif -cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s +cglobal sgr_calc_ab2_8bpc, 4, 7, 11, a, b, w, h, s movifnidn sd, sm sub aq, (384+16-1)*4 sub bq, (384+16-1)*2 @@ -2205,7 +2205,7 @@ cglobal sgr_calc_ab2, 4, 7, 11, a, b, w, h, s RET %if ARCH_X86_64 -cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ +cglobal sgr_finish_filter2_8bpc, 5, 13, 14, t, src, stride, a, b, w, h, \ tmp_base, src_base, a_base, b_base, x, y movifnidn wd, wm mov hd, hm @@ -2219,7 +2219,7 @@ cglobal sgr_finish_filter2, 5, 13, 14, t, src, stride, a, b, w, h, \ psrlw m11, m12, 1 ; pw_128 pxor m13, m13 %else -cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y +cglobal sgr_finish_filter2_8bpc, 6, 7, 8, t, src, stride, a, b, x, y %define tmp_baseq r0m %define src_baseq r1m %define a_baseq r3m @@ -2378,7 +2378,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y RET %undef t2 -cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt +cglobal sgr_weighted2_8bpc, 4, 7, 12, dst, stride, t1, t2, w, h, wt movifnidn wd, wm movd m0, wtm %if ARCH_X86_64 From 8cd9e73869cb4907b1c24cb57e692510db7f6d64 Mon Sep 17 00:00:00 2001 From: "Nathan E. 
Egge" Date: Sun, 10 Jan 2021 14:12:10 -0500 Subject: [PATCH 128/155] x86: lr: Add AVX2 implementation of wiener filter for 16 bpc Relative speed-ups over C code (compared with gcc-9.3.0): C AVX2 wiener_5tap_10bpc: 194892.0 14831.9 13.14x wiener_5tap_12bpc: 194295.4 14828.9 13.10x wiener_7tap_10bpc: 194391.7 19461.4 9.99x wiener_7tap_12bpc: 194136.1 19418.7 10.00x --- src/x86/looprestoration16_avx2.asm | 648 +++++++++++++++-------------- 1 file changed, 331 insertions(+), 317 deletions(-) diff --git a/src/x86/looprestoration16_avx2.asm b/src/x86/looprestoration16_avx2.asm index 2012860b8b..4eb1b8056c 100644 --- a/src/x86/looprestoration16_avx2.asm +++ b/src/x86/looprestoration16_avx2.asm @@ -2,12 +2,26 @@ ; Copyright (c) 2021, Nathan Egge ; All rights reserved. ; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" @@ -16,16 +30,16 @@ SECTION_RODATA 32 -wiener5_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 -wiener5_shufB: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 -wiener5_shufC: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 +wiener5_shufB: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +wiener5_shufC: db 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11, 14, 15, 12, 13 +wiener5_shufD: db 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1, 10, 11, -1, -1 wiener5_l_shuf: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -wiener7_shufB: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 -wiener7_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 -wiener7_shufD: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 +wiener7_shufC: db 4, 5, 2, 3, 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9 +wiener7_shufD: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +wiener7_shufE: db 8, 9, -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1 rev_w: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 rev_d: db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 wiener7_l_shuf: db 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 @@ -52,90 +66,90 @@ SECTION .text INIT_YMM avx2 cglobal wiener_filter5_h_16bpc, 6, 9, 14, dst, left, src, ss, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - vbroadcasti128 m6, [wiener5_shufA] - vpbroadcastd m12, [fq + 2] - vbroadcasti128 m7, [wiener5_shufB] - vpbroadcastw m13, [fq + 6] - vbroadcasti128 m8, [wiener5_shufC] - popcnt bdmaxd, bdmaxm - vpbroadcastd m9, [pd_65540] - movq xm10, [pq_3] - cmp bdmaxd, 10 + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + vbroadcasti128 m6, [wiener5_shufB] + vpbroadcastd m12, [fq + 2] + vbroadcasti128 m7, [wiener5_shufC] + vpbroadcastw m13, [fq + 6] + vbroadcasti128 m8, [wiener5_shufD] + popcnt bdmaxd, bdmaxm + vpbroadcastd m9, [pd_65540] + movq xm10, [pq_3] + cmp bdmaxd, 10 je .bits10 - vpbroadcastd m9, [pd_262160] - movq xm10, [pq_5] + vpbroadcastd m9, [pd_262160] + movq xm10, [pq_5] .bits10: - pxor m11, m11 - add wq, wq - add srcq, wq - add dstq, wq - neg wq + pxor m11, m11 + add wq, wq + add srcq, wq + add dstq, wq + neg wq DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x .v_loop: - mov xq, wq - test edgeb, 1 ; LR_HAVE_LEFT + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left - test leftq, leftq + test leftq, leftq jz .h_loop - movd xm4, [leftq + 4] - vpblendd m4, [srcq + xq - 4], 0xfe - add leftq, 8 + movd xm4, [leftq + 4] + vpblendd m4, [srcq + xq - 4], 0xfe + add leftq, 8 jmp .h_main .h_extend_left: vbroadcasti128 m5, [srcq + xq] - mova m4, [srcq + xq] - palignr m4, m5, 12 - pshufb m4, [wiener5_l_shuf] + mova m4, [srcq + xq] + palignr m4, m5, 12 + pshufb m4, [wiener5_l_shuf] jmp .h_main .h_loop: - movu m4, [srcq + xq - 4] + movu m4, [srcq + xq - 4] .h_main: - movu m5, [srcq + xq + 4] - test edgeb, 2 ; LR_HAVE_RIGHT + movu m5, [srcq + xq + 4] + test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right - cmp xd, -18*2 + cmp xd, -36 jl .h_have_right - movd xm2, xd - vpbroadcastd m0, [pb_wiener5_l] - vpbroadcastd m1, [pb_wiener5_r] - vpbroadcastb m2, xm2 - movu m3, [pb_0to31] - psubb m0, m2 - psubb m1, m2 - pminub m0, m3 - pminub m1, m3 - pshufb m4, m0 - pshufb m5, m1 + movd xm2, xd + vpbroadcastd m0, [pb_wiener5_l] + vpbroadcastd m1, [pb_wiener5_r] + 
vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 .h_have_right: - pshufb m0, m4, m6 - pshufb m2, m4, m7 - paddw m0, m2 - pmaddwd m0, m12 - pshufb m1, m5, m6 - pshufb m3, m5, m7 - paddw m1, m3 - pmaddwd m1, m12 - pshufb m4, m8 - pmaddwd m4, m13 - pshufb m5, m8 - pmaddwd m5, m13 - paddd m0, m4 - paddd m1, m5 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm10 - psrad m1, xm10 - packssdw m0, m1 - pmaxsw m0, m11 - mova [dstq + xq], m0 - add xq, 32 + pshufb m0, m4, m6 + pshufb m2, m4, m7 + paddw m0, m2 + pmaddwd m0, m12 + pshufb m1, m5, m6 + pshufb m3, m5, m7 + paddw m1, m3 + pmaddwd m1, m12 + pshufb m4, m8 + pmaddwd m4, m13 + pshufb m5, m8 + pmaddwd m5, m13 + paddd m0, m4 + paddd m1, m5 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm10 + psrad m1, xm10 + packssdw m0, m1 + pmaxsw m0, m11 + mova [dstq + xq], m0 + add xq, 32 jl .h_loop - add srcq, ssq - add dstq, 384*2 - dec hd + add srcq, ssq + add dstq, 384*2 + dec hd jg .v_loop RET @@ -143,323 +157,323 @@ DECLARE_REG_TMP 8, 9, 10, 11, 12, 13, 14 INIT_YMM avx2 cglobal wiener_filter5_v_16bpc, 6, 13, 12, dst, ds, mid, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm + movifnidn wd, wm + movifnidn hd, hm movifnidn edgeb, edgem - pxor m6, m6 + pxor m6, m6 vpbroadcastd m7, [fq + 2] vpbroadcastd m8, [fq + 6] - popcnt bdmaxd, bdmaxm + popcnt bdmaxd, bdmaxm vpbroadcastd m9, [nd_1047552] - movq xm10, [pq_11] - cmp bdmaxd, 10 + movq xm10, [pq_11] + cmp bdmaxd, 10 je .bits10 vpbroadcastd m9, [nd_1048320] - movq xm10, [pq_9] + movq xm10, [pq_9] .bits10: vpbroadcastw m11, bdmaxm - add wq, wq - add midq, wq - add dstq, wq - neg wq + add wq, wq + add midq, wq + add dstq, wq + neg wq DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x - mov msq, 2*384 - mov t0, midq - lea t1, [t0 + msq] - lea t2, [t1 + msq] - lea t3, [t2 + msq] - lea t4, [t3 + msq] - test edgeb, 4 ; LR_HAVE_TOP + mov msq, 2*384 + mov t0, midq + lea t1, [t0 + msq] + lea t2, [t1 + msq] + lea t3, [t2 + msq] + lea t4, [t3 + msq] + test edgeb, 4 ; LR_HAVE_TOP jnz .have_top - mov t0, t2 - mov t1, t2 + mov t0, t2 + mov t1, t2 .have_top: - test edgeb, 8 ; LR_HAVE_BOTTOM + test edgeb, 8 ; LR_HAVE_BOTTOM jnz .v_loop - cmp hd, 2 + cmp hd, 2 jg .v_loop - cmp hd, 1 + cmp hd, 1 jne .limit_v - mov t3, t2 + mov t3, t2 .limit_v: - mov t4, t3 + mov t4, t3 .v_loop: - mov xq, wq + mov xq, wq .h_loop: - mova m1, [t0 + xq] - mova m2, [t1 + xq] - mova m3, [t2 + xq] - mova m4, [t3 + xq] - mova m5, [t4 + xq] - punpcklwd m0, m1, m2 - pmaddwd m0, m7 - punpckhwd m1, m2 - pmaddwd m1, m7 - punpcklwd m2, m5, m4 - pmaddwd m2, m7 - punpckhwd m5, m4 - pmaddwd m5, m7 - paddd m0, m2 - paddd m1, m5 - punpcklwd m2, m3, m6 - pmaddwd m2, m8 - punpckhwd m3, m6 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm10 - psrad m1, xm10 - packusdw m0, m1 - pminuw m0, m11 + mova m1, [t0 + xq] + mova m2, [t1 + xq] + mova m3, [t2 + xq] + mova m4, [t3 + xq] + mova m5, [t4 + xq] + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m5, m4 + pmaddwd m2, m7 + punpckhwd m5, m4 + pmaddwd m5, m7 + paddd m0, m2 + paddd m1, m5 + punpcklwd m2, m3, m6 + pmaddwd m2, m8 + punpckhwd m3, m6 + pmaddwd m3, m8 + paddd m0, m2 + paddd m1, m3 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm10 + psrad m1, xm10 + packusdw m0, m1 + pminuw m0, m11 mova [dstq + xq], m0 - add xq, 32 + add xq, 32 jl .h_loop - add dstq, dsq - mov t0, t1 - mov t1, t2 - mov t2, t3 - mov t3, t4 - add t4, msq - test edgeb, 8 ; 
LR_HAVE_BOTTOM + add dstq, dsq + mov t0, t1 + mov t1, t2 + mov t2, t3 + mov t3, t4 + add t4, msq + test edgeb, 8 ; LR_HAVE_BOTTOM jnz .have_bottom - cmp hd, 3 + cmp hd, 3 jg .have_bottom - mov t4, t3 + mov t4, t3 .have_bottom: - dec hd + dec hd jg .v_loop RET INIT_YMM avx2 cglobal wiener_filter7_h_16bpc, 6, 10, 16, dst, left, src, ss, f, w, h, edge, bdmax, rh - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - vpbroadcastd m7, [fq] - vpbroadcastd m8, [fq + 4] + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + vpbroadcastd m7, [fq] + vpbroadcastd m8, [fq + 4] vbroadcasti128 m10, [rev_w] - vbroadcasti128 m11, [wiener5_shufA] - vbroadcasti128 m12, [wiener7_shufB] - vbroadcasti128 m13, [wiener7_shufC] - vbroadcasti128 m14, [wiener7_shufD] + vbroadcasti128 m11, [wiener5_shufB] + vbroadcasti128 m12, [wiener7_shufC] + vbroadcasti128 m13, [wiener7_shufD] + vbroadcasti128 m14, [wiener7_shufE] vbroadcasti128 m15, [rev_d] - popcnt bdmaxd, bdmaxm - vpbroadcastd m9, [pd_65540] - mov rhq, [pq_3] - cmp bdmaxd, 10 + popcnt bdmaxd, bdmaxm + vpbroadcastd m9, [pd_65540] + mov rhq, [pq_3] + cmp bdmaxd, 10 je .bits10 - vpbroadcastd m9, [pd_262160] - mov rhq, [pq_5] + vpbroadcastd m9, [pd_262160] + mov rhq, [pq_5] .bits10: - add wq, wq - add srcq, wq - add dstq, wq - neg wq + add wq, wq + add srcq, wq + add dstq, wq + neg wq DEFINE_ARGS dst, left, src, ss, f, w, h, edge, x, rh .v_loop: - mov xq, wq - test edgeb, 1 ; LR_HAVE_LEFT + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left - test leftq, leftq + test leftq, leftq jz .h_loop - movq xm4, [leftq + 2] - vpblendw xm4, [srcq + xq - 6], 0xf8 - vinserti128 m4, [srcq + xq + 10], 1 - add leftq, 8 + movq xm4, [leftq + 2] + vpblendw xm4, [srcq + xq - 6], 0xf8 + vinserti128 m4, [srcq + xq + 10], 1 + add leftq, 8 jmp .h_main .h_extend_left: - vbroadcasti128 m5, [srcq + xq] - mova m4, [srcq + xq] - palignr m4, m5, 10 - pshufb m4, [wiener7_l_shuf] + vbroadcasti128 m5, [srcq + xq] + mova m4, [srcq + xq] + palignr m4, m5, 10 + pshufb m4, [wiener7_l_shuf] jmp .h_main .h_loop: - movu m4, [srcq + xq - 6] + movu m4, [srcq + xq - 6] .h_main: - movu m5, [srcq + xq + 2] - movu m6, [srcq + xq + 6] - test edgeb, 2 ; LR_HAVE_RIGHT + movu m5, [srcq + xq + 2] + movu m6, [srcq + xq + 6] + test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right - cmp xd, -19*2 + cmp xd, -38 jl .h_have_right - movd xm3, xd - vpbroadcastd m0, [pb_wiener7_l] - vpbroadcastd m1, [pb_wiener7_m] - vpbroadcastd m2, [pb_wiener7_r] - vpbroadcastb m3, xm3 - psubb m0, m3 - psubb m1, m3 - psubb m2, m3 - movu m3, [pb_0to31] - pminub m0, m3 - pminub m1, m3 - pminub m2, m3 - pshufb m4, m0 - pshufb m5, m1 - pshufb m6, m2 - cmp xd, -9*2 + movd xm3, xd + vpbroadcastd m0, [pb_wiener7_l] + vpbroadcastd m1, [pb_wiener7_m] + vpbroadcastd m2, [pb_wiener7_r] + vpbroadcastb m3, xm3 + psubb m0, m3 + psubb m1, m3 + psubb m2, m3 + movu m3, [pb_0to31] + pminub m0, m3 + pminub m1, m3 + pminub m2, m3 + pshufb m4, m0 + pshufb m5, m1 + pshufb m6, m2 + cmp xd, -9*2 jne .hack - vpbroadcastw xm3, [srcq + xq + 16] - vinserti128 m5, xm3, 1 + vpbroadcastw xm3, [srcq + xq + 16] + vinserti128 m5, xm3, 1 jmp .h_have_right .hack: - cmp xd, -1*2 + cmp xd, -1*2 jne .h_have_right - vpbroadcastw xm5, [srcq + xq] + vpbroadcastw xm5, [srcq + xq] .h_have_right: - pshufb m6, m10 - pshufb m0, m4, m11 - pshufb m2, m5, m12 - paddw m0, m2 - pmaddwd m0, m7 - pshufb m2, m4, m13 - pshufb m4, m14 - paddw m2, m4 - pmaddwd m2, m8 - pshufb m1, m6, m11 - pshufb m5, m11 - pmaddwd m1, m7 - pmaddwd m5, m7 - pshufb m3, m6, m13 - pshufb m6, 
m14 - paddw m3, m6 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - pshufb m1, m15 - paddd m1, m5 - movq xm4, rhq - pxor m5, m5 - paddd m0, m9 - paddd m1, m9 - psrad m0, xm4 - psrad m1, xm4 - packssdw m0, m1 - pmaxsw m0, m5 - mova [dstq + xq], m0 - add xq, 32 + pshufb m6, m10 + pshufb m0, m4, m11 + pshufb m2, m5, m12 + paddw m0, m2 + pmaddwd m0, m7 + pshufb m2, m4, m13 + pshufb m4, m14 + paddw m2, m4 + pmaddwd m2, m8 + pshufb m1, m6, m11 + pshufb m5, m11 + pmaddwd m1, m7 + pmaddwd m5, m7 + pshufb m3, m6, m13 + pshufb m6, m14 + paddw m3, m6 + pmaddwd m3, m8 + paddd m0, m2 + paddd m1, m3 + pshufb m1, m15 + paddd m1, m5 + movq xm4, rhq + pxor m5, m5 + paddd m0, m9 + paddd m1, m9 + psrad m0, xm4 + psrad m1, xm4 + packssdw m0, m1 + pmaxsw m0, m5 + mova [dstq + xq], m0 + add xq, 32 jl .h_loop - add srcq, ssq - add dstq, 384*2 - dec hd + add srcq, ssq + add dstq, 384*2 + dec hd jg .v_loop RET INIT_YMM avx2 cglobal wiener_filter7_v_16bpc, 6, 15, 13, dst, ds, mid, f, w, h, edge, bdmax - movifnidn wd, wm - movifnidn hd, hm - movifnidn edgeb, edgem - pxor m6, m6 - vpbroadcastd m7, [fq] - vpbroadcastw m8, [fq + 4] - vpbroadcastd m9, [fq + 6] - popcnt bdmaxd, bdmaxm + movifnidn wd, wm + movifnidn hd, hm + movifnidn edgeb, edgem + pxor m6, m6 + vpbroadcastd m7, [fq] + vpbroadcastw m8, [fq + 4] + vpbroadcastd m9, [fq + 6] + popcnt bdmaxd, bdmaxm vpbroadcastd m10, [nd_1047552] - movq xm11, [pq_11] - cmp bdmaxd, 10 + movq xm11, [pq_11] + cmp bdmaxd, 10 je .bits10 vpbroadcastd m10, [nd_1048320] - movq xm11, [pq_9] + movq xm11, [pq_9] .bits10: vpbroadcastw m12, bdmaxm - add wq, wq - add midq, wq - add dstq, wq - neg wq + add wq, wq + add midq, wq + add dstq, wq + neg wq DEFINE_ARGS dst, ds, mid, ms, w, h, edge, x - mov msq, 2*384 - mov t0, midq - mov t1, t0 - lea t2, [t1 + msq] - lea t3, [t2 + msq] - lea t4, [t3 + msq] - lea t5, [t4 + msq] - lea t6, [t5 + msq] - test edgeb, 4 ; LR_HAVE_TOP + mov msq, 2*384 + mov t0, midq + mov t1, t0 + lea t2, [t1 + msq] + lea t3, [t2 + msq] + lea t4, [t3 + msq] + lea t5, [t4 + msq] + lea t6, [t5 + msq] + test edgeb, 4 ; LR_HAVE_TOP jnz .have_top - mov t0, t3 - mov t1, t3 - mov t2, t3 + mov t0, t3 + mov t1, t3 + mov t2, t3 .have_top: - cmp hd, 3 + cmp hd, 3 jg .v_loop - test edgeb, 8 ; LR_HAVE_BOTTOM + test edgeb, 8 ; LR_HAVE_BOTTOM jz .no_bottom0 - cmp hd, 1 + cmp hd, 1 jg .v_loop jmp .h3 .no_bottom0: - cmp hd, 2 + cmp hd, 2 je .h2 jns .h3 .h1: - mov t4, t3 + mov t4, t3 .h2: - mov t5, t4 + mov t5, t4 .h3: - mov t6, t5 + mov t6, t5 .v_loop: - mov xq, wq + mov xq, wq .h_loop: - mova m1, [t0 + xq] - mova m2, [t1 + xq] - mova m3, [t5 + xq] - mova m4, [t6 + xq] - punpcklwd m0, m1, m2 - pmaddwd m0, m7 - punpckhwd m1, m2 - pmaddwd m1, m7 - punpcklwd m2, m4, m3 - pmaddwd m2, m7 - punpckhwd m4, m3 - pmaddwd m4, m7 - paddd m0, m2 - paddd m1, m4 - mova m3, [t2 + xq] - mova m4, [t4 + xq] - punpcklwd m2, m3, m4 - pmaddwd m2, m8 - punpckhwd m3, m4 - pmaddwd m3, m8 - paddd m0, m2 - paddd m1, m3 - mova m3, [t3 + xq] - punpcklwd m2, m3, m6 - pmaddwd m2, m9 - punpckhwd m3, m6 - pmaddwd m3, m9 - paddd m0, m2 - paddd m1, m3 - paddd m0, m10 - paddd m1, m10 - psrad m0, xm11 - psrad m1, xm11 - packusdw m0, m1 - pminuw m0, m12 + mova m1, [t0 + xq] + mova m2, [t1 + xq] + mova m3, [t5 + xq] + mova m4, [t6 + xq] + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m4, m3 + pmaddwd m2, m7 + punpckhwd m4, m3 + pmaddwd m4, m7 + paddd m0, m2 + paddd m1, m4 + mova m3, [t2 + xq] + mova m4, [t4 + xq] + punpcklwd m2, m3, m4 + pmaddwd m2, m8 + punpckhwd m3, m4 + pmaddwd 
m3, m8 + paddd m0, m2 + paddd m1, m3 + mova m3, [t3 + xq] + punpcklwd m2, m3, m6 + pmaddwd m2, m9 + punpckhwd m3, m6 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + paddd m0, m10 + paddd m1, m10 + psrad m0, xm11 + psrad m1, xm11 + packusdw m0, m1 + pminuw m0, m12 mova [dstq + xq], m0 - add xq, 32 + add xq, 32 jl .h_loop - add dstq, dsq - mov t0, t1 - mov t1, t2 - mov t2, t3 - mov t3, t4 - mov t4, t5 - mov t5, t6 - add t6, msq - cmp hd, 4 + add dstq, dsq + mov t0, t1 + mov t1, t2 + mov t2, t3 + mov t3, t4 + mov t4, t5 + mov t5, t6 + add t6, msq + cmp hd, 4 jg .next_row - test edgeb, 8 ; LR_HAVE_BOTTOM + test edgeb, 8 ; LR_HAVE_BOTTOM jz .no_bottom - cmp hd, 2 + cmp hd, 2 jg .next_row .no_bottom: - mov t6, t5 + mov t6, t5 .next_row: - dec hd + dec hd jg .v_loop RET From 5d159a24d2fbc266e3d07018cfd0ef1a1ff779ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 16 Feb 2021 14:20:11 +0200 Subject: [PATCH 129/155] arm32: Fix the descriptive comment for the sub_sp_align macro --- src/arm/32/util.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/32/util.S b/src/arm/32/util.S index 065749a7b2..c8ac12bf03 100644 --- a/src/arm/32/util.S +++ b/src/arm/32/util.S @@ -70,7 +70,7 @@ .endm // This macro clobbers r7 (and r12 on windows) and stores data at the -// bottom of the stack; sp+16 is the start of the space allocated that +// bottom of the stack; sp is the start of the space allocated that // the caller can use. .macro sub_sp_align space #if CONFIG_THUMB From 706a74895bd331436f2084fea9a15ec33efedb43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 19 Feb 2021 11:00:31 +0200 Subject: [PATCH 130/155] arm32: ipred16: Fix overwrites to the right of the buffer for ipred_dc_left_w64 In these cases, the function wrote a 64 pixel wide output, regardless of the actual width. 
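
A minimal C sketch of the failure mode, purely for illustration (the real
fix is the single branch added in the assembly below; all names here are
invented): the store loop must be bounded by the actual block width, whereas
the broken path effectively always behaved as if the width were 64.

    #include <stdint.h>
    #include <stddef.h>

    /* illustrative dc_left store: write `dc` into a w x h block */
    static void store_dc(uint16_t *dst, ptrdiff_t stride /* in pixels */,
                         int w, int h, uint16_t dc)
    {
        for (int y = 0; y < h; y++) {
            /* the bug was equivalent to looping to 64 here regardless
             * of w, overwriting pixels to the right of the buffer */
            for (int x = 0; x < w; x++)
                dst[x] = dc;
            dst += stride;
        }
    }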
--- src/arm/32/ipred16.S | 1 + 1 file changed, 1 insertion(+) diff --git a/src/arm/32/ipred16.S b/src/arm/32/ipred16.S index eb2efe0188..993d9500aa 100644 --- a/src/arm/32/ipred16.S +++ b/src/arm/32/ipred16.S @@ -575,6 +575,7 @@ L(ipred_dc_left_h64): vpadd.i32 d0, d0, d0 vrshrn.i32 d0, q0, #6 vdup.16 q0, d0[0] + bx r3 L(ipred_dc_left_w64): sub r1, r1, #96 vmov q1, q0 From c4e7213dc9e8232ca3250e7b656b3abbcedfd1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 10 Feb 2021 13:16:29 +0200 Subject: [PATCH 131/155] arm: itx: Fix comment typos --- src/arm/32/itx.S | 4 ++-- src/arm/64/itx.S | 4 ++-- src/arm/64/itx16.S | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S index 1e4938c74f..42126f00d0 100644 --- a/src/arm/32/itx.S +++ b/src/arm/32/itx.S @@ -706,7 +706,7 @@ def_fn_4x4 identity, flipadst vrshrn_8h \r14, \r15, q4, q5, #12 // t7a vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a vrshrn_8h \r6, \r7, q6, q7, #12 // t5a - vrshrn_8h \r10, \r11, q2, q3, #12 // taa + vrshrn_8h \r10, \r11, q2, q3, #12 // t6a vqadd.s16 q2, \q1, \q3 // t4 vqsub.s16 \q1, \q1, \q3 // t5a @@ -1173,7 +1173,7 @@ function inv_dct_4h_x16_neon, export=1 vrshrn.i32 d6, q3, #12 // t11 vrshrn.i32 d7, q4, #12 // t12 - vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t10a + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a vrshrn.i32 d4, q2, #12 // t10a vrshrn.i32 d5, q4, #12 // t13a diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index a83b0fd663..7283e3ab52 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -718,7 +718,7 @@ def_fn_4x4 identity, flipadst rshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a rshrn_sz \r3, v6, v7, #12, \sz // t5a - rshrn_sz \r5, v2, v3, #12, \sz // taa + rshrn_sz \r5, v2, v3, #12, \sz // t6a sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a @@ -1085,7 +1085,7 @@ def_fns_48 8, 4 rshrn_sz v4, v4, v5, #12, \sz // t11 rshrn_sz v5, v6, v7, #12, \sz // t12 - smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a + smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a rshrn_sz v2, v2, v3, #12, \sz // t10a rshrn_sz v3, v6, v7, #12, \sz // t13a diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index 46851567d2..a8af9c85c7 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -602,7 +602,7 @@ def_fn_4x4 identity, flipadst srshr \r1\().4s, v2.4s, #12 // t4a srshr \r7\().4s, v4.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a - srshr \r5\().4s, v7.4s, #12 // taa + srshr \r5\().4s, v7.4s, #12 // t6a sqadd v2.4s, \r1\().4s, \r3\().4s // t4 sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a @@ -1007,7 +1007,7 @@ function inv_dct_4s_x16_neon srshr v4.4s, v4.4s, #12 // t11 srshr v5.4s, v6.4s, #12 // t12 - mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t10a + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a srshr v3.4s, v6.4s, #12 // t13a From a6ed6993854854d79f4157f35103ae30540958f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 18 Feb 2021 12:33:42 +0200 Subject: [PATCH 132/155] arm: itx: Add missing/fix conditions around loading eob threshold value This makes these instances consistent with the rest of similar cases. 
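
As a rough C sketch of the guarded pattern (the strip size, table length and
names are illustrative; the exact .if \i < N bounds in the assembly depend on
how many threshold entries each function's table holds):

    #include <stdint.h>

    /* Skip the remaining strips once eob falls below the next threshold,
     * and never advance the threshold pointer past the end of its table. */
    static void transform_strips(int eob, const uint16_t *thresholds,
                                 int nstrips)
    {
        uint16_t threshold = *thresholds++;
        for (int i = 0; i < nstrips; i++) {
            if (i > 0) {
                if (eob < threshold)
                    break;              /* remaining strips are all zero */
                if (i < nstrips - 1)    /* the guarded threshold load */
                    threshold = *thresholds++;
            }
            /* ... transform strip i ... */
        }
    }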
--- src/arm/32/itx.S | 6 +++++- src/arm/64/itx.S | 4 ++++ src/arm/64/itx16.S | 4 ++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S index 42126f00d0..a1aea4139b 100644 --- a/src/arm/32/itx.S +++ b/src/arm/32/itx.S @@ -3201,7 +3201,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 mov r8, #(32 - \i) cmp r3, r11 blt 1f +.if \i < 28 ldrh r11, [r10], #2 +.endif .endif add r7, r2, #(\i*2) mov r8, #32*2 @@ -3257,7 +3259,7 @@ function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon -.if \i < 8 +.if \i < 12 ldrh r11, [r10], #2 .endif .endr @@ -3306,7 +3308,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 mov r8, #(32 - \i) cmp r3, r11 blt 1f +.if \i < 28 ldrh r11, [r10], #2 +.endif .endif add r7, r2, #(\i*2) mov r8, #32*2 diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S index 7283e3ab52..98147a33f4 100644 --- a/src/arm/64/itx.S +++ b/src/arm/64/itx.S @@ -3126,7 +3126,9 @@ function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 24 ldrh w12, [x13], #2 +.endif .endif add x7, x2, #(\i*2) mov x8, #32*2 @@ -3231,7 +3233,9 @@ function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 24 ldrh w12, [x13], #2 +.endif .endif add x7, x2, #(\i*2) mov x8, #32*2 diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index a8af9c85c7..cd99c9d49f 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -2602,7 +2602,9 @@ function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f +.if \i < 12 ldrh w12, [x13], #2 +.endif .endif mov x8, #4*16 bl inv_txfm_horz_scale_dct_32x4_neon @@ -3414,7 +3416,9 @@ function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f +.if \i < 28 ldrh w12, [x13], #2 +.endif .endif add x7, x2, #(\i*4) mov x8, #32*4 From 1595fc8aa0a1a88123d39698079c845479270620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 9 Feb 2021 10:18:53 +0200 Subject: [PATCH 133/155] arm64: itx16: Improve scheduling in idct4 --- src/arm/64/itx16.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index cd99c9d49f..ce7913ac5e 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -402,12 +402,12 @@ endfunc .macro idct_4 r0, r1, r2, r3 mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] - mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] srshr v6.4s, v6.4s, #12 - srshr v7.4s, v4.4s, #12 srshr v2.4s, v2.4s, #12 + srshr v7.4s, v4.4s, #12 srshr v3.4s, v3.4s, #12 sqadd \r0\().4s, v2.4s, v6.4s sqsub \r3\().4s, v2.4s, v6.4s From 74faad7a3d3601d45cadf1d588af11bdab0106c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 16 Feb 2021 13:26:39 +0200 Subject: [PATCH 134/155] arm64: itx16: Add missing clipping on narrowings While these might not be needed in practice, add them for consistency. 
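
For reference, the difference between the two narrowings written out in C
(shift amount #1 as in the code below; the helper names are only for
illustration):

    #include <stdint.h>

    /* rshrn #1: round, shift right by one, keep only the low 16 bits
     * (an out-of-range result wraps) */
    static int16_t rshrn1(int32_t x)
    {
        return (int16_t)(((int64_t)x + 1) >> 1);
    }

    /* sqrshrn #1: same rounding shift, but the result is clamped to the
     * int16_t range instead of wrapping */
    static int16_t sqrshrn1(int32_t x)
    {
        int64_t r = ((int64_t)x + 1) >> 1;
        if (r > INT16_MAX) return INT16_MAX;
        if (r < INT16_MIN) return INT16_MIN;
        return (int16_t)r;
    }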
--- src/arm/64/itx16.S | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S index ce7913ac5e..5e55dd3754 100644 --- a/src/arm/64/itx16.S +++ b/src/arm/64/itx16.S @@ -1443,10 +1443,10 @@ function inv_txfm_add_4x16_neon st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v28.4h, v16.4s, #1 - rshrn v29.4h, v17.4s, #1 - rshrn v30.4h, v18.4s, #1 - rshrn v31.4h, v19.4s, #1 + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 b 2f @@ -1466,10 +1466,10 @@ function inv_txfm_add_4x16_neon st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v24.4h, v16.4s, #1 - rshrn v25.4h, v17.4s, #1 - rshrn v26.4h, v18.4s, #1 - rshrn v27.4h, v19.4s, #1 + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 b 2f @@ -1488,10 +1488,10 @@ function inv_txfm_add_4x16_neon st1 {v2.4s}, [x6], x11 .endr blr x4 - rshrn v20.4h, v16.4s, #1 - rshrn v21.4h, v17.4s, #1 - rshrn v22.4h, v18.4s, #1 - rshrn v23.4h, v19.4s, #1 + sqrshrn v20.4h, v16.4s, #1 + sqrshrn v21.4h, v17.4s, #1 + sqrshrn v22.4h, v18.4s, #1 + sqrshrn v23.4h, v19.4s, #1 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f @@ -1507,10 +1507,10 @@ function inv_txfm_add_4x16_neon st1 {v2.4s}, [x2], x11 .endr blr x4 - rshrn v16.4h, v16.4s, #1 - rshrn v17.4h, v17.4s, #1 - rshrn v18.4h, v18.4s, #1 - rshrn v19.4h, v19.4s, #1 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 From edc43b92ebde90bb92ff3009080c2ffaaa5804c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 8 Feb 2021 15:09:34 +0200 Subject: [PATCH 135/155] arm32: itx: Add a NEON implementation of itx for 10 bpc Relative speedup vs C for a few functions: Cortex A7 A8 A9 A53 A72 A73 inv_txfm_add_4x4_dct_dct_0_10bpc_neon: 2.79 5.08 2.99 2.83 3.49 4.44 inv_txfm_add_4x4_dct_dct_1_10bpc_neon: 5.74 9.43 5.72 7.19 6.73 6.92 inv_txfm_add_8x8_dct_dct_0_10bpc_neon: 3.13 3.68 2.79 3.25 3.21 3.33 inv_txfm_add_8x8_dct_dct_1_10bpc_neon: 7.09 10.41 7.00 10.55 8.06 9.02 inv_txfm_add_16x16_dct_dct_0_10bpc_neon: 5.01 6.76 4.56 5.58 5.52 2.97 inv_txfm_add_16x16_dct_dct_1_10bpc_neon: 8.62 12.48 13.71 11.75 15.94 16.86 inv_txfm_add_16x16_dct_dct_2_10bpc_neon: 6.05 8.81 6.13 8.18 7.90 12.27 inv_txfm_add_32x32_dct_dct_0_10bpc_neon: 2.90 3.90 2.16 2.63 3.56 2.74 inv_txfm_add_32x32_dct_dct_1_10bpc_neon: 13.57 17.00 13.30 13.76 14.54 17.08 inv_txfm_add_32x32_dct_dct_2_10bpc_neon: 8.29 10.54 8.05 10.68 12.75 14.36 inv_txfm_add_32x32_dct_dct_3_10bpc_neon: 6.78 8.40 7.60 10.12 8.97 12.96 inv_txfm_add_32x32_dct_dct_4_10bpc_neon: 6.48 6.74 6.00 7.38 7.67 9.70 inv_txfm_add_64x64_dct_dct_0_10bpc_neon: 3.02 4.59 2.21 2.65 3.36 2.47 inv_txfm_add_64x64_dct_dct_1_10bpc_neon: 9.86 11.30 9.14 13.80 12.46 14.83 inv_txfm_add_64x64_dct_dct_2_10bpc_neon: 8.65 9.76 7.60 12.05 10.55 12.62 inv_txfm_add_64x64_dct_dct_3_10bpc_neon: 7.78 8.65 6.98 10.63 9.15 11.73 inv_txfm_add_64x64_dct_dct_4_10bpc_neon: 6.61 7.01 5.52 8.41 8.33 9.69 --- src/arm/32/itx16.S | 3428 ++++++++++++++++++++++++++++++++++++++++++++ src/arm/32/util.S | 8 + 2 files changed, 3436 insertions(+) create mode 100644 src/arm/32/itx16.S diff --git a/src/arm/32/itx16.S b/src/arm/32/itx16.S new file mode 100644 index 0000000000..db8ecffe6e --- /dev/null 
+++ b/src/arm/32/itx16.S @@ -0,0 +1,3428 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-v31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +// A macro for cases where a thumb mov can express the constant in one +// instruction, while arm mode requires two separate movw+movt pairs. 
+.macro mov_const reg, val +#if CONFIG_THUMB + mov.w \reg, #\val +#else + movw \reg, #((\val) & 0xffff) + movt \reg, #(((\val) >> 16) & 0xffff) +#endif +.endm + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + .int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro vmul_vmla d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmla.i32 \d0, \s1, \c1 +.endm + +.macro vmul_vmls d0, s0, s1, c0, c1 + vmul.i32 \d0, \s0, \c0 + vmls.i32 \d0, \s1, \c1 +.endm + +.macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s32 \r0, \r0, \c + vqrdmulh.s32 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s32 \r2, \r2, \c + vqrdmulh.s32 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s32 \r4, \r4, \c + vqrdmulh.s32 \r5, \r5, \c + vqrdmulh.s32 \r6, \r6, \c + vqrdmulh.s32 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + vld1.16 {\load}, [\src, :128], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store + vst1.16 {\store}, [\dst, :128], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits + load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits + load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits + load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits + load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits + load_add_store , , , , , q15, q14, \dst, \src, \shiftbits + load_add_store , , , , , , q15, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, 
shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + load_add_store q0, q8, , , , , , \dst, \src, \shiftbits + load_add_store q1, q9, , , , , , \dst, \src, \shiftbits + load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits + load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits + load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits + load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits + load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits + load_add_store , , , , , q11, q10, \dst, \src, \shiftbits + load_add_store , , , , , , q11, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4 +.ifnb \load1 + vld1.16 {\load1}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \load2 + vld1.16 {\load2}, [\src, :64], r1 +.endif +.ifnb \addsrc + vqadd.s16 \adddst, \adddst, \addsrc +.endif +.ifnb \max + vmax.s16 \max, \max, q6 +.endif +.ifnb \store1 + vst1.16 {\store1}, [\dst, :64], r1 +.endif +.ifnb \min + vmin.s16 \min, \min, q7 +.endif +.ifnb \store2 + vst1.16 {\store2}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src + load_add_store4 d2, d3, q9, , , , , , , \dst, \src + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src + load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src + load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src + load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src + load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src + load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src + load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src + load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src + load_add_store4 , , , , , , q15, d28, d29, \dst, \src + load_add_store4 , , , , , , , d30, d31, \dst, \src +.endm +.macro load_add_store_4x8 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits + load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits + load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits +.endm +.macro load_add_store_4x4 dst, src, shiftbits=4 + mov \src, \dst + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff + mov \src, \dst + load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits + load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits + load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits + load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits + load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits + load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d24[], d25[]}, 
[r2, :32] + vdup.32 d0, r12 + vqrdmulh.s32 q13, q12, d0[0] + vst1.32 {d28[0]}, [r2, :32] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s32 q13, q13, d0[0] +.endif +.if \shift > 0 + vqrshrn.s32 d24, q13, #\shift + vqrshrn.s32 d25, q13, #\shift +.else + vqmovn.s32 d24, q13 + vqmovn.s32 d25, q13 +.endif + vqrdmulh.s16 q12, q12, d0[1] + mov r3, #\h + vrshr.s16 q12, q12, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vld1.16 {d2}, [r0, :64], r1 + vld1.16 {d3}, [r0, :64], r1 + subs r3, r3, #4 + vqadd.s16 q0, q0, q12 + sub r0, r0, r1, lsl #2 + vqadd.s16 q1, q1, q12 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmin.s16 q0, q0, q15 + vst1.16 {d0}, [r0, :64], r1 + vmin.s16 q1, q1, q15 + vst1.16 {d1}, [r0, :64], r1 + vst1.16 {d2}, [r0, :64], r1 + vst1.16 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0}, [r0, :128], r1 + subs r3, r3, #4 + vld1.16 {q1}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vld1.16 {q2}, [r0, :128], r1 + vqadd.s16 q1, q1, q12 + vld1.16 {q3}, [r0, :128], r1 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #2 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vst1.16 {q0}, [r0, :128], r1 + vmin.s16 q2, q2, q15 + vst1.16 {q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2}, [r0, :128], r1 + vst1.16 {q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.16 {q2, q3}, [r0, :128], r1 + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, r1, lsl #1 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128], r1 + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon + sub r1, r1, #32 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128] + vqadd.s16 q0, q0, q12 + vqadd.s16 q1, q1, q12 + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + sub r0, r0, #32 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q3, q3, q15 + vst1.16 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #96 + vmvn.i16 q15, #0xfc00 // 0x3ff +1: + vld1.16 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.16 {q2, q3}, [r0, :128]! + vqadd.s16 q0, q0, q12 + vld1.16 {q8, q9}, [r0, :128]! + vqadd.s16 q1, q1, q12 + vld1.16 {q10, q11}, [r0, :128] + vqadd.s16 q2, q2, q12 + vqadd.s16 q3, q3, q12 + vqadd.s16 q8, q8, q12 + vqadd.s16 q9, q9, q12 + vqadd.s16 q10, q10, q12 + vqadd.s16 q11, q11, q12 + sub r0, r0, #96 + vmax.s16 q0, q0, q14 + vmax.s16 q1, q1, q14 + vmax.s16 q2, q2, q14 + vmax.s16 q3, q3, q14 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmax.s16 q10, q10, q14 + vmax.s16 q11, q11, q14 + vmin.s16 q0, q0, q15 + vmin.s16 q1, q1, q15 + vmin.s16 q2, q2, q15 + vmin.s16 q3, q3, q15 + vmin.s16 q8, q8, q15 + vst1.16 {q0, q1}, [r0, :128]! + vmin.s16 q9, q9, q15 + vst1.16 {q2, q3}, [r0, :128]! 
+ vmin.s16 q10, q10, q15 + vst1.16 {q8, q9}, [r0, :128]! + vmin.s16 q11, q11, q15 + vst1.16 {q10, q11}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i32 q8, q8, q9 + vsub.i32 q13, q10, q11 + vsub.i32 q12, q8, q13 + vshr.s32 q12, q12, #1 + vsub.i32 q10, q12, q9 + vsub.i32 q9, q12, q11 + vadd.i32 q11, q13, q10 + vsub.i32 q8, q8, q9 +.endm + +.macro idct_4s_x4 r0, r1, r2, r3 + vmul_vmla q4, \r1, \r3, d1[1], d1[0] + vmul_vmla q2, \r0, \r2, d0[0], d0[0] + vmul_vmls q3, \r1, \r3, d1[0], d1[1] + vmul_vmls q5, \r0, \r2, d0[0], d0[0] + vrshr.s32 q4, q4, #12 + vrshr.s32 q2, q2, #12 + vrshr.s32 q3, q3, #12 + vrshr.s32 q5, q5, #12 + vqadd.s32 \r0, q2, q4 + vqsub.s32 \r3, q2, q4 + vqadd.s32 \r1, q5, q3 + vqsub.s32 \r2, q5, q3 +.endm + +.macro idct_2s_x4 r0, r1, r2, r3 + vmul_vmla d6, \r1, \r3, d1[1], d1[0] + vmul_vmla d4, \r0, \r2, d0[0], d0[0] + vmul_vmls d5, \r1, \r3, d1[0], d1[1] + vmul_vmls d7, \r0, \r2, d0[0], d0[0] + vrshr.s32 d6, d6, #12 + vrshr.s32 d4, d4, #12 + vrshr.s32 d5, d5, #12 + vrshr.s32 d7, d7, #12 + vqadd.s32 \r0, d4, d6 + vqsub.s32 \r3, d4, d6 + vqadd.s32 \r1, d7, d5 + vqsub.s32 \r2, d7, d5 +.endm + +function inv_dct_4s_x4_neon + movrel_local r12, idct_coeffs + vld1.32 {d0, d1}, [r12, :128] + idct_4s_x4 q8, q9, q10, q11 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.32 {d0, d1}, [r12, :128] + + vsub.i32 q1, q8, q10 + vmul.i32 q2, q8, d0[0] + vmla.i32 q2, q10, d0[1] + vmla.i32 q2, q11, d1[0] + vmul.i32 q4, q9, d1[1] + vadd.i32 q1, q1, q11 + vmul.i32 q3, q8, d1[0] + vmls.i32 q3, q10, d0[0] + vmls.i32 q3, q11, d0[1] + + vadd.i32 \o3, q2, q3 + vmul.i32 \o2, q1, d1[1] + vadd.i32 \o0, q2, q4 + vadd.i32 \o1, q3, q4 + vsub.i32 \o3, \o3, q4 + + vrshr.s32 \o0, \o0, #12 + vrshr.s32 \o2, \o2, #12 + vrshr.s32 \o1, \o1, #12 + vrshr.s32 \o3, \o3, #12 +.endm + +function inv_adst_4s_x4_neon + iadst_4x4 q8, q9, q10, q11 + bx lr +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x4_neon + mov r12, #0 + movt r12, #(5793-4096)*8 + vdup.32 d0, r12 + vqrdmulh.s32 q1, q8, d0[0] + vqrdmulh.s32 q2, q9, d0[0] + vqrdmulh.s32 q3, q10, d0[0] + vqrdmulh.s32 q4, q11, d0[0] + vqadd.s32 q8, q8, q1 + vqadd.s32 q9, q9, q2 + vqadd.s32 q10, q10, q3 + vqadd.s32 q11, q11, q4 + bx lr +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q14, q15}, [r2, :128]! + vshr.s16 q8, q8, #2 + vld1.32 {q10, q11}, [r2, :128] + vshr.s16 q9, q9, #2 + vshr.s16 q10, q10, #2 + vshr.s16 q11, q11, #2 + + iwht4 + + vst1.32 {q14, q15}, [r2, :128] + transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + + iwht4 + + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d16, q8 + vld1.16 {d1}, [r0, :64], r1 + vqmovn.s32 d17, q9 + vld1.16 {d2}, [r0, :64], r1 + vqmovn.s32 d18, q10 + vld1.16 {d3}, [r0, :64], r1 + vqmovn.s32 d19, q11 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! 
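+        // q14/q15 hold zero; storing them back clears the coefficient
+        // buffer as it is consumed.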
+ vld1.32 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.16 {d0}, [r0, :64], r1 + vld1.16 {d1}, [r0, :64], r1 + vrshr.s16 q8, q8, #4 + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q9, q9, #4 + vld1.16 {d3}, [r0, :64], r1 + +L(itx_4x4_end): + vmvn.i16 q15, #0xfc00 // 0x3ff + sub r0, r0, r1, lsl #2 + vqadd.s16 q8, q8, q0 + vqadd.s16 q9, q9, q1 + vmax.s16 q8, q8, q14 + vmax.s16 q9, q9, q14 + vmin.s16 q8, q8, q15 + vmin.s16 q9, q9, q15 + vst1.16 {d16}, [r0, :64], r1 + vst1.16 {d17}, [r0, :64], r1 + vst1.16 {d18}, [r0, :64], r1 + vst1.16 {d19}, [r0, :64], r1 + + vpop {q4-q5} + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q4-q5} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 q14, #0 + mov_const r12, 2896*8*(1<<16) + vld1.32 {d16[], d17[]}, [r2, :32] + vdup.32 d4, r12 + vst1.32 {d28[0]}, [r2, :32] + vqrdmulh.s32 q8, q8, d4[0] + vld1.16 {d0}, [r0, :64], r1 + vqmovn.s32 d20, q8 + vqmovn.s32 d21, q8 + vld1.16 {d1}, [r0, :64], r1 + vqrdmulh.s16 q10, q10, d4[1] + vld1.16 {d2}, [r0, :64], r1 + vrshr.s16 q8, q10, #4 + vld1.16 {d3}, [r0, :64], r1 + vrshr.s16 q9, q10, #4 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4s_x4_neon + movrel r5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4s_x4 \r0, \r2, \r4, \r6 + + vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a + vrshr.s32 \r1, q2, #12 // t4a + vrshr.s32 \r7, q4, #12 // t7a + vrshr.s32 \r3, q6, #12 // t5a + vrshr.s32 \r5, q7, #12 // t6a + + vqadd.s32 q2, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 q3, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + + vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 q4, q4, #12 // t5 + vrshr.s32 q5, q6, #12 // t6 + + vqsub.s32 \r7, \r0, q3 // out7 + vqadd.s32 \r0, \r0, q3 // out0 + vqadd.s32 \r1, \r2, q5 // out1 + vqsub.s32 q6, \r2, q5 // out6 + vqadd.s32 \r2, \r4, q4 // out2 + vqsub.s32 \r5, \r4, q4 // out5 + vqadd.s32 \r3, \r6, q2 // out3 + vqsub.s32 \r4, \r6, q2 // out4 + vmov \r6, q6 // out6 +.endm + +.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_2s_x4 \r0, \r2, \r4, \r6 + + vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a + vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a + vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a + vmul_vmla d7, \r5, \r3, d3[1], d3[0] // -> t6a + vrshr.s32 \r1, d4, #12 // t4a + vrshr.s32 \r7, d5, #12 // t7a + vrshr.s32 \r3, d6, #12 // t5a + vrshr.s32 \r5, d7, #12 // t6a + + vqadd.s32 d4, \r1, \r3 // t4 + vqsub.s32 \r1, \r1, \r3 // t5a + vqadd.s32 d5, \r7, \r5 // t7 + vqsub.s32 \r3, \r7, \r5 // t6a + + vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 + vmul_vmla d7, 
\r3, \r1, d0[0], d0[0] // -> t6 + vrshr.s32 d6, d6, #12 // t5 + vrshr.s32 d7, d7, #12 // t6 + + vqsub.s32 \r7, \r0, d5 // out7 + vqadd.s32 \r0, \r0, d5 // out0 + vqadd.s32 \r1, \r2, d7 // out1 + vqsub.s32 d7, \r2, d7 // out6 + vqadd.s32 \r2, \r4, d6 // out2 + vqsub.s32 \r5, \r4, d6 // out5 + vqadd.s32 \r3, \r6, d4 // out3 + vqsub.s32 \r4, \r6, d4 // out4 + vmov \r6, d7 // out6 +.endm + +function inv_dct_4s_x8_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128] + idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +.macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.32 {q0, q1}, [r12, :128]! + + vmul_vmla q2, q15, q8, d0[0], d0[1] + vmul_vmls q3, q15, q8, d0[1], d0[0] + vmul_vmla q4, q13, q10, d1[0], d1[1] + vrshr.s32 q8, q2, #12 // t0a + vrshr.s32 q15, q3, #12 // t1a + vmul_vmls q5, q13, q10, d1[1], d1[0] + vmul_vmla q6, q11, q12, d2[0], d2[1] + vrshr.s32 q10, q4, #12 // t2a + vrshr.s32 q13, q5, #12 // t3a + vmul_vmls q7, q11, q12, d2[1], d2[0] + vmul_vmla q2, q9, q14, d3[0], d3[1] + vrshr.s32 q12, q6, #12 // t4a + vrshr.s32 q11, q7, #12 // t5a + vmul_vmls q3, q9, q14, d3[1], d3[0] + vrshr.s32 q14, q2, #12 // t6a + vrshr.s32 q9, q3, #12 // t7a + + vld1.32 {q0}, [r12] + + vqadd.s32 q2, q8, q12 // t0 + vqsub.s32 q3, q8, q12 // t4 + vqadd.s32 q4, q15, q11 // t1 + vqsub.s32 q5, q15, q11 // t5 + vqadd.s32 q6, q10, q14 // t2 + vqsub.s32 q7, q10, q14 // t6 + vqadd.s32 q10, q13, q9 // t3 + vqsub.s32 q11, q13, q9 // t7 + + vmul_vmla q8, q3, q5, d1[1], d1[0] + vmul_vmls q12, q3, q5, d1[0], d1[1] + vmul_vmls q14, q11, q7, d1[1], d1[0] + + vrshr.s32 q3, q8, #12 // t4a + vrshr.s32 q5, q12, #12 // t5a + + vmul_vmla q8, q11, q7, d1[0], d1[1] + + vrshr.s32 q7, q14, #12 // t6a + vrshr.s32 q11, q8, #12 // t7a + + vqadd.s32 \r0, q2, q6 // out0 + vqsub.s32 q2, q2, q6 // t2 + vqadd.s32 \r7, q4, q10 // out7 + vqsub.s32 q4, q4, q10 // t3 + vqneg.s32 \r7, \r7 // out7 + + vqadd.s32 \r1, q3, q7 // out1 + vqsub.s32 q3, q3, q7 // t6 + vqadd.s32 \r6, q5, q11 // out6 + vqsub.s32 q5, q5, q11 // t7 + vqneg.s32 \r1, \r1 // out1 + + vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) + vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11) + vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10) + vrshr.s32 q2, q10, #12 // out3 + vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13) + vrshr.s32 q3, q12, #12 // out5 + vrshr.s32 \r2, q10, #12 // out2 (q10 or q13) + vrshr.s32 \r4, q6, #12 // out4 (q12 or q11) + + vqneg.s32 \r3, q2 // out3 + vqneg.s32 \r5, q3 // out5 +.endm + +function inv_adst_4s_x8_neon + iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 + bx lr +endfunc + +function inv_flipadst_4s_x8_neon + iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8 + bx lr +endfunc + +function inv_identity_4s_x8_neon + vqshl.s32 q8, q8, #1 + vqshl.s32 q9, q9, #1 + vqshl.s32 q10, q10, #1 + vqshl.s32 q11, q11, #1 + vqshl.s32 q12, q12, #1 + vqshl.s32 q13, q13, #1 + vqshl.s32 q14, q14, #1 + vqshl.s32 q15, q15, #1 + bx lr +endfunc + +function inv_txfm_add_8x8_neon + vmov.i32 q0, #0 + mov r7, #8*4 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + cmp r3, r10 + transpose_4x8h q8, q9, q10, q11 + + blt 1f + + sub r2, r2, r7, lsl #3 + vpush {q8-q11} + + add r2, r2, 
#16 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + blx r4 + + vqrshrn.s32 d31, q15, #1 + vqrshrn.s32 d30, q11, #1 + vqrshrn.s32 d29, q14, #1 + vqrshrn.s32 d28, q10, #1 + vqrshrn.s32 d27, q13, #1 + vqrshrn.s32 d26, q9, #1 + vqrshrn.s32 d25, q12, #1 + vqrshrn.s32 d24, q8, #1 + vpop {q8-q11} + + transpose_4x8h q12, q13, q14, q15 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + vmov.i16 q14, #0 + vmov.i16 q15, #0 + +2: + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + mov r10, #\eob_half + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vdup.32 d4, r12 + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! 
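+        // Rectangular (2:1) transforms pre-scale the input by 1/sqrt(2):
+        // with d4[0] = 2896*8*(1<<16), vqrdmulh.s32 ((2*a*b+(1<<31))>>32)
+        // reduces to (a*2896+2048)>>12, and 2896/4096 ~= 1/sqrt(2).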
+ + scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15 + + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + vqmovn.s32 d20, q12 + vqmovn.s32 d21, q13 + vqmovn.s32 d22, q14 + vqmovn.s32 d23, q15 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +function inv_txfm_add_4x8_neon + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + cmp r3, r10 + mov r7, #32 + blt 1f + + add r2, r2, #16 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + + scale_input d2[0], q8, q9, q10, q11 + sub r2, r2, r7, lsl #2 + + blx r4 + + sub r2, r2, #16 + + vqmovn.s32 d24, q8 + vqmovn.s32 d25, q9 + vqmovn.s32 d26, q10 + vqmovn.s32 d27, q11 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f + +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 + +2: + mov_const r12, 2896*8*(1<<16) + vmov.i32 q0, #0 + vdup.32 d2, r12 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r7 +.endr + scale_input d2[0], q8, q9, q10, q11 + blx r4 + + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q9 + vqmovn.s32 d18, q10 + vqmovn.s32 d19, q11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + vmov q10, q12 + vmov q11, q13 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,r10,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,r10,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon +.if \w == 4 + mov r10, #\eob_half +.endif + movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_2s_x16_neon + movrel_local r12, idct_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
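+        // The even input rows (in0, in2, ..., in14) go through the
+        // 8-point idct; the odd rows form the second stage below.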
+ + idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #32 + + vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a + vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a + vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a + vrshr.s32 d17, d4, #12 // t8a + vrshr.s32 d31, d5, #12 // t15a + vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a + vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a + vrshr.s32 d23, d6, #12 // t9a + vrshr.s32 d25, d4, #12 // t14a + vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a + vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a + vrshr.s32 d21, d5, #12 // t10a + vrshr.s32 d27, d6, #12 // t13a + vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a + vrshr.s32 d19, d4, #12 // t11a + vrshr.s32 d29, d5, #12 // t12a + + vld1.32 {q0}, [r12, :128] + + vqsub.s32 d4, d17, d23 // t9 + vqadd.s32 d17, d17, d23 // t8 + vqsub.s32 d5, d31, d25 // t14 + vqadd.s32 d31, d31, d25 // t15 + vqsub.s32 d23, d19, d21 // t10 + vqadd.s32 d19, d19, d21 // t11 + vqadd.s32 d25, d29, d27 // t12 + vqsub.s32 d29, d29, d27 // t13 + + vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a + vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a + vrshr.s32 d21, d6, #12 // t9a + vrshr.s32 d27, d7, #12 // t14a + + vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a + vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a + vrshr.s32 d29, d6, #12 // t13a + vneg.s32 d7, d7 + vrshr.s32 d23, d7, #12 // t10a + + vqsub.s32 d4, d17, d19 // t11a + vqadd.s32 d17, d17, d19 // t8a + vqsub.s32 d5, d31, d25 // t12a + vqadd.s32 d31, d31, d25 // t15a + vqadd.s32 d19, d21, d23 // t9 + vqsub.s32 d21, d21, d23 // t10 + vqsub.s32 d25, d27, d29 // t13 + vqadd.s32 d27, d27, d29 // t14 + + vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 + vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 + vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a + + vrshr.s32 d6, d6, #12 // t11 + vrshr.s32 d7, d7, #12 // t12 + vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a + vrshr.s32 d4, d4, #12 // t10a + vrshr.s32 d5, d5, #12 // t13a + + vqadd.s32 d8, d16, d31 // out0 + vqsub.s32 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s32 d23, d30, d17 // out7 + vqsub.s32 d9, d30, d17 // out8 + vqadd.s32 d17, d18, d27 // out1 + vqsub.s32 d30, d18, d27 // out14 + vqadd.s32 d18, d20, d5 // out2 + vqsub.s32 d29, d20, d5 // out13 + vqadd.s32 d5, d28, d19 // out6 + vqsub.s32 d25, d28, d19 // out9 + vqadd.s32 d19, d22, d7 // out3 + vqsub.s32 d28, d22, d7 // out12 + vqadd.s32 d20, d24, d6 // out4 + vqsub.s32 d27, d24, d6 // out11 + vqadd.s32 d21, d26, d4 // out5 + vqsub.s32 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.32 {q0, q1}, [r12, :128]! 
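+        // q0/q1 hold the first eight iadst16 coefficients; the remaining
+        // eight and the shared idct constants are loaded further down.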
+ + vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0 + vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1 + vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2 + vrshr.s32 d16, d4, #12 // t0 + vrshr.s32 d31, d6, #12 // t1 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3 + vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4 + vrshr.s32 d18, d8, #12 // t2 + vrshr.s32 d29, d4, #12 // t3 + vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5 + vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6 + vrshr.s32 d20, d6, #12 // t4 + vrshr.s32 d27, d8, #12 // t5 + vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7 + vld1.32 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8 + vrshr.s32 d22, d4, #12 // t6 + vrshr.s32 d25, d6, #12 // t7 + vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9 + vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10 + vrshr.s32 d23, d8, #12 // t8 + vrshr.s32 d24, d4, #12 // t9 + vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11 + vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12 + vrshr.s32 d21, d6, #12 // t10 + vrshr.s32 d26, d8, #12 // t11 + vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13 + vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14 + vrshr.s32 d19, d4, #12 // t12 + vrshr.s32 d28, d6, #12 // t13 + vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15 + vrshr.s32 d17, d8, #12 // t14 + vrshr.s32 d30, d4, #12 // t15 + + vld1.32 {q0, q1}, [r12, :128] + + vqsub.s32 d5, d16, d23 // t8a + vqadd.s32 d16, d16, d23 // t0a + vqsub.s32 d7, d31, d24 // t9a + vqadd.s32 d31, d31, d24 // t1a + vqadd.s32 d23, d18, d21 // t2a + vqsub.s32 d18, d18, d21 // t10a + vqadd.s32 d24, d29, d26 // t3a + vqsub.s32 d29, d29, d26 // t11a + vqadd.s32 d21, d20, d19 // t4a + vqsub.s32 d20, d20, d19 // t12a + vqadd.s32 d26, d27, d28 // t5a + vqsub.s32 d27, d27, d28 // t13a + vqadd.s32 d19, d22, d17 // t6a + vqsub.s32 d22, d22, d17 // t14a + vqadd.s32 d28, d25, d30 // t7a + vqsub.s32 d25, d25, d30 // t15a + + vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 + vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 + vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 + vrshr.s32 d17, d4, #12 // t8 + vrshr.s32 d30, d6, #12 // t9 + vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11 + vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12 + vrshr.s32 d18, d8, #12 // t10 + vrshr.s32 d29, d4, #12 // t11 + vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13 + vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14 + vrshr.s32 d27, d6, #12 // t12 + vrshr.s32 d20, d8, #12 // t13 + vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15 + vrshr.s32 d25, d4, #12 // t14 + vrshr.s32 d22, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t4 + vqadd.s32 d16, d16, d21 // t0 + vqsub.s32 d3, d31, d26 // t5 + vqadd.s32 d31, d31, d26 // t1 + vqadd.s32 d21, d23, d19 // t2 + vqsub.s32 d23, d23, d19 // t6 + vqadd.s32 d26, d24, d28 // t3 + vqsub.s32 d24, d24, d28 // t7 + vqadd.s32 d19, d17, d27 // t8a + vqsub.s32 d17, d17, d27 // t12a + vqadd.s32 d28, d30, d20 // t9a + vqsub.s32 d30, d30, d20 // t13a + vqadd.s32 d27, d18, d25 // t10a + vqsub.s32 d18, d18, d25 // t14a + vqadd.s32 d20, d29, d22 // t11a + vqsub.s32 d29, d29, d22 // t15a + + vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a + vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a + vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a + vrshr.s32 d22, d4, #12 // t4a + vrshr.s32 d25, d6, #12 // t5a + vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a + vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12 + vrshr.s32 d24, d8, #12 // t6a + vrshr.s32 d23, d4, #12 // t7a + vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13 + vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14 + 
vrshr.s32 d17, d6, #12 // t12 + vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15 + vrshr.s32 d29, d8, #12 // t13 + vrshr.s32 d30, d4, #12 // t14 + vrshr.s32 d18, d6, #12 // t15 + + vqsub.s32 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s32 \o0, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 +.else + vqadd.s32 d4, d16, d21 // out0 + vqsub.s32 d21, d31, d26 // t3a + vqadd.s32 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s32 \o15, \o15 // out15 + + vqsub.s32 d3, d29, d18 // t15a + vqadd.s32 \o13,d29, d18 // out13 + vqadd.s32 \o2, d17, d30 // out2 + vqsub.s32 d26, d17, d30 // t14a + vqneg.s32 \o13,\o13 // out13 + + vqadd.s32 \o1, d19, d27 // out1 + vqsub.s32 d27, d19, d27 // t10 + vqadd.s32 \o14,d28, d20 // out14 + vqsub.s32 d20, d28, d20 // t11 + vqneg.s32 \o1, \o1 // out1 + + vqadd.s32 \o3, d22, d24 // out3 + vqsub.s32 d22, d22, d24 // t6 + vqadd.s32 \o12,d25, d23 // out12 + vqsub.s32 d23, d25, d23 // t7 + vqneg.s32 \o3, \o3 // out3 + + vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshr.s32 d24, d24, #12 // out8 + vrshr.s32 d4, d4, #12 // out7 + vrshr.s32 d5, d6, #12 // out5 + vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshr.s32 d26, d8, #12 // out10 + + vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshr.s32 \o4, d2, #12 // out4 + vrshr.s32 d7, d6, #12 // out9 + vrshr.s32 d6, d8, #12 // out11 + vrshr.s32 \o6, d22, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s32 \o7, d4 // out7 + vqneg.s32 \o5, d5 // out5 + vqneg.s32 \o11,d6 // out11 + vqneg.s32 \o9, d7 // out9 +.endm + +function inv_adst_2s_x16_neon + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_2s_x16_neon + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_2s_x16_neon + mov r12, #0 + movt r12, #2*(5793-4096)*8 + vdup.32 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q1, \i, d0[0] + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_8x4_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vrshr.s32 q2, q2, #1 + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro identity_8x4 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s32 q2, \i, \c + vqadd.s32 \i, \i, \i + vqadd.s32 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x2_neon + push {lr} + vmov.i32 d7, #0 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + blx r4 + vqrshrn.s32 d16, q8, #\shift + vqrshrn.s32 d17, q9, #\shift + vqrshrn.s32 d18, q10, #\shift + vqrshrn.s32 d19, q11, #\shift + vqrshrn.s32 d20, q12, #\shift + vqrshrn.s32 d21, q13, #\shift + vqrshrn.s32 d22, q14, #\shift + vqrshrn.s32 d23, q15, #\shift + vuzp.16 q8, q9 + vuzp.16 q10, q11 + +.irp i, q8, q10, q9, q11 + vst1.16 {\i}, 
[r6, :128]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 3, 10, 21, 36, 55, 78, 105, 256 +endconst + +const eob_16x16_identity + .short 2, 4, 6, 8, 10, 12, 14, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + cmp r3, r10 + mov r11, #16 + blt 1f + + add r6, r2, #8 + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r6, :64] + vst1.32 {d4}, [r6, :64], r11 +.endr + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + vpush {q8-q11} + + b 2f + +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 + mov r6, sp + vpush {q8-q9} + vpush {q8-q9} + +2: + vmov.i32 d4, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r2, :64] + vst1.32 {d4}, [r2, :64], r11 +.endr + + blx r4 + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + vuzp.16 q8, q9 + mov r6, sp + vuzp.16 q10, q11 + + vmov q12, q10 + vmov q13, q11 + + vpop {q10-q11} + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + + vpop {q10-q11} + vmov q8, q12 + vmov q9, q13 + blx r5 + add r6, r0, #16 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_4x16_neon + ldrh r9, [r10, #4] + + mov r11, #64 + cmp r3, r9 + ldrh r9, [r10, #2] + blt 1f + + add r6, r2, #48 + vmov.i32 
q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d28, q8, #1 + vqrshrn.s32 d29, q9, #1 + vqrshrn.s32 d30, q10, #1 + vqrshrn.s32 d31, q11, #1 + transpose_4x4h q14, q15, d28, d29, d30, d31 + + b 2f +1: + vmov.i16 q14, #0 + vmov.i16 q15, #0 +2: + cmp r3, r9 + ldrh r9, [r10] + blt 1f + + add r6, r2, #32 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d24, q8, #1 + vqrshrn.s32 d25, q9, #1 + vqrshrn.s32 d26, q10, #1 + vqrshrn.s32 d27, q11, #1 + transpose_4x4h q12, q13, d24, d25, d26, d27 + + b 2f +1: + vmov.i16 q12, #0 + vmov.i16 q13, #0 +2: + cmp r3, r9 + blt 1f + + add r6, r2, #16 + vmov.i32 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.32 {\i}, [r6, :128] + vst1.32 {q2}, [r6, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + + b 2f +1: + vmov.i16 q8, #0 + vmov.i16 q9, #0 +2: + vmov.i16 q2, #0 + vpush {q8-q9} +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + vpop {q10-q11} + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2, eob_16x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_4x16 +.else + movrel_local r10, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_4x16_identity2 +.else + movrel_local r10, eob_4x16 +.endif +.endif +.else + mov r10, #\eob_16x4 + movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon + movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 3 +def_fn_416 \w, \h, identity, identity, 3 +def_fn_416 \w, \h, dct, adst, 3 +def_fn_416 \w, \h, dct, flipadst, 3 +def_fn_416 \w, \h, dct, identity, 2 +def_fn_416 \w, \h, adst, dct, 3 +def_fn_416 \w, \h, adst, adst, 3 +def_fn_416 \w, \h, adst, flipadst, 3 +def_fn_416 \w, \h, flipadst, dct, 3 +def_fn_416 \w, \h, flipadst, adst, 3 +def_fn_416 \w, \h, flipadst, flipadst, 3 +def_fn_416 \w, \h, identity, dct, 2 +def_fn_416 \w, \h, adst, identity, 2 +def_fn_416 \w, \h, flipadst, identity, 2 +def_fn_416 \w, \h, identity, adst, 2 +def_fn_416 \w, \h, identity, flipadst, 2 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +function inv_txfm_add_16x8_neon + sub_sp_align 256 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(8 - \i) + cmp r3, r11 + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #8*4 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b +3: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i*2) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_8x16_neon + add r10, r10, #2 + sub_sp_align 256 + ldrh r11, [r10], #4 + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*8*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #4 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + + mov_const r12, 2896*8*(1<<16) + vmov.i32 q2, #0 + vdup.32 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\j}, [r7, :128] + vst1.32 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + blx r4 + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q9, #1 + vqrshrn.s32 d18, q10, #1 + vqrshrn.s32 d19, q11, #1 + vqrshrn.s32 d20, q12, #1 + vqrshrn.s32 d21, q13, #1 + vqrshrn.s32 d22, q14, #1 + vqrshrn.s32 d23, q15, #1 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 +.irp j, d16, d20, d17, d21, d18, d22, d19, d23 + vst1.16 {\j}, [r6, :64]! +.endr +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: + +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +const eob_8x16 + .short 3, 10, 21, 43, 59, 75, 91, 128 +endconst + +const eob_8x16_identity1 + .short 2, 4, 6, 64, 80, 96, 112, 128 +endconst + +const eob_8x16_identity2 + .short 2, 4, 6, 8, 10, 12, 14, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_4s_x8_neon + movrel r5, X(inv_\txfm2\()_4h_x16_neon) +.else + movrel_local r4, inv_\txfm1\()_2s_x16_neon + movrel r5, X(inv_\txfm2\()_8h_x8_neon) +.endif +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_8x16 +.else + movrel_local r10, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_8x16_identity2 +.else + movrel_local r10, eob_8x16 +.endif +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_2s_x16_neon + movrel_local r12, idct_coeffs, 4*16 + vld1.32 {q0, q1}, [r12, :128]! 
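+        // Odd half of the 32-point dct (inputs in1, in3, ..., in31);
+        // its constants start 16 ints (4*16 bytes) into idct_coeffs.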
+ + vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a + vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a + vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a + vrshr.s32 d16, d4, #12 // t16a + vrshr.s32 d31, d6, #12 // t31a + vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a + vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a + vrshr.s32 d24, d8, #12 // t17a + vrshr.s32 d23, d4, #12 // t30a + vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a + vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a + vrshr.s32 d20, d6, #12 // t18a + vrshr.s32 d27, d8, #12 // t29a + vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a + vld1.32 {q0, q1}, [r12, :128] + sub r12, r12, #4*24 + vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a + vrshr.s32 d28, d4, #12 // t19a + vrshr.s32 d19, d6, #12 // t28a + vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a + vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a + vrshr.s32 d18, d8, #12 // t20a + vrshr.s32 d29, d4, #12 // t27a + vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a + vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a + vrshr.s32 d26, d6, #12 // t21a + vrshr.s32 d21, d8, #12 // t26a + vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a + vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a + vrshr.s32 d22, d4, #12 // t22a + vrshr.s32 d25, d6, #12 // t25a + vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a + vrshr.s32 d30, d8, #12 // t23a + vrshr.s32 d17, d4, #12 // t24a + + vld1.32 {q0, q1}, [r12, :128] + + vqsub.s32 d5, d16, d24 // t17 + vqadd.s32 d16, d16, d24 // t16 + vqsub.s32 d7, d31, d23 // t30 + vqadd.s32 d31, d31, d23 // t31 + vqsub.s32 d24, d28, d20 // t18 + vqadd.s32 d28, d28, d20 // t19 + vqadd.s32 d23, d18, d26 // t20 + vqsub.s32 d18, d18, d26 // t21 + vqsub.s32 d20, d30, d22 // t22 + vqadd.s32 d30, d30, d22 // t23 + vqadd.s32 d26, d17, d25 // t24 + vqsub.s32 d17, d17, d25 // t25 + vqsub.s32 d22, d29, d21 // t26 + vqadd.s32 d29, d29, d21 // t27 + vqadd.s32 d25, d19, d27 // t28 + vqsub.s32 d19, d19, d27 // t29 + + vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a + vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a + vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a + vrshr.s32 d21, d4, #12 // t17a + vrshr.s32 d27, d6, #12 // t30a + vneg.s32 d8, d8 // -> t18a + vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a + vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a + vrshr.s32 d19, d8, #12 // t18a + vrshr.s32 d24, d5, #12 // t29a + vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a + vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a + vrshr.s32 d22, d4, #12 // t21a + vrshr.s32 d18, d6, #12 // t26a + vneg.s32 d8, d8 // -> t22a + vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a + vrshr.s32 d17, d8, #12 // t22a + vrshr.s32 d20, d5, #12 // t25a + + vqsub.s32 d2, d27, d24 // t29 + vqadd.s32 d27, d27, d24 // t30 + vqsub.s32 d3, d21, d19 // t18 + vqadd.s32 d21, d21, d19 // t17 + vqsub.s32 d24, d16, d28 // t19a + vqadd.s32 d16, d16, d28 // t16a + vqsub.s32 d19, d30, d23 // t20a + vqadd.s32 d30, d30, d23 // t23a + vqsub.s32 d28, d17, d22 // t21 + vqadd.s32 d17, d17, d22 // t22 + vqadd.s32 d23, d26, d29 // t24a + vqsub.s32 d26, d26, d29 // t27a + vqadd.s32 d22, d20, d18 // t25 + vqsub.s32 d20, d20, d18 // t26 + vqsub.s32 d29, d31, d25 // t28a + vqadd.s32 d31, d31, d25 // t31a + + vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a + vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a + vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 + vrshr.s32 d18, d4, #12 // t18a + vrshr.s32 d25, d6, #12 // t29a + vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28 + vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20 + vrshr.s32 d29, d8, #12 // t19 + 
vrshr.s32 d24, d5, #12 // t28 + vneg.s32 d4, d4 // -> t20 + vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27 + vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a + vrshr.s32 d26, d4, #12 // t20 + vrshr.s32 d19, d6, #12 // t27 + vneg.s32 d8, d8 // -> t21a + vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a + vrshr.s32 d20, d8, #12 // t21a + vrshr.s32 d28, d5, #12 // t26a + + vqsub.s32 d2, d16, d30 // t23 + vqadd.s32 d16, d16, d30 // t16 = out16 + vqsub.s32 d3, d31, d23 // t24 + vqadd.s32 d31, d31, d23 // t31 = out31 + vqsub.s32 d23, d21, d17 // t22a + vqadd.s32 d17, d21, d17 // t17a = out17 + vqadd.s32 d30, d27, d22 // t30a = out30 + vqsub.s32 d21, d27, d22 // t25a + vqsub.s32 d27, d18, d20 // t21 + vqadd.s32 d18, d18, d20 // t18 = out18 + vqadd.s32 d4, d29, d26 // t19a = out19 + vqsub.s32 d26, d29, d26 // t20a + vqadd.s32 d29, d25, d28 // t29 = out29 + vqsub.s32 d25, d25, d28 // t26 + vqadd.s32 d28, d24, d19 // t28a = out28 + vqsub.s32 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 + vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 + vrshr.s32 d20, d4, #12 // t20 + vrshr.s32 d22, d6, #12 // t27 + + vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a + vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshr.s32 d26, d4, #12 // t26a + + vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22 + vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25 + vrshr.s32 d21, d6, #12 // t21a + vrshr.s32 d22, d24, #12 // t22 + vrshr.s32 d25, d4, #12 // t25 + + vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a + vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a + vrshr.s32 d23, d4, #12 // t23a + vrshr.s32 d24, d6, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x2_neon + push {lr} + vmov.i32 d7, #0 + lsl r8, r8, #1 +.if \scale + mov_const r12, 2896*8*(1<<16) + vdup.32 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_2s_x16_neon + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d24, d25 + vtrn.32 d26, d27 + vtrn.32 d28, d29 + vtrn.32 d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! + vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! +.endm + store1 d16, d18, d20, d22 + store1 d24, d26, d28, d30 + store1 d17, d19, d21, d23 + store1 d25, d27, d29, d31 +.purgem store1 + sub r6, r6, #64*2 + + vmov.i32 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.32 {\i}, [r7, :64] + vst1.32 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_2s_x16_neon + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + vtrn.32 d23, d22 + vtrn.32 d21, d20 + vtrn.32 d19, d18 + vtrn.32 d17, d16 +.macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift + vld1.32 {q0, q1}, [r6, :128]! 
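+        // q0-q3 reload the even-half outputs written by store1 above.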
+ vld1.32 {q2, q3}, [r6, :128] + sub r6, r6, #32 + vqsub.s32 d15, d0, \r0 + vqadd.s32 d0, d0, \r0 + vqsub.s32 d14, d1, \r1 + vqadd.s32 d1, d1, \r1 + vqsub.s32 d13, d2, \r2 + vqadd.s32 d2, d2, \r2 + vqsub.s32 d12, d3, \r3 + vqadd.s32 d3, d3, \r3 + vqsub.s32 d11, d4, \r4 + vqadd.s32 d4, d4, \r4 + vqsub.s32 d10, d5, \r5 + vqadd.s32 d5, d5, \r5 + vqsub.s32 d9, d6, \r6 + vqadd.s32 d6, d6, \r6 + vqsub.s32 d8, d7, \r7 + vqadd.s32 d7, d7, \r7 + vqrshrn.s32 d0, q0, #\shift + vqrshrn.s32 d1, q1, #\shift + vqrshrn.s32 d2, q2, #\shift + vqrshrn.s32 d3, q3, #\shift + vqrshrn.s32 d4, q4, #\shift + vqrshrn.s32 d5, q5, #\shift + vqrshrn.s32 d6, q6, #\shift + vqrshrn.s32 d7, q7, #\shift + vrev32.16 q2, q2 + vrev32.16 q3, q3 + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! +.endm + + store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift + store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl X(inv_dct_4h_x16_neon) + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl X(inv_dct32_odd_4h_x16_neon) + + neg r9, r8 + mov r10, r6 + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.16 {d0}, [r10, :64], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.16 {d1}, [r10, :64], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.16 {d2}, [r10, :64], r1 + \op\().s16 d5, d5, \r1 + vld1.16 {d3}, [r10, :64], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vqadd.s16 q0, q0, q2 + \op\().s16 d7, d7, \r3 + vmax.s16 q0, q0, q6 + vrshr.s16 q3, q3, #4 + vmin.s16 q0, q0, q7 + vqadd.s16 q1, q1, q3 + vst1.16 {d0}, [r6, :64], r1 + vmax.s16 q1, q1, q6 + vst1.16 {d1}, [r6, :64], r1 + vmin.s16 q1, q1, q7 + vst1.16 {d2}, [r6, :64], r1 + vst1.16 {d3}, [r6, :64], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop {r10-r11,pc} +endfunc + +const eob_32x32 + .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024 +endconst + +const eob_16x32 + .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512 +endconst + +const eob_16x32_shortside + .short 3, 10, 21, 36, 55, 78, 105, 512 +endconst + +const eob_8x32 + .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + push {r4-r7,lr} + vpush {q6-q7} + movrel_local r5, eob_32x32, 2 + + mov r6, #4*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 6 +2: + vmov.i32 q0, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + 
vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r6 +.endr + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + + load_add_store_8x4 r0, r7, shiftbits=2 + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r9,lr} + vpush {q6-q7} + mov r9, #0 + mov_const r8, 2896*8*(1<<16) + movt r9, #2*(5793-4096)*8 + movrel_local r5, eob_16x32\hshort, 2 + + mov r6, #4*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 6 +2: + vdup.i32 d0, r8 + vmov.i32 q1, #0 + vmov.32 d0[1], r9 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x4_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s32, 1 + identity_8x4 d0[1] +.endif + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q12 + vqmovn.s32 d18, q9 + vqmovn.s32 d19, q13 + vqmovn.s32 d20, q10 + vqmovn.s32 d21, q14 + vqmovn.s32 d22, q11 + vqmovn.s32 d23, q15 + transpose_4x8h q8, q9, q10, q11 + +.if \w == 16 + load_add_store_8x4 r0, r7, shiftbits=2 +.else + load_add_store_8x4 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #8 + sub r0, r0, r1, lsl #2 + cmp r3, lr + add r0, r0, #2*8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12, lsl #1 + add r0, r0, r1, lsl #2 + mls r2, r6, r12, r2 + add r2, r2, #4*4 + b 1b +9: + vpop {q6-q7} + pop {r4-r9,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + push {r4-r5,lr} + vpush {q6-q7} + movrel_local r4, eob_8x32, 2 + + mov r12, #4*\h +1: + ldrh lr, [r4], #4 +.if \w == 8 + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r12 +.endr + + vqrshrn.s32 d16, q8, #1 + vqrshrn.s32 d17, q12, #1 + vqrshrn.s32 d18, q9, #1 + vqrshrn.s32 d19, q13, #1 + vqrshrn.s32 d20, q10, #1 + vqrshrn.s32 d21, q14, #1 + vqrshrn.s32 d22, q11, #1 + vqrshrn.s32 d23, q15, #1 + + transpose_4x8h q8, q9, q10, q11 + + cmp r3, lr + load_add_store_8x4 r0, r5, shiftbits=2 + blt 9f + sub r2, r2, r12, lsl #3 + add r2, r2, #4*4 +.else + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vld1.32 {q8, q9}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q10, q11}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q12, q13}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vld1.32 {q14, q15}, [r2, :128] + vst1.32 {q0, q1}, [r2, :128], r12 + vqmovn.s32 d16, q8 + vqmovn.s32 d17, q10 + vqmovn.s32 d20, q9 + vqmovn.s32 d21, q11 + vqmovn.s32 d18, q12 + vqmovn.s32 d19, q14 + vqmovn.s32 d22, q13 + vqmovn.s32 d23, q15 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + + cmp r3, lr + load_add_store_4x8 r0, r5, shiftbits=3 + blt 9f + sub r0, r0, r1, lsl #3 + add r0, r0, #2*4 +.endif + b 1b + +9: + vpop 
{q6-q7} + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_2s_x16_neon + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*32 + bl inv_txfm_horz_scale_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel r5, X(inv_dct_4h_x16_neon) + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 14 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #4*16 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32, 2 + + mov r8, #4*32 + mov r9, #32 + mov r6, sp +1: + vmov.i32 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.32 {\i}, [r2, :128] + vst1.32 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #4 + sub r2, r2, r8, lsl #3 + sub r9, r9, #4 + add r2, r2, #4*4 + + bl inv_dct_4s_x8_neon + + vqrshrn.s32 d16, q8, #2 + vqrshrn.s32 d18, q9, #2 + vqrshrn.s32 d20, q10, #2 + vqrshrn.s32 d22, q11, #2 + vqrshrn.s32 d17, q12, #2 + vqrshrn.s32 d19, q13, #2 + vqrshrn.s32 d21, q14, #2 + vqrshrn.s32 d23, q15, #2 + + transpose_4x8h q8, q9, q10, q11 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #4 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i*2) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + movrel_local r10, eob_8x32 + sub_sp_align 512 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*4) +.if \i > 0 + cmp r3, r11 + mov r8, #(8 - \i) + blt 1f +.if \i < 6 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #8*4 + bl inv_txfm_horz_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9, lsl #1 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.32 {q0, q1}, [r12, :128]! + + vqrdmulh.s32 d23, d16, d0[1] // t63a + vqrdmulh.s32 d16, d16, d0[0] // t32a + vqrdmulh.s32 d22, d17, d1[0] // t62a + vqrdmulh.s32 d17, d17, d1[1] // t33a + vqrdmulh.s32 d21, d18, d2[1] // t61a + vqrdmulh.s32 d18, d18, d2[0] // t34a + vqrdmulh.s32 d20, d19, d3[0] // t60a + vqrdmulh.s32 d19, d19, d3[1] // t35a + + vld1.32 {q0}, [r12, :128]! + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t33 + vqsub.s32 d26, d19, d18 // t34 + vqadd.s32 d27, d19, d18 // t35 + vqadd.s32 d28, d20, d21 // t60 + vqsub.s32 d29, d20, d21 // t61 + vqsub.s32 d30, d23, d22 // t62 + vqadd.s32 d31, d23, d22 // t63 + + vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a + vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a + vneg.s32 d4, d4 // t34a + vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a + vrshr.s32 d26, d4, #12 // t34a + vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a + vrshr.s32 d29, d6, #12 // t61a + vrshr.s32 d25, d8, #12 // t33a + vrshr.s32 d30, d4, #12 // t62a + + vqadd.s32 d16, d24, d27 // t32a + vqsub.s32 d19, d24, d27 // t35a + vqadd.s32 d17, d25, d26 // t33 + vqsub.s32 d18, d25, d26 // t34 + vqsub.s32 d20, d31, d28 // t60a + vqadd.s32 d23, d31, d28 // t63a + vqsub.s32 d21, d30, d29 // t61 + vqadd.s32 d22, d30, d29 // t62 + + vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a + vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a + vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60 + vrshr.s32 d21, d4, #12 // t61a + vrshr.s32 d18, d6, #12 // t34a + vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 + vrshr.s32 d20, d8, #12 // t60 + vrshr.s32 d19, d4, #12 // t35 + + vst1.32 {d16, d17, d18, d19}, [r6, :128]! + vst1.32 {d20, d21, d22, d23}, [r6, :128]! 
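+        // One group of eight t32-t63 intermediates; inv_dct64_step2_neon
+        // reads them back and finishes the odd half.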
+ + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.32 {q0}, [r12, :128] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #4*2*0] // t32a + vldr d17, [r9, #4*2*8] // t39a + vldr d18, [r9, #4*2*0] // t63a + vldr d19, [r6, #4*2*8] // t56a + vldr d20, [r6, #4*2*16] // t40a + vldr d21, [r9, #4*2*24] // t47a + vldr d22, [r9, #4*2*16] // t55a + vldr d23, [r6, #4*2*24] // t48a + + vqadd.s32 d24, d16, d17 // t32 + vqsub.s32 d25, d16, d17 // t39 + vqadd.s32 d26, d18, d19 // t63 + vqsub.s32 d27, d18, d19 // t56 + vqsub.s32 d28, d21, d20 // t40 + vqadd.s32 d29, d21, d20 // t47 + vqadd.s32 d30, d23, d22 // t48 + vqsub.s32 d31, d23, d22 // t55 + + vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a + vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a + vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a + vrshr.s32 d25, d4, #12 // t56a + vrshr.s32 d27, d6, #12 // t39a + vneg.s32 d8, d8 // t40a + vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a + vrshr.s32 d31, d8, #12 // t40a + vrshr.s32 d28, d4, #12 // t55a + + vqadd.s32 d16, d24, d29 // t32a + vqsub.s32 d19, d24, d29 // t47a + vqadd.s32 d17, d27, d31 // t39 + vqsub.s32 d18, d27, d31 // t40 + vqsub.s32 d20, d26, d30 // t48a + vqadd.s32 d23, d26, d30 // t63a + vqsub.s32 d21, d25, d28 // t55 + vqadd.s32 d22, d25, d28 // t56 + + vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a + vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a + vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47 + vrshr.s32 d18, d4, #12 // t40a + vrshr.s32 d21, d6, #12 // t55a + vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 + vrshr.s32 d19, d8, #12 // t47 + vrshr.s32 d20, d4, #12 // t48 + + vstr d16, [r6, #4*2*0] // t32a + vstr d17, [r9, #4*2*0] // t39 + vstr d18, [r6, #4*2*8] // t40a + vstr d19, [r9, #4*2*8] // t47 + vstr d20, [r6, #4*2*16] // t48 + vstr d21, [r9, #4*2*16] // t55a + vstr d22, [r6, #4*2*24] // t56 + vstr d23, [r9, #4*2*24] // t63a + + add r6, r6, #4*2 + sub r9, r9, #4*2 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.32 {\i}, [\src, :64] + vst1.32 {\zero}, [\src, :64], \strd +.else + vld1.32 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.32 {q8, q9}, [\dst, :128]! + vst1.32 {q10, q11}, [\dst, :128]! + vst1.32 {q12, q13}, [\dst, :128]! + vst1.32 {q14, q15}, [\dst, :128]! 
+.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i32 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i32 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + mov_const \gpr, \val + vdup.32 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.32 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_2s_x64_neon + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_2s_x16_neon + + store16 r6 + + movdup_if d0, r12, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_2s_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.32 {d2}, [r6, :64]! + vld1.32 {d3}, [r6, :64]! + vqadd.s32 d6, d2, \r0 + vqsub.s32 \r0, d2, \r0 + vld1.32 {d4}, [r6, :64]! + vqadd.s32 d7, d3, \r1 + vqsub.s32 \r1, d3, \r1 + vld1.32 {d5}, [r6, :64]! + vqadd.s32 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s32 \r2, d4, \r2 + vst1.32 {d6}, [r6, :64]! + vst1.32 {\r0}, [r10, :64], r9 + vqadd.s32 d3, d5, \r3 + vqsub.s32 \r3, d5, \r3 + vst1.32 {d7}, [r6, :64]! + vst1.32 {\r1}, [r10, :64], r9 + vst1.32 {d2}, [r6, :64]! + vst1.32 {\r2}, [r10, :64], r9 + vst1.32 {d3}, [r6, :64]! 
+ vst1.32 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.32 {d16}, [r7, :64] // in1 (offset 0) + vld1.32 {d17}, [r9, :64] // in31 (offset 15) + vld1.32 {d18}, [r10, :64] // in17 (offset 8) + vld1.32 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.32 {d16}, [r10, :64] // in7 (offset 3) + vld1.32 {d17}, [r11, :64] // in25 (offset 12) + vld1.32 {d18}, [r9, :64] // in23 (offset 11) + vld1.32 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.32 d16, [r10, :64] // in5 (offset 2) + vld1.32 d17, [r11, :64] // in27 (offset 13) + vld1.32 d18, [r9, :64] // in21 (offset 10) + vld1.32 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, 2896*8*(1<<16), \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.32 d16, [r10, :64] // in3 (offset 1) + vld1.32 d17, [r11, :64] // in29 (offset 14) + vld1.32 d18, [r9, :64] // in19 (offset 9) + vld1.32 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x2_neon + vdup.32 q4, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.32 {d16, d17, d18, d19}, [r7, :128]! + vld1.32 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.32 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.32 {d24, d25, d26, d27}, [r8, :128], r11 + vtrn.32 d16, d17 + vtrn.32 d18, d19 + vtrn.32 d20, d21 + vtrn.32 d22, d23 + vtrn.32 d31, d30 + vtrn.32 d29, d28 + vtrn.32 d27, d26 + vtrn.32 d25, d24 + +.macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 + vqsub.s32 d7, \src0, \src1 + vqsub.s32 d6, \src2, \src3 + vqsub.s32 d5, \src4, \src5 + vqsub.s32 d4, \src6, \src7 + vqadd.s32 d0, \src0, \src1 + vqadd.s32 d1, \src2, \src3 + vqadd.s32 d2, \src4, \src5 + vqadd.s32 d3, \src6, \src7 + vrshl.s32 q3, q3, q4 + vrshl.s32 q2, q2, q4 + vrshl.s32 q0, q0, q4 + vrshl.s32 q1, q1, q4 + vqmovn.s32 d7, q3 + vqmovn.s32 d6, q2 + vqmovn.s32 d0, q0 + vqmovn.s32 d1, q1 + vrev32.16 q3, q3 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q3}, [r9, :128], r10 +.endm + store_addsub d16, d31, d18, d29, d20, d27, d22, d25 + store_addsub d17, d30, d19, d28, d21, d26, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #1 + sub r9, r9, r10, lsl #1 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! + vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + + vmov.i16 q6, #0 + vmvn.i16 q7, #0xfc00 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + vld1.16 {d0}, [r6, :64], r1 + vld1.16 {d1}, [r9, :64], r10 + vqadd.s16 d4, \src0, \src1 + vld1.16 {d2}, [r6, :64] + vqsub.s16 d5, \src0, \src1 + vld1.16 {d3}, [r9, :64] + vqadd.s16 d6, \src2, \src3 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vqadd.s16 q2, q2, q0 + vqadd.s16 q3, q3, q1 + vmax.s16 q2, q2, q6 + vmax.s16 q3, q3, q6 + vmin.s16 q2, q2, q7 + vmin.s16 q3, q3, q7 + vst1.16 {d4}, [r6, :64], r1 + vst1.16 {d5}, [r9, :64], r10 + vst1.16 {d6}, [r6, :64], r1 + vst1.16 {d7}, [r9, :64], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_dct_clear_scale_2s_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_scale_dct_32x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 2, 4, 6, 8, 10, 12, 14 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*4) + mov r8, #16*4 + bl inv_txfm_dct_clear_2s_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x2_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: + movrel r5, X(inv_dct_4h_x16_neon) +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i*2) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4-q7} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_2s_x16_neon +.irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 30 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*4) + mov r8, #32*4 + bl inv_txfm_horz_16x2_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 2 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl X(inv_txfm_dct_4h_x64_neon) + add r6, r0, #(\i*2) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc diff --git a/src/arm/32/util.S b/src/arm/32/util.S index c8ac12bf03..c3710d3767 100644 --- a/src/arm/32/util.S +++ b/src/arm/32/util.S @@ -158,6 +158,14 @@ vtrn.8 \r2, \r3 .endm +.macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vswp \r1, \r4 // vtrn.64 \q0, \q2 + vswp \r3, \r6 // vtrn.64 \q1, \q3 + + vtrn.32 \q0, \q1 + vtrn.32 \q2, \q3 +.endm + .macro transpose_4x4h q0, q1, r0, r1, r2, r3 vtrn.32 \q0, \q1 From 4f9c18c3f5cdda51f0bcb9ea22652c5bba18b408 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Mon, 8 Mar 2021 13:15:48 +0100 Subject: [PATCH 136/155] Add colored help to the CLI --- Cargo.toml | 2 +- src/bin/common.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8c9ee8615d..ff8777f62b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ dump_lookahead_data = ["byteorder", "image"] arg_enum_proc_macro = "0.3" bitstream-io = "1" cfg-if = "1.0" -clap = { version = "2", optional = true, default-features = false } +clap = { version = "2", optional = true, default-features = false, features = ["color"] } libc = "0.2" y4m = { version = "0.7", optional = true } backtrace = { version = "0.3", optional = true } diff --git a/src/bin/common.rs b/src/bin/common.rs index 6706eae272..8a4d6cb6a7 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -89,6 +89,7 @@ pub fn parse_cli() -> Result { .about("AV1 video encoder") .setting(AppSettings::DeriveDisplayOrder) .setting(AppSettings::SubcommandsNegateReqs) + .setting(AppSettings::ColoredHelp) .arg(Arg::with_name("FULLHELP") .help("Prints more detailed help information") .long("fullhelp")) From 07a46a77ee5a823e25cf50bd5d1c072330a2ac7a Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Mar 2021 17:50:47 +0900 Subject: [PATCH 137/155] Extract ssim_boost to a function --- src/rdo.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/rdo.rs b/src/rdo.rs index f065ded1cd..6b79f3c732 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -147,8 +147,6 @@ fn cdef_dist_wxh_8x8( debug_assert!(src2.plane_cfg.xdec == 0); debug_assert!(src2.plane_cfg.ydec == 0); - let coeff_shift = bit_depth - 8; - // Sum into columns to improve auto-vectorization let mut sum_s_cols: [u16; 8] = [0; 8]; let mut sum_d_cols: [u16; 8] = [0; 8]; @@ -197,11 +195,19 @@ fn 
cdef_dist_wxh_8x8( let svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); let dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as f64; + RawDistortion::new( + (sse * ssim_boost(svar, dvar, bit_depth) + 0.5_f64) as u64, + ) +} + +#[inline(always)] +fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> f64 { + let coeff_shift = bit_depth - 8; + //The two constants were tuned for CDEF, but can probably be better tuned for use in general RDO - let ssim_boost = (4033_f64 / 16_384_f64) + (4033_f64 / 16_384_f64) * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64 - / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64); - RawDistortion::new((sse * ssim_boost + 0.5_f64) as u64) + / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64) } #[allow(unused)] From 6704be90c655710be16ad9de9eb29bc37c481540 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 2 Mar 2021 21:51:10 +0900 Subject: [PATCH 138/155] Round ssim_boost to a 12.12 fixed-point value --- src/rdo.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/rdo.rs b/src/rdo.rs index 6b79f3c732..f82238c436 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -194,20 +194,20 @@ fn cdef_dist_wxh_8x8( // Use sums to calculate distortion let svar = sum_s2 - ((sum_s * sum_s + 32) >> 6); let dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6); - let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as f64; - RawDistortion::new( - (sse * ssim_boost(svar, dvar, bit_depth) + 0.5_f64) as u64, - ) + let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as u64; + RawDistortion::new(ssim_boost(svar, dvar, bit_depth).mul_u64(sse)) } #[inline(always)] -fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> f64 { +fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale { let coeff_shift = bit_depth - 8; //The two constants were tuned for CDEF, but can probably be better tuned for use in general RDO - (4033_f64 / 16_384_f64) - * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64 - / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64) + DistortionScale::new( + (4033_f64 / 16_384_f64) + * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64 + / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64), + ) } #[allow(unused)] From 9d39aeac14353d206f6ab1c842cd935183962248 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Sat, 6 Mar 2021 17:51:00 +0900 Subject: [PATCH 139/155] Drop unused ActivityMask methods Rather than accessing a value at a time, we will project into a Vec. 
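As a rough sketch of where this is heading (names such as `ssim_boost`
and `DistortionScale` refer to code added later in this series;
everything here is illustrative, not the final implementation), the mask
keeps one variance per 8x8 block in a flat buffer and projects it into
per-block scales in a single pass:

    // Sketch only, mirroring the fill_scales() projection added later.
    use crate::rdo::{ssim_boost, DistortionScale};

    fn project_scales(
      variances: &[u32], bit_depth: usize,
    ) -> Vec<DistortionScale> {
      variances
        .iter()
        .map(|&v| ssim_boost(v as i64, v as i64, bit_depth))
        .collect()
    }

The per-block accessors removed below become unnecessary once callers
consume that buffer directly.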
--- src/activity.rs | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/src/activity.rs b/src/activity.rs index 29369e34bb..d0dd7df22a 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -73,45 +73,4 @@ impl ActivityMask { } ActivityMask { variances, width, height, granularity } } - - pub fn variance_at(&self, x: usize, y: usize) -> Option { - let (dec_width, dec_height) = - (self.width >> self.granularity, self.height >> self.granularity); - if x > dec_width || y > dec_height { - None - } else { - Some(*self.variances.get(x + dec_width * y).unwrap()) - } - } - - pub fn mean_activity_of(&self, rect: Rect) -> Option { - let Rect { x, y, width, height } = rect; - let (x, y) = (x as usize, y as usize); - let granularity = self.granularity; - let (dec_x, dec_y) = (x >> granularity, y >> granularity); - let (dec_width, dec_height) = - (width >> granularity, height >> granularity); - - if x > self.width - || y > self.height - || (x + width) > self.width - || (y + height) > self.height - || dec_width == 0 - || dec_height == 0 - { - // Region lies out of the frame or is smaller than 8x8 on some axis - None - } else { - let activity = self - .variances - .chunks_exact(self.width >> granularity) - .skip(dec_y) - .take(dec_height) - .map(|row| row.iter().skip(dec_x).take(dec_width).sum::()) - .sum::() - / (dec_width as f64 * dec_height as f64); - - Some(activity.cbrt().sqrt()) - } - } } From 8003c93c810b9d2303c6c4c50ce2fe0d0eb2ff67 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Wed, 3 Mar 2021 16:17:56 +0900 Subject: [PATCH 140/155] Match variance values in ActivityMask with cdef_dist_wxh_8x8 --- src/activity.rs | 56 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/activity.rs b/src/activity.rs index d0dd7df22a..e8318cc382 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -10,11 +10,12 @@ use crate::frame::*; use crate::tiling::*; use crate::util::*; +use itertools::izip; use rust_hawktracer::*; #[derive(Debug, Default, Clone)] pub struct ActivityMask { - variances: Vec, + variances: Vec, // Width and height of the original frame that is masked width: usize, height: usize, @@ -50,27 +51,44 @@ impl ActivityMask { }; let block = luma.subregion(block_rect); - - let mean: f64 = block - .rows_iter() - .flatten() - .map(|&pix| { - let pix: i16 = CastFromPrimitive::cast_from(pix); - pix as f64 - }) - .sum::() - / 64.0_f64; - let variance: f64 = block - .rows_iter() - .flatten() - .map(|&pix| { - let pix: i16 = CastFromPrimitive::cast_from(pix); - (pix as f64 - mean).powi(2) - }) - .sum::(); + let variance = variance_8x8(&block); variances.push(variance); } } ActivityMask { variances, width, height, granularity } } } + +// Adapted from the source variance calculation in cdef_dist_wxh_8x8. +#[inline(never)] +fn variance_8x8(src: &PlaneRegion<'_, T>) -> u32 { + debug_assert!(src.plane_cfg.xdec == 0); + debug_assert!(src.plane_cfg.ydec == 0); + + // Sum into columns to improve auto-vectorization + let mut sum_s_cols: [u16; 8] = [0; 8]; + let mut sum_s2_cols: [u32; 8] = [0; 8]; + + // Check upfront that 8 rows are available. 
+ let _row = &src[7]; + + for j in 0..8 { + let row = &src[j][0..8]; + for (sum_s, sum_s2, s) in izip!(&mut sum_s_cols, &mut sum_s2_cols, row) { + // Don't convert directly to u32 to allow better vectorization + let s: u16 = u16::cast_from(*s); + *sum_s += s; + + // Convert to u32 to avoid overflows when multiplying + let s: u32 = s as u32; + *sum_s2 += s * s; + } + } + + // Sum together the sum of columns + let sum_s = sum_s_cols.iter().map(|&a| u32::cast_from(a)).sum::(); + let sum_s2 = sum_s2_cols.iter().sum::(); + + // Use sums to calculate variance + sum_s2 - ((sum_s * sum_s + 32) >> 6) +} From 3255c9beb4a69bd8b2fd71a35e6a9d496f577f60 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Wed, 3 Mar 2021 17:17:05 +0900 Subject: [PATCH 141/155] Store ActivityMask with (w_in_imp_b, h_in_imp_b) dimensions --- src/activity.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/activity.rs b/src/activity.rs index e8318cc382..e589b65c55 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -15,12 +15,10 @@ use rust_hawktracer::*; #[derive(Debug, Default, Clone)] pub struct ActivityMask { - variances: Vec, + variances: Box<[u32]>, // Width and height of the original frame that is masked width: usize, height: usize, - // Side of unit (square) activity block in log2 - granularity: usize, } impl ActivityMask { @@ -28,24 +26,25 @@ impl ActivityMask { pub fn from_plane(luma_plane: &Plane) -> ActivityMask { let PlaneConfig { width, height, .. } = luma_plane.cfg; - let granularity = 3; + // Width and height are padded to 8×8 block size. + let w_in_imp_b = width.align_power_of_two_and_shift(3); + let h_in_imp_b = height.align_power_of_two_and_shift(3); let aligned_luma = Rect { x: 0_isize, y: 0_isize, - width: (width >> granularity) << granularity, - height: (height >> granularity) << granularity, + width: w_in_imp_b << 3, + height: h_in_imp_b << 3, }; let luma = PlaneRegion::new(luma_plane, aligned_luma); - let mut variances = - Vec::with_capacity((height >> granularity) * (width >> granularity)); + let mut variances = Vec::with_capacity(w_in_imp_b * h_in_imp_b); - for y in 0..height >> granularity { - for x in 0..width >> granularity { + for y in 0..h_in_imp_b { + for x in 0..w_in_imp_b { let block_rect = Area::Rect { - x: (x << granularity) as isize, - y: (y << granularity) as isize, + x: (x << 3) as isize, + y: (y << 3) as isize, width: 8, height: 8, }; @@ -55,7 +54,7 @@ impl ActivityMask { variances.push(variance); } } - ActivityMask { variances, width, height, granularity } + ActivityMask { variances: variances.into_boxed_slice(), width, height } } } From c1de05dab4cbe98be14f08aacb2f714b34c5639d Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Wed, 3 Mar 2021 15:56:05 +0900 Subject: [PATCH 142/155] Add activity_scales to FrameInvariants --- src/activity.rs | 10 ++++++++++ src/api/internal.rs | 17 +++++++++++++---- src/encoder.rs | 7 +++++++ src/rdo.rs | 2 +- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/activity.rs b/src/activity.rs index e589b65c55..5c0a764d0c 100644 --- a/src/activity.rs +++ b/src/activity.rs @@ -8,6 +8,7 @@ // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
use crate::frame::*; +use crate::rdo::{ssim_boost, DistortionScale}; use crate::tiling::*; use crate::util::*; use itertools::izip; @@ -56,6 +57,15 @@ impl ActivityMask { } ActivityMask { variances: variances.into_boxed_slice(), width, height } } + + #[hawktracer(activity_mask_fill_scales)] + pub fn fill_scales( + &self, bit_depth: usize, activity_scales: &mut Box<[DistortionScale]>, + ) { + for (dst, &src) in activity_scales.iter_mut().zip(self.variances.iter()) { + *dst = ssim_boost(src as i64, src as i64, bit_depth); + } + } } // Adapted from the source variance calculation in cdef_dist_wxh_8x8. diff --git a/src/api/internal.rs b/src/api/internal.rs index b2ab16cd7c..57bda98002 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -1165,6 +1165,19 @@ impl ContextInner { ); frame_data.fi.set_quantizers(&qps); + if self.config.tune == Tune::Psychovisual { + let frame = + self.frame_q[&frame_data.fi.input_frameno].as_ref().unwrap(); + frame_data.fi.activity_mask = + ActivityMask::from_plane(&frame.planes[0]); + frame_data.fi.activity_mask.fill_scales( + frame_data.fi.sequence.bit_depth, + &mut frame_data.fi.activity_scales, + ); + } else { + frame_data.fi.activity_mask = ActivityMask::default(); + } + if self.rc_state.needs_trial_encode(fti) { let mut trial_fs = frame_data.fs.clone(); let data = @@ -1186,10 +1199,6 @@ impl ContextInner { frame_data.fi.set_quantizers(&qps); } - // TODO: replace with ActivityMask::from_plane() when - // the activity mask is actually used. - frame_data.fi.activity_mask = ActivityMask::default(); - let data = encode_frame(&frame_data.fi, &mut frame_data.fs, &self.inter_cfg); let enc_stats = frame_data.fs.enc_stats.clone(); diff --git a/src/encoder.rs b/src/encoder.rs index 8098285197..37d6af8553 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -606,6 +606,8 @@ pub struct FrameInvariants { pub block_importances: Box<[f32]>, /// Pre-computed distortion_scale. pub distortion_scales: Box<[DistortionScale]>, + /// Pre-computed activity_scale. + pub activity_scales: Box<[DistortionScale]>, /// Target CPU feature level. 
pub cpu_feature_level: crate::cpu_features::CpuFeatureLevel, @@ -747,6 +749,11 @@ impl FrameInvariants { w_in_imp_b * h_in_imp_b ] .into_boxed_slice(), + activity_scales: vec![ + DistortionScale::default(); + w_in_imp_b * h_in_imp_b + ] + .into_boxed_slice(), cpu_feature_level: Default::default(), activity_mask: Default::default(), enable_segmentation: config.speed_settings.enable_segmentation, diff --git a/src/rdo.rs b/src/rdo.rs index f82238c436..2a60adb818 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -199,7 +199,7 @@ fn cdef_dist_wxh_8x8( } #[inline(always)] -fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale { +pub fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale { let coeff_shift = bit_depth - 8; //The two constants were tuned for CDEF, but can probably be better tuned for use in general RDO From 1094586b237f39c880bd45e4ca2aec94d42a74da Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Thu, 4 Mar 2021 17:00:07 +0900 Subject: [PATCH 143/155] Derive segmentation index from spatiotemporal scaling --- src/context/superblock_unit.rs | 5 ++++ src/rdo.rs | 43 ++++++++++++++++++--------- src/segmentation.rs | 53 ++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 14 deletions(-) diff --git a/src/context/superblock_unit.rs b/src/context/superblock_unit.rs index 5957c9a131..7f0aa827cf 100644 --- a/src/context/superblock_unit.rs +++ b/src/context/superblock_unit.rs @@ -27,6 +27,11 @@ pub const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2; pub const IMPORTANCE_BLOCK_TO_BLOCK_SHIFT: usize = 1; pub const LOCAL_BLOCK_MASK: usize = (1 << SUPERBLOCK_TO_BLOCK_SHIFT) - 1; +pub const MAX_SB_IN_IMP_B: usize = 1 + << (MAX_SB_SIZE_LOG2 + - IMPORTANCE_BLOCK_TO_BLOCK_SHIFT + - BLOCK_TO_PLANE_SHIFT); + /// Absolute offset in superblocks, where a superblock is defined /// to be an N*N square where N = (1 << SUPERBLOCK_TO_PLANE_SHIFT). #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/src/rdo.rs b/src/rdo.rs index 2a60adb818..0ef37665d9 100644 --- a/src/rdo.rs +++ b/src/rdo.rs @@ -524,6 +524,31 @@ pub fn distortion_scale( fi.distortion_scales[y * fi.w_in_imp_b + x] } +pub fn spatiotemporal_scale( + fi: &FrameInvariants, frame_bo: PlaneBlockOffset, bsize: BlockSize, +) -> DistortionScale { + if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual { + return DistortionScale::default(); + } + + let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; + let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; + let x1 = (x0 + bsize.width_imp_b()).min(fi.w_in_imp_b); + let y1 = (y0 + bsize.height_imp_b()).min(fi.h_in_imp_b); + let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT; + + let mut sum = 0; + for y in y0..y1 { + sum += fi.distortion_scales[y * fi.w_in_imp_b..][x0..x1] + .iter() + .zip(fi.activity_scales[y * fi.w_in_imp_b..][x0..x1].iter()) + .take(MAX_SB_IN_IMP_B) + .map(|(d, a)| d.0 as u64 * a.0 as u64) + .sum::(); + } + DistortionScale(((sum + (den >> 1)) / den) as u32) +} + pub fn distortion_scale_for( propagate_cost: f64, intra_cost: f64, ) -> DistortionScale { @@ -605,7 +630,7 @@ impl DistortionScale { /// Multiply, round and shift /// Internal implementation, so don't use multiply trait. 
#[inline] - fn mul_u64(self, dist: u64) -> u64 { + pub fn mul_u64(self, dist: u64) -> u64 { (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT } } @@ -817,21 +842,11 @@ fn luma_chroma_mode_rdo( // Find the best chroma prediction mode for the current luma prediction mode let mut chroma_rdo = |skip: bool| -> bool { - let mut zero_distortion = false; + use crate::segmentation::select_segment; - // If skip is true or segmentation is turned off, sidx is not coded. - let sidx_range = if skip || !fi.enable_segmentation { - 0..=0 - } else if fi.base_q_idx as i16 - + ts.segmentation.data[2][SegLvl::SEG_LVL_ALT_Q as usize] - < 1 - { - 0..=1 - } else { - 0..=2 - }; + let mut zero_distortion = false; - for sidx in sidx_range { + for sidx in select_segment(fi, ts, tile_bo, bsize, skip) { cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx); let (tx_size, tx_type) = rdo_tx_size_type( diff --git a/src/segmentation.rs b/src/segmentation.rs index ecbfdf7464..4b21fd7210 100644 --- a/src/segmentation.rs +++ b/src/segmentation.rs @@ -9,6 +9,8 @@ use crate::context::*; use crate::header::PRIMARY_REF_NONE; +use crate::partition::BlockSize; +use crate::tiling::TileStateMut; use crate::util::Pixel; use crate::FrameInvariants; use crate::FrameState; @@ -69,3 +71,54 @@ pub fn segmentation_optimize( } } } + +pub fn select_segment( + fi: &FrameInvariants, ts: &TileStateMut<'_, T>, tile_bo: TileBlockOffset, + bsize: BlockSize, skip: bool, +) -> std::ops::RangeInclusive { + use crate::rdo::spatiotemporal_scale; + use arrayvec::ArrayVec; + + // If skip is true or segmentation is turned off, sidx is not coded. + if skip || !fi.enable_segmentation { + return 0..=0; + } + + let frame_bo = ts.to_frame_block_offset(tile_bo); + let scale = spatiotemporal_scale(fi, frame_bo, bsize); + + // TODO: Replace this calculation with precomputed scale thresholds. + let segment_2_is_lossless = fi.base_q_idx as i16 + + ts.segmentation.data[2][SegLvl::SEG_LVL_ALT_Q as usize] + < 1; + + let seg_ac_q: ArrayVec<[_; 3]> = if fi.enable_segmentation { + use crate::quantize::ac_q; + (0..=2) + .map(|sidx| { + ac_q( + (fi.base_q_idx as i16 + + ts.segmentation.data[sidx][SegLvl::SEG_LVL_ALT_Q as usize]) + .max(0) + .min(255) as u8, + 0, + fi.sequence.bit_depth, + ) + }) + .collect() + } else { + Default::default() + }; + + let sidx = if scale.mul_u64(seg_ac_q[1] as u64) < seg_ac_q[0] as u64 { + 1 + } else if !segment_2_is_lossless + && scale.mul_u64(seg_ac_q[2] as u64) > seg_ac_q[0] as u64 + { + 2 + } else { + 0 + }; + + sidx..=sidx +} From d5df5b7b5a106e7430f8d285a50d1f9ddd9f4413 Mon Sep 17 00:00:00 2001 From: David Michael Barr Date: Tue, 9 Mar 2021 17:53:04 +0900 Subject: [PATCH 144/155] Add search levels for segmentation Restore full search for speed preset 0. Enable simple segmentation for all other speed presets. --- src/api/config/speedsettings.rs | 55 +++++++++++++++++++++++++++------ src/encoder.rs | 3 +- src/segmentation.rs | 13 +++++--- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/src/api/config/speedsettings.rs b/src/api/config/speedsettings.rs index 127afea9e6..be2c9a62fc 100644 --- a/src/api/config/speedsettings.rs +++ b/src/api/config/speedsettings.rs @@ -73,8 +73,10 @@ pub struct SpeedSettings { /// Enabled is slower. pub non_square_partition: bool, - /// Use segmentation. - pub enable_segmentation: bool, + /// Search level for segmentation. + /// + /// Full search is at least twice as slow. + pub segmentation: SegmentationLevel, /// Enable tx split for inter mode block. 
pub enable_inter_tx_split: bool, @@ -114,7 +116,7 @@ impl Default for SpeedSettings { sgr_complexity: SGRComplexityLevel::Full, use_satd_subpel: true, non_square_partition: true, - enable_segmentation: true, + segmentation: SegmentationLevel::Full, enable_inter_tx_split: false, fine_directional_intra: false, } @@ -159,7 +161,7 @@ impl SpeedSettings { sgr_complexity: Self::sgr_complexity_preset(speed), use_satd_subpel: Self::use_satd_subpel(speed), non_square_partition: Self::non_square_partition_preset(speed), - enable_segmentation: Self::enable_segmentation_preset(speed), + segmentation: Self::segmentation_preset(speed), enable_inter_tx_split: Self::enable_inter_tx_split_preset(speed), fine_directional_intra: Self::fine_directional_intra_preset(speed), } @@ -270,11 +272,12 @@ impl SpeedSettings { speed == 0 } - // FIXME: this is currently only enabled at speed 0 because choosing a segment - // requires doing RDO, but once that is replaced by a less bruteforce - // solution we should be able to enable segmentation at all speeds. - const fn enable_segmentation_preset(speed: usize) -> bool { - speed == 0 + fn segmentation_preset(speed: usize) -> SegmentationLevel { + if speed == 0 { + SegmentationLevel::Full + } else { + SegmentationLevel::Simple + } } // FIXME: With unknown reasons, inter_tx_split does not work if reduced_tx_set is false @@ -373,3 +376,37 @@ impl fmt::Display for SGRComplexityLevel { ) } } + +/// Search level for segmentation +#[derive( + Clone, + Copy, + Debug, + PartialOrd, + PartialEq, + FromPrimitive, + Serialize, + Deserialize, +)] +pub enum SegmentationLevel { + /// No segmentation is signalled. + Disabled, + /// Segmentation index is derived from source statistics. + Simple, + /// Search all segmentation indices. + Full, +} + +impl fmt::Display for SegmentationLevel { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!( + f, + "{}", + match self { + SegmentationLevel::Disabled => "Disabled", + SegmentationLevel::Simple => "Simple", + SegmentationLevel::Full => "Full", + } + ) + } +} diff --git a/src/encoder.rs b/src/encoder.rs index 37d6af8553..ee14af1bec 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -756,7 +756,8 @@ impl FrameInvariants { .into_boxed_slice(), cpu_feature_level: Default::default(), activity_mask: Default::default(), - enable_segmentation: config.speed_settings.enable_segmentation, + enable_segmentation: config.speed_settings.segmentation + != SegmentationLevel::Disabled, enable_inter_txfm_split: config.speed_settings.enable_inter_tx_split, sequence, config, diff --git a/src/segmentation.rs b/src/segmentation.rs index 4b21fd7210..793a9129f5 100644 --- a/src/segmentation.rs +++ b/src/segmentation.rs @@ -76,6 +76,7 @@ pub fn select_segment( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, skip: bool, ) -> std::ops::RangeInclusive { + use crate::api::SegmentationLevel; use crate::rdo::spatiotemporal_scale; use arrayvec::ArrayVec; @@ -84,14 +85,18 @@ pub fn select_segment( return 0..=0; } - let frame_bo = ts.to_frame_block_offset(tile_bo); - let scale = spatiotemporal_scale(fi, frame_bo, bsize); - - // TODO: Replace this calculation with precomputed scale thresholds. 
let segment_2_is_lossless = fi.base_q_idx as i16 + ts.segmentation.data[2][SegLvl::SEG_LVL_ALT_Q as usize] < 1; + if fi.config.speed_settings.segmentation == SegmentationLevel::Full { + return if segment_2_is_lossless { 0..=1 } else { 0..=2 }; + } + + let frame_bo = ts.to_frame_block_offset(tile_bo); + let scale = spatiotemporal_scale(fi, frame_bo, bsize); + + // TODO: Replace this calculation with precomputed scale thresholds. let seg_ac_q: ArrayVec<[_; 3]> = if fi.enable_segmentation { use crate::quantize::ac_q; (0..=2) From 3fa9bf0934b651eebeb5a0b098e7062f0f291668 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 9 Mar 2021 15:02:30 +0100 Subject: [PATCH 145/155] Use the current threadpool if no thread nor threadpool are specified Fixes #2684. --- src/api/channel.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/api/channel.rs b/src/api/channel.rs index ff22edf6c4..e575198058 100644 --- a/src/api/channel.rs +++ b/src/api/channel.rs @@ -238,16 +238,18 @@ pub type VideoDataChannel = (FrameSender, PacketReceiver); impl Config { fn setup( &self, - ) -> Result<(ContextInner, Arc), InvalidConfig> { + ) -> Result<(ContextInner, Option>), InvalidConfig> { self.validate()?; let inner = self.new_inner()?; let pool = if let Some(ref p) = self.pool { - p.clone() + Some(p.clone()) + } else if self.threads == 0 { + None } else { let pool = ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); - Arc::new(pool) + Some(Arc::new(pool)) }; Ok((inner, pool)) @@ -463,7 +465,7 @@ impl Config { let pass_channel = (rc_data_sender, rc_data_receiver); - pool.spawn(move || { + let run = move || { for f in receive_frame.iter() { // info!("frame in {}", inner.frame_count); while !inner.needs_more_fi_lookahead() { @@ -514,7 +516,13 @@ impl Config { } send_rc_pass1.send_pass_summary(&mut inner.rc_state); - }); + }; + + if let Some(pool) = pool { + pool.spawn(run); + } else { + rayon::spawn(run); + } Ok((channel, pass_channel)) } From 218308a157699a7d7c582980dd132caa014f847d Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Tue, 9 Mar 2021 15:14:17 +0100 Subject: [PATCH 146/155] Document the behavior --- src/api/config/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index d681c0706a..0644f9c00c 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -151,6 +151,9 @@ impl Config { /// /// The threadpool is shared across all the different parallel /// components in the encoder. + /// + /// If it is left unset `new_context()` will create a new default one + /// `new_*_channel()` will use the current global threadpool if present. pub fn with_threads(mut self, threads: usize) -> Self { self.threads = threads; self @@ -166,6 +169,8 @@ impl Config { #[cfg(feature = "unstable")] /// Use the provided threadpool + /// + /// It takes priority over `with_threads()` pub fn with_thread_pool(mut self, pool: Arc) -> Self { self.pool = Some(pool); self From 6e1cbdf189c595a409fdd5b045d5b1898a162fad Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Wed, 10 Mar 2021 13:38:30 +0000 Subject: [PATCH 147/155] Use global threadpool unless configured otherwise Follow-up to #2685, using the global threadpool in one more place. Fixes #2684. 
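The mechanism is the same as in the channel API change from #2685: the
pool stays an Option and work is run through install() only when a pool
was actually configured; otherwise the closure runs directly and rayon's
default global pool is used. A sketch of the dispatch in the context.rs
hunks below:

    // Run on the user-configured pool if present, otherwise fall back
    // to rayon's default global pool.
    match &self.pool {
      Some(pool) => pool.install(run),
      None => run(),
    }

Keeping `pool` as an `Option<Arc<ThreadPool>>` avoids spinning up a
private pool that the caller never asked for.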
--- src/api/config/mod.rs | 8 +++++--- src/api/context.rs | 16 +++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index 0644f9c00c..e8bfcb39c6 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -253,11 +253,13 @@ impl Config { let inner = self.new_inner()?; let config = *inner.config; let pool = if let Some(ref p) = self.pool { - p.clone() - } else { + Some(p.clone()) + } else if self.threads != 0 { let pool = ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); - Arc::new(pool) + Some(Arc::new(pool)) + } else { + None }; Ok(Context { is_flushing: false, inner, pool, config }) diff --git a/src/api/context.rs b/src/api/context.rs index 7d2b9d1993..e607bbc22e 100644 --- a/src/api/context.rs +++ b/src/api/context.rs @@ -29,7 +29,7 @@ use std::sync::Arc; pub struct Context { pub(crate) inner: ContextInner, pub(crate) config: EncoderConfig, - pub(crate) pool: Arc, + pub(crate) pool: Option>, pub(crate) is_flushing: bool, } @@ -122,9 +122,12 @@ impl Context { } let inner = &mut self.inner; - let pool = &mut self.pool; + let run = move || inner.send_frame(frame, params); - pool.install(|| inner.send_frame(frame, params)) + match &self.pool { + Some(pool) => pool.install(run), + None => run(), + } } /// Returns the first-pass data of a two-pass encode for the frame that was @@ -285,9 +288,12 @@ impl Context { #[inline] pub fn receive_packet(&mut self) -> Result, EncoderStatus> { let inner = &mut self.inner; - let pool = &mut self.pool; + let mut run = move || inner.receive_packet(); - pool.install(|| inner.receive_packet()) + match &self.pool { + Some(pool) => pool.install(run), + None => run(), + } } /// Flushes the encoder. From 14df852b38565ae43b13290011854d82d60fa32d Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Wed, 10 Mar 2021 14:03:03 +0000 Subject: [PATCH 148/155] Minor doc nitpick Global pool is always present / created by Rayon. --- src/api/config/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index e8bfcb39c6..36ad4e27b3 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -153,7 +153,7 @@ impl Config { /// components in the encoder. /// /// If it is left unset `new_context()` will create a new default one - /// `new_*_channel()` will use the current global threadpool if present. + /// `new_*_channel()` will use the default global threadpool. 
pub fn with_threads(mut self, threads: usize) -> Self { self.threads = threads; self From 647662ae47fa39ab936d75dabbbc1818fcd058a7 Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Wed, 10 Mar 2021 14:09:01 +0000 Subject: [PATCH 149/155] Extract a single Config::new_thread_pool helper --- src/api/channel.rs | 10 +--------- src/api/config/mod.rs | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/api/channel.rs b/src/api/channel.rs index e575198058..dc04b3a859 100644 --- a/src/api/channel.rs +++ b/src/api/channel.rs @@ -242,15 +242,7 @@ impl Config { self.validate()?; let inner = self.new_inner()?; - let pool = if let Some(ref p) = self.pool { - Some(p.clone()) - } else if self.threads == 0 { - None - } else { - let pool = - ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); - Some(Arc::new(pool)) - }; + let pool = self.new_thread_pool(); Ok((inner, pool)) } diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index 36ad4e27b3..795fbfef23 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -234,6 +234,21 @@ impl Config { Ok(inner) } + + /// Create a new threadpool with this configuration if set, + /// or return `None` if global threadpool should be used instead. + pub(crate) fn new_thread_pool(&self) -> Option> { + if let Some(ref p) = self.pool { + Some(p.clone()) + } else if self.threads != 0 { + let pool = + ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); + Some(Arc::new(pool)) + } else { + None + } + } + /// Creates a [`Context`] with this configuration. /// /// # Examples @@ -252,15 +267,7 @@ impl Config { pub fn new_context(&self) -> Result, InvalidConfig> { let inner = self.new_inner()?; let config = *inner.config; - let pool = if let Some(ref p) = self.pool { - Some(p.clone()) - } else if self.threads != 0 { - let pool = - ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); - Some(Arc::new(pool)) - } else { - None - }; + let pool = self.new_thread_pool(); Ok(Context { is_flushing: false, inner, pool, config }) } From a07fe5277b50f991e808204630e2934a5c872059 Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Wed, 10 Mar 2021 14:37:26 +0000 Subject: [PATCH 150/155] Reword doc comment --- src/api/config/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/config/mod.rs b/src/api/config/mod.rs index 795fbfef23..9b2d77e264 100644 --- a/src/api/config/mod.rs +++ b/src/api/config/mod.rs @@ -152,8 +152,8 @@ impl Config { /// The threadpool is shared across all the different parallel /// components in the encoder. /// - /// If it is left unset `new_context()` will create a new default one - /// `new_*_channel()` will use the default global threadpool. + /// If it is left unset, the encoder will use the default global + /// threadpool provided by Rayon instead. pub fn with_threads(mut self, threads: usize) -> Self { self.threads = threads; self From 8e8c9c70a1186919bd94c593037843adcb12fa01 Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Wed, 10 Mar 2021 14:00:17 +0000 Subject: [PATCH 151/155] Allow Rayon if Wasm is built with atomics support `target_feature = "atomics"` currently can be only enabled 1) on nightly and 2) via explicit rustc `-C target-feature=+atomics` flag, so this doesn't change anything for normal builds. However, if user explicitly opted-in to support for Wasm threads using that flag on nightly, then this change allows to use rav1e multithreading on Wasm too. 
The only other limitation is that APIs like `with_threads` will fail even with this flag - user must use preconfigured Wasm-compatible global rayon pool (which is possible after #2685 + #2686) and not locally constructed pools. --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 57d9184401..fc1ea4d416 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,7 +87,7 @@ mod wasm_bindgen { mod rayon { cfg_if::cfg_if! { - if #[cfg(target_arch="wasm32")] { + if #[cfg(all(target_arch="wasm32", not(target_feature = "atomics")))] { pub struct ThreadPoolBuilder (); impl ThreadPoolBuilder { pub fn new() -> ThreadPoolBuilder { From 2b289f9a56908f08e9962b62248db6c5546e0746 Mon Sep 17 00:00:00 2001 From: Ewout ter Hoeven Date: Thu, 11 Mar 2021 13:52:26 +0100 Subject: [PATCH 152/155] CI: Add Windows Arm64 job Adds a Windows job targeting the Arm64 platform. Moves Nasm installation to ilammy/setup-nasm and upgrades to Cargo-C v0.7.3 --- .github/workflows/rav1e.yml | 45 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/.github/workflows/rav1e.yml b/.github/workflows/rav1e.yml index b0d911762c..30a2822504 100644 --- a/.github/workflows/rav1e.yml +++ b/.github/workflows/rav1e.yml @@ -393,20 +393,17 @@ jobs: sccache --stop-server build-windows: - strategy: matrix: - conf: - - cargo-build - - cargo-test - - cargo-c include: - conf: cargo-build - toolchain: stable + target: x86_64-pc-windows-msvc + - conf: cargo-build + target: aarch64-pc-windows-msvc - conf: cargo-test - toolchain: stable + target: x86_64-pc-windows-msvc - conf: cargo-c - toolchain: stable-x86_64-pc-windows-gnu + target: x86_64-pc-windows-gnu env: RUST_BACKTRACE: full @@ -417,11 +414,11 @@ jobs: if: >- (github.event_name == 'push' && !endsWith(github.event.head_commit.message, 'CI: skip')) || (github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.names, 'skip-ci')) - runs-on: windows-latest steps: - uses: actions/checkout@v2 + - uses: ilammy/setup-nasm@v1 - name: Install sccache run: | $LINK = "https://github.com/mozilla/sccache/releases/download/0.2.12" @@ -429,30 +426,32 @@ jobs: curl -LO "$LINK/$SCCACHE_FILE.tar.gz" tar xzf "$SCCACHE_FILE.tar.gz" echo "$Env:GITHUB_WORKSPACE/$SCCACHE_FILE" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install nasm - run: | - $NASM_VERSION="2.15.05" - $LINK = "https://www.nasm.us/pub/nasm/releasebuilds/$NASM_VERSION/win64" - $NASM_FILE = "nasm-$NASM_VERSION-win64" - curl --ssl-no-revoke -LO "$LINK/$NASM_FILE.zip" - 7z e -y "$NASM_FILE.zip" -o"C:\nasm" - echo "C:\nasm" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Set MSVC x86_64 linker path + if: matrix.target != 'aarch64-pc-windows-msvc' run: | $LinkGlob = "VC\Tools\MSVC\*\bin\Hostx64\x64" $env:PATH = "$env:PATH;${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer" $LinkPath = vswhere -latest -products * -find "$LinkGlob" | Select-Object -Last 1 echo "$LinkPath" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Set MSVC Arm64 linker path + if: matrix.target == 'aarch64-pc-windows-msvc' + run: | + $LinkGlob = "VC\Tools\MSVC\*\bin\Hostx64\Arm64" + $env:PATH = "$env:PATH;${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer" + $LinkPath = vswhere -latest -products * -find "$LinkGlob" | Select-Object -Last 1 + echo "$LinkPath" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Install ${{ matrix.toolchain }} uses: 
actions-rs/toolchain@v1 with: profile: minimal - toolchain: ${{ matrix.toolchain }} + toolchain: stable + target: ${{ matrix.target }} override: true + default: true - name: Install cargo-c if: matrix.conf == 'cargo-c' run: | - $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.7.1" + $LINK = "https://github.com/lu-zero/cargo-c/releases/download/v0.7.3" $CARGO_C_FILE = "cargo-c-windows-msvc" curl -LO "$LINK/$CARGO_C_FILE.zip" 7z e -y "$CARGO_C_FILE.zip" -o"${env:USERPROFILE}\.cargo\bin" @@ -465,17 +464,17 @@ jobs: continue-on-error: true with: path: ~/.cargo/registry/cache - key: ${{ runner.os }}-${{ matrix.conf }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} + key: ${{ runner.os }}-${{ matrix.conf }}-${{ matrix.target }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} restore-keys: | - ${{ runner.os }}-${{ matrix.conf }}-cargo-registry- + ${{ runner.os }}-${{ matrix.conf }}-${{ matrix.target }}-cargo-registry- - name: Cache sccache output uses: actions/cache@v2 continue-on-error: true with: path: C:\sccache - key: ${{ runner.os }}-${{ matrix.conf }}-sccache-${{ hashFiles('**/Cargo.*') }} + key: ${{ runner.os }}-${{ matrix.conf }}-${{ matrix.target }}-sccache-${{ hashFiles('**/Cargo.*') }} restore-keys: | - ${{ runner.os }}-${{ matrix.conf }}-sccache- + ${{ runner.os }}-${{ matrix.conf }}-${{ matrix.target }}-sccache- - name: Start sccache server run: | sccache --start-server From fe05c433f3968601d54d50766ac534eb28ff7199 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Sat, 13 Mar 2021 12:07:19 +0100 Subject: [PATCH 153/155] Remove a stray import --- src/api/channel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/channel.rs b/src/api/channel.rs index dc04b3a859..217a128c64 100644 --- a/src/api/channel.rs +++ b/src/api/channel.rs @@ -20,7 +20,7 @@ use crossbeam::channel::*; use crate::encoder::*; use crate::frame::*; use crate::rate::RCState; -use crate::rayon::{ThreadPool, ThreadPoolBuilder}; +use crate::rayon::ThreadPool; use crate::util::Pixel; use std::io; From 832bcb6f572b7a34308938e9b425ab15181476d2 Mon Sep 17 00:00:00 2001 From: Zen Date: Sat, 13 Mar 2021 00:21:09 +0200 Subject: [PATCH 154/155] Document downscaled() --- v_frame/src/plane.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/v_frame/src/plane.rs b/v_frame/src/plane.rs index 99c8cc0ad8..f1dd8fe06b 100644 --- a/v_frame/src/plane.rs +++ b/v_frame/src/plane.rs @@ -424,6 +424,9 @@ impl Plane { } } + /// Returns plane with half the resolution for width and height. + /// Downscaled with 2x2 box filter. + /// Padded to dimensions with frame_width and frame_height. pub fn downsampled( &self, frame_width: usize, frame_height: usize, ) -> Plane { From 87c0297c631cf40bc1e9b03e8130928bd8e223b7 Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Fri, 12 Mar 2021 15:47:38 +0100 Subject: [PATCH 155/155] Bump pretty_assertion --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ff8777f62b..65b87ef482 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,7 +114,7 @@ signal-hook = { version = "0.3", optional = true } [dev-dependencies] assert_cmd = "1.0" criterion = "0.3" -pretty_assertions = "0.6" +pretty_assertions = "0.7" interpolate_name = "0.2.2" rand = "0.8" rand_chacha = "0.3"
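For reference, the 2x2 box filter mentioned in the `downsampled()` doc
comment above simply averages each 2x2 group of source pixels into one
output pixel. A minimal, illustrative sketch (not the actual
`Plane::downsampled()` implementation, which operates on `Plane<T>` and
also pads the result to the given frame dimensions):

    // Average each 2x2 block of a w x h, row-major buffer into a single
    // pixel with round-to-nearest; the output is (w/2) x (h/2).
    fn box_downsample_2x2(src: &[u16], w: usize, h: usize) -> Vec<u16> {
      let (dw, dh) = (w / 2, h / 2);
      let mut dst = vec![0u16; dw * dh];
      for y in 0..dh {
        for x in 0..dw {
          let s = |dx: usize, dy: usize| {
            u32::from(src[(2 * y + dy) * w + (2 * x + dx)])
          };
          dst[y * dw + x] =
            ((s(0, 0) + s(1, 0) + s(0, 1) + s(1, 1) + 2) >> 2) as u16;
        }
      }
      dst
    }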