diff --git a/RBFilter_AVX2 pp.cpp b/RBFilter_AVX2 pp.cpp deleted file mode 100644 index bbbf73c..0000000 --- a/RBFilter_AVX2 pp.cpp +++ /dev/null @@ -1,1259 +0,0 @@ -#include "stdafx.h" -#include "RBFilter_AVX2.h" -#include -#include -#include -#include -#include -#include -#include -#include - - -#define MAX_RANGE_TABLE_SIZE 255 -#define ALIGN_SIZE 32 - -// only 1 of following 2 should be defined -#define EDGE_COLOR_USE_MAXIMUM -//#define EDGE_COLOR_USE_ADDITION - -// if EDGE_COLOR_USE_MAXIMUM is defined, then edge color detection works by calculating -// maximum difference among 3 components (RGB) of 2 colors, which tends to result in lower differences (since only largest among 3 is selected) -// if EDGE_COLOR_USE_ADDITION is defined, then edge color detection works by calculating -// sum of all 3 components, while enforcing 255 maximum. This method is much more sensitive to small differences - -#if defined(EDGE_COLOR_USE_MAXIMUM) && defined(EDGE_COLOR_USE_ADDITION) -#error Only 1 of those can be defined -#endif - -#if !defined(EDGE_COLOR_USE_MAXIMUM) && !defined(EDGE_COLOR_USE_ADDITION) -#error 1 of those must be defined -#endif - -CRBFilterAVX2::CRBFilterAVX2() -{ - m_range_table = new float[MAX_RANGE_TABLE_SIZE + 1]; - memset(m_range_table, 0, (MAX_RANGE_TABLE_SIZE + 1) * sizeof(float)); -} - -CRBFilterAVX2::~CRBFilterAVX2() -{ - release(); - - delete[] m_range_table; -} - -bool CRBFilterAVX2::initialize(int width, int height, int thread_count, bool pipelined) -{ - // basic sanity check, not strict - if (width < 16 || width > 10000) - return false; - - if (height < 2 || height > 10000) - return false; - - if (thread_count < 1 || thread_count > RBF_MAX_THREADS) - return false; - - release(); - - // round width up to nearest ALIGN_SIZE * thread_count - int round_up = ALIGN_SIZE * thread_count; - if (width % round_up) - { - width += round_up - width % round_up; - } - // round height to nearest even number - if (height & 1) - height++; - - m_reserved_width = width; - m_reserved_height = height; - m_thread_count = thread_count; - - m_stage_buffer[0] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); - if (!m_stage_buffer[0]) - return false; - - if (pipelined) - { - m_stage_buffer[1] = (unsigned char*)_aligned_malloc(m_reserved_width * m_reserved_height * 4, ALIGN_SIZE); - if (!m_stage_buffer[1]) - return false; - } - - ///////////////// - m_h_line_cache = new (std::nothrow) float*[m_thread_count]; - if (!m_h_line_cache) - return false; - - // zero just in case - for (int i = 0; i < m_thread_count; i++) - m_h_line_cache[i] = nullptr; - - for (int i = 0; i < m_thread_count; i++) - { - m_h_line_cache[i] = (float*)_aligned_malloc(m_reserved_width * 12 * sizeof(float) * 2 + 128, ALIGN_SIZE); - if (!m_h_line_cache[i]) - return false; - - // 1st 8 bytes of line cache should remain constant zero - memset(m_h_line_cache[i], 0, 8 * sizeof(float)); - } - - //////////////// - m_v_line_cache = new (std::nothrow) float*[m_thread_count]; - if (!m_v_line_cache) - return false; - - for (int i = 0; i < m_thread_count; i++) - m_v_line_cache[i] = nullptr; - - int v_line_size = (m_reserved_width * 16 * sizeof(float)) / m_thread_count; - for (int i = 0; i < m_thread_count; i++) - { - m_v_line_cache[i] = (float*)_aligned_malloc(v_line_size, ALIGN_SIZE); - if (!m_v_line_cache[i]) - return false; - } - - //////////////// - if (m_pipelined) - { - m_horizontal_thread = std::thread(&CRBFilterAVX2::threadFunctionHorizontal, this); - m_vertical_thread = std::thread(&CRBFilterAVX2::threadFunctionVertical, this); - //m_horizontal_threads = new std::thread[m_thread_count]; - //m_vertical_threads = new std::thread[m_thread_count]; - - //for (int i = 0; i < m_thread_count; i++) - //{ - // m_horizontal_threads[i] = std::thread(&CRBFilterAVX2::threadFunctionHorizontal, this, i); - // m_vertical_threads[i] = std::thread(&CRBFilterAVX2::threadFunctionVertical, this, i); - //} - } - - return true; -} - -void CRBFilterAVX2::release() -{ - if (m_stage_buffer[0]) - { - _aligned_free(m_stage_buffer[0]); - m_stage_buffer[0] = nullptr; - } - - if (m_stage_buffer[1]) - { - _aligned_free(m_stage_buffer[1]); - m_stage_buffer[1] = nullptr; - } - - if (m_h_line_cache) - { - for (int i = 0; i < m_thread_count; i++) - { - if (m_h_line_cache[i]) - _aligned_free(m_h_line_cache[i]); - } - delete[] m_h_line_cache; - m_h_line_cache = nullptr; - } - - if (m_v_line_cache) - { - for (int i = 0; i < m_thread_count; i++) - { - if (m_v_line_cache[i]) - _aligned_free(m_v_line_cache[i]); - } - delete[] m_v_line_cache; - m_v_line_cache = nullptr; - } - - //////////////// - if (m_pipelined) - { -// delete[] m_horizontal_threads; -// delete[] m_vertical_threads; - } - - m_reserved_width = 0; - m_reserved_height = 0; - m_thread_count = 0; - m_pipelined = false; - m_filter_counter = 0; -} - -int CRBFilterAVX2::getOptimalPitch(int width) const -{ - int round_up = ALIGN_SIZE * m_thread_count; - if (width % round_up) - { - width += round_up - width % round_up; - } - - return width * 4; -} - -void CRBFilterAVX2::setSigma(float sigma_spatial, float sigma_range) -{ - if (m_sigma_spatial != sigma_spatial || m_sigma_range != sigma_range) - { - m_sigma_spatial = sigma_spatial; - m_sigma_range = sigma_range; - - double alpha_f = (exp(-sqrt(2.0) / (sigma_spatial * 255.0))); - m_inv_alpha_f = (float)(1.0 - alpha_f); - double inv_sigma_range = 1.0 / (sigma_range * MAX_RANGE_TABLE_SIZE); - { - double ii = 0.f; - for (int i = 0; i <= MAX_RANGE_TABLE_SIZE; i++, ii -= 1.0) - { - m_range_table[i] = (float)(alpha_f * exp(ii * inv_sigma_range)); - } - } - } -} - -// example of edge color difference calculation from original implementation -// idea is to fit maximum edge color difference as single number in 0-255 range -// colors are added then 2 components are scaled 4x while 1 complement is scaled 2x -// this means 1 of the components is more dominant - -//int getDiffFactor(const unsigned char* color1, const unsigned char* color2) -//{ -// int c1 = abs(color1[0] - color2[0]); -// int c2 = abs(color1[1] - color2[1]); -// int c3 = abs(color1[2] - color2[2]); -// -// return ((c1 + c3) >> 2) + (c2 >> 1); -//} - - -inline void getDiffFactor3x(__m256i pix8, __m256i pix8p, __m256i* diff8x) -{ - __m256i byte_mask = _mm256_set1_epi32(255); - - // get absolute difference for each component per pixel - __m256i diff = _mm256_sub_epi8(_mm256_max_epu8(pix8, pix8p), _mm256_min_epu8(pix8, pix8p)); - -#ifdef EDGE_COLOR_USE_MAXIMUM - // get maximum of 3 components - __m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component - diff = _mm256_max_epu8(diff, diff_shift1); - diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component - diff = _mm256_max_epu8(diff, diff_shift1); - // skip alpha component - diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte -#endif - -#ifdef EDGE_COLOR_USE_ADDITION - // add all component differences and saturate - __m256i diff_shift1 = _mm256_srli_epi32(diff, 8); // 2nd component - diff = _mm256_adds_epu8(diff, diff_shift1); - diff_shift1 = _mm256_srli_epi32(diff_shift1, 8); // 3rd component - diff = _mm256_adds_epu8(diff, diff_shift1); - diff = _mm256_and_si256(diff, byte_mask); // zero out all but 1st byte -#endif - - _mm256_store_si256(diff8x, diff); -} - -void CRBFilterAVX2::horizontalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) -{ - // force height segments to be even cause this filter processes 2 lines at a time - int height_segment = (height / m_thread_count) & (~1); - int buffer_offset = thread_index * height_segment * pitch; - img_src += buffer_offset; - img_dst += buffer_offset; - - int width32 = pitch / 32; - - // last segment should account for uneven height - // since reserve buffer height is rounded up to even number, it's OK if source is uneven - // but that assumes hozitonal filter output buffer is the reservered buffer, or that destination is rounded up to even number - if (thread_index + 1 == m_thread_count) - height_segment = height - thread_index * height_segment; - -// float* alpha_cache_start = m_alpha_cache[thread_index]; - // cache line structure: - // 4 floats of alpha_f from line 1 - // 4 floats of alpha_f from line 2 - // 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 1 - // 4 floats of source color premultiplied with 'm_inv_alpha_f' from line 2 - // 4 floats of 1st pass result color from line 1 - // 4 floats of 1st pass result color from line 2 - float* line_cache = m_h_line_cache[thread_index]; - const float* range_table = m_range_table; - - __declspec(align(32)) long color_diff[16]; - - _mm256_zeroall(); - - __m256i mask_unpack = _mm256_setr_epi8(12, -1, -1, -1, // pixel 1 R - 13, -1, -1, -1, // pixel 1 G - 14, -1, -1, -1, // pixel 1 B - 15, -1, -1, -1, // pixel 1 A - 12, -1, -1, -1, // pixel 2 R - 13, -1, -1, -1, // pixel 2 G - 14, -1, -1, -1, // pixel 2 B - 15, -1, -1, -1);// pixel 2 A - - __m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1 - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2 - - __m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f); - - // process 2 horizontal lines at a time - for (int y = 0; y < height_segment; y+= 2) - { - __m256 alpha_prev = _mm256_set1_ps(1.f); - __m256 color_prev; - - - float* line_buffer = line_cache + 24 * pitch / 4; - // 1st line - int buffer_inc = (y + 1) * pitch - 32; - const __m256i* src1_8xCur = (const __m256i*)(img_src + buffer_inc); - const __m256i* src1_8xPrev = (const __m256i*)(img_src + buffer_inc + 4); - // 2nd line - buffer_inc += pitch; - const __m256i* src2_8xCur = (const __m256i*)(img_src + buffer_inc); - const __m256i* src2_8xPrev = (const __m256i*)(img_src + buffer_inc + 4); - - - ///////////////////////////// - // right to left pass - for (int x = 0; x < width32; x++) - { - __m256i pix8_1 = _mm256_load_si256(src1_8xCur--); - __m256i pix8p_1 = _mm256_loadu_si256(src1_8xPrev--); - getDiffFactor3x(pix8_1, pix8p_1, (__m256i*)color_diff); - - __m256i pix8_2 = _mm256_load_si256(src2_8xCur--); - __m256i pix8p_2 = _mm256_loadu_si256(src2_8xPrev--); - getDiffFactor3x(pix8_2, pix8p_2, (__m256i*)(color_diff + 8)); - - // last 4 pixels of 2 lines - __m256i pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 1 | (3 << 4)); - - //////////////////// - // pixel 1 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[7]]; - float alpha1_f = range_table[color_diff[7 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - if (x == 0) // have to initialize prev_color with last pixel color, this condition has no noticeable penalty - color_prev = pix2f; - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - //////////////////// - // pixel 2 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[6]]; - float alpha1_f = range_table[color_diff[6 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - - //////////////////// - // pixel 3 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[5]]; - float alpha1_f = range_table[color_diff[5 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - //////////////////// - // pixel 4 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[4]]; - float alpha1_f = range_table[color_diff[4 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - // next 4 pixels of 2 lines - pix8 = _mm256_permute2f128_si256(pix8_1, pix8_2, 2 << 4); - - - //////////////////// - // pixel 5 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[3]]; - float alpha1_f = range_table[color_diff[3 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - - //////////////////// - // pixel 6 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[2]]; - float alpha1_f = range_table[color_diff[2 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - - //////////////////// - // pixel 7 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[1]]; - float alpha1_f = range_table[color_diff[1 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - - //////////////////// - // pixel 8 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[0]]; - float alpha1_f = range_table[color_diff[0 + 8]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - _mm256_store_ps(line_buffer, alpha_f_8x); // cache weights - - // source pixel - pix8 = _mm256_slli_si256(pix8, 4); // shift left to unpack next pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); // extracts 1 pixel components from BYTE to DWORD - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - pix2f = _mm256_mul_ps(pix2f, inv_alpha); // pre-multiply source color - _mm256_store_ps(line_buffer + 8, pix2f); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - _mm256_store_ps(line_buffer + 16, out_color); // cache final color - line_buffer -= 24; - } - - - } - - ///////////////////////////// - // left to right pass - __m256i* dst1_pix8 = (__m256i*)(img_dst + y * pitch); - __m256i* dst2_pix8 = (__m256i*)(img_dst + (y + 1) * pitch); - - for (int x = 0; x < width32; x++) - { - __m256i result1; - __m256i result2; - - ///////////// - // 1st 4 pixels - // pixel 1 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // first pixel in line needs to initialize color_prev to original source color - if (x == 0) - color_prev = _mm256_div_ps(pix2f, inv_alpha); // source color was premultiplied - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - result1 = _mm256_shuffle_epi8(pix2i, mask_pack); - } - - - // pixel 2 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result1 = _mm256_srli_si256(result1, 4); // shift - result1 = _mm256_or_si256(result1, pix2i); // combine - } - - // pixel 3 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result1 = _mm256_srli_si256(result1, 4); // shift - result1 = _mm256_or_si256(result1, pix2i); // combine - } - - // pixel 4 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result1 = _mm256_srli_si256(result1, 4); // shift - result1 = _mm256_or_si256(result1, pix2i); // combine - } - - // next 4 pixels packed in result2 - // pixel 5 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - result2 = _mm256_shuffle_epi8(pix2i, mask_pack); - } - - // pixel 6 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result2 = _mm256_srli_si256(result2, 4); // shift - result2 = _mm256_or_si256(result2, pix2i); // combine - } - - // pixel 7 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result2 = _mm256_srli_si256(result2, 4); // shift - result2 = _mm256_or_si256(result2, pix2i); // combine - } - - // pixel 8 - { - // alpha - __m256 alpha_f_8x = _mm256_load_ps(line_buffer); - line_buffer += 24; - - // get pre-multiplied source color - __m256 pix2f = _mm256_load_ps(line_buffer + 8); - - // filter - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // final color - __m256 out_color = _mm256_div_ps(color_prev, alpha_prev); // get final color - - // get final color from previous pass - __m256 pix2f_p = _mm256_load_ps(line_buffer + 16); - out_color = _mm256_add_ps(out_color, pix2f_p); // combine it with current final color - __m256i pix2i = _mm256_cvtps_epi32(out_color); // covert to integer - pix2i = _mm256_srli_epi32(pix2i, 1); // division by 2 - - // pack result - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - result2 = _mm256_srli_si256(result2, 4); // shift - result2 = _mm256_or_si256(result2, pix2i); // combine - } - - // separate packed results into lines - __m256i line1 = _mm256_permute2f128_si256(result1, result2, 2 << 4); - __m256i line2 = _mm256_permute2f128_si256(result1, result2, 1 | (3 << 4)); - - // store result - _mm256_store_si256(dst1_pix8++, line1); - _mm256_store_si256(dst2_pix8++, line2); - } - } - -} - - -void CRBFilterAVX2::verticalFilter(int thread_index, const unsigned char* img_src, unsigned char* img_dst, int width, int height, int pitch) -{ - int width_segment = width / m_thread_count; - // make sure width segments round to 32 byte boundary - width_segment -= width_segment % 8; - int start_offset = width_segment * thread_index; - if (thread_index == m_thread_count - 1) // last one - { - width_segment = getOptimalPitch(width) / 4 - start_offset; - } - - int width8 = width_segment / 8; - - // adjust img buffer starting positions - img_src += start_offset * 4; - img_dst += start_offset * 4; - - float* line_cache = m_v_line_cache[thread_index]; - const float* range_table = m_range_table; - - _mm256_zeroall(); - - __m256 inv_alpha = _mm256_set1_ps(m_inv_alpha_f); - - __m256i mask_pack = _mm256_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, // pixel 1 - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12); // pixel 2 - - __m256i mask_unpack = _mm256_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, // pixel 1 - 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); // pixel 2 - - // used to store maximum difference between 2 pixels - __declspec(align(32)) long color_diff[8]; - - ///////////////// - // Bottom to top pass first - { - // last line processed separately since no previous - { - float* line_buffer = line_cache; - __m256i* dst_buf = (__m256i*)(img_dst + (height - 1) * pitch); - __m256i* src_8xCur = (__m256i*)(img_src + (height - 1) * pitch); - - __m256 one = _mm256_set1_ps(1.f); - - for (int x = 0; x < width8; x++) - { - __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel - _mm256_store_si256(dst_buf++, pix8); // copy to destination - - for (int i = 0; i < 4; i++) - { - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - pix8 = _mm256_srli_si256(pix8, 4); // shift right - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - _mm256_store_ps(line_buffer, one); - _mm256_store_ps(line_buffer + 8, pix2f); - - line_buffer += 16; - } - } - } - - // process other lines - for (int y = height - 2; y >= 0; y--) - { - float* line_buffer = line_cache; - __m256i* dst_buf = (__m256i*)(img_dst + y * pitch); - __m256i* src_8xCur = (__m256i*)(img_src + y * pitch); - __m256i* src_8xPrev = (__m256i*)(img_src + (y + 1) * pitch); - - for (int x = 0; x < width8; x++) - { - __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel - __m256i pix8p = _mm256_load_si256(src_8xPrev++); - __m256i pix_out; // final 8x packed pixels - - // get color differences - getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff); - - //////////////////// - // pixel 1, 5 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[0]]; - float alpha1_f = range_table[color_diff[4]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - - // load previous line color factor - __m256 alpha_prev = _mm256_load_ps(line_buffer); - // load previous line color - __m256 color_prev = _mm256_load_ps(line_buffer + 8); - - // unpack current source pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - // filter - pix2f = _mm256_mul_ps(pix2f, inv_alpha); - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // store current factor and color as previous for next cycle - _mm256_store_ps(line_buffer, alpha_prev); - _mm256_store_ps(line_buffer + 8, color_prev); - line_buffer += 16; - - // calculate final color - pix2f = _mm256_div_ps(color_prev, alpha_prev); - - // pack float pixel into byte pixel - pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer - pix_out = _mm256_shuffle_epi8(pix2i, mask_pack); - } - - // loop for other pixels - for(int i=1; i<4; i++) - { - // alpha factor - float alpha2_f = range_table[color_diff[i]]; - float alpha1_f = range_table[color_diff[i+4]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - - // load previous line color factor - __m256 alpha_prev = _mm256_load_ps(line_buffer); - // load previous line color - __m256 color_prev = _mm256_load_ps(line_buffer + 8); - - // unpack current source pixel - pix8 = _mm256_srli_si256(pix8, 4); // shift right - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - // filter - pix2f = _mm256_mul_ps(pix2f, inv_alpha); - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // store current factor and color as previous for next cycle - _mm256_store_ps(line_buffer, alpha_prev); - _mm256_store_ps(line_buffer + 8, color_prev); - line_buffer += 16; - - // calculate final color - pix2f = _mm256_div_ps(color_prev, alpha_prev); - - // pack float pixel into byte pixel - pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - pix_out = _mm256_srli_si256(pix_out, 4); // shift - pix_out = _mm256_or_si256(pix_out, pix2i); // combine - } - - // store result - _mm256_store_si256(dst_buf++, pix_out); - } - } - } - - ///////////////// - // Top to bottom pass last - { - // first line processed separately since no previous - { - float* line_buffer = line_cache; - __m256i* dst_line = (__m256i*)img_dst; - __m256i* src_8xCur = (__m256i*)img_src; - - __m256 one = _mm256_set1_ps(1.f); - - for (int x = 0; x < width8; x++) - { - __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel - __m256i pix8_d = _mm256_load_si256(dst_line); - pix8_d = _mm256_avg_epu8(pix8_d, pix8); // average out - _mm256_store_si256(dst_line++, pix8_d); - - for (int i = 0; i < 4; i++) - { - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - pix8 = _mm256_srli_si256(pix8, 4); // shift right - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - _mm256_store_ps(line_buffer, one); - _mm256_store_ps(line_buffer + 8, pix2f); - - line_buffer += 16; - } - } - } - - // process other lines - for (int y = 1; y < height; y++) - { - float* line_buffer = line_cache; - __m256i* dst_buf = (__m256i*)(img_dst + y * pitch); - __m256i* src_8xCur = (__m256i*)(img_src + y * pitch); - __m256i* src_8xPrev = (__m256i*)(img_src + (y - 1) * pitch); - - for (int x = 0; x < width8; x++) - { - __m256i pix8 = _mm256_load_si256(src_8xCur++); // load 8x pixel - __m256i pix8p = _mm256_load_si256(src_8xPrev++); - __m256i pix_out; // final 8x packed pixels - - // get color differences - getDiffFactor3x(pix8, pix8p, (__m256i*)color_diff); - - //////////////////// - // pixel 1, 5 unpack - { - // alpha factor - float alpha2_f = range_table[color_diff[0]]; - float alpha1_f = range_table[color_diff[4]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - - // load previous line color factor - __m256 alpha_prev = _mm256_load_ps(line_buffer); - // load previous line color - __m256 color_prev = _mm256_load_ps(line_buffer + 8); - - // unpack current source pixel - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - // filter - pix2f = _mm256_mul_ps(pix2f, inv_alpha); - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // store current factor and color as previous for next cycle - _mm256_store_ps(line_buffer, alpha_prev); - _mm256_store_ps(line_buffer + 8, color_prev); - line_buffer += 16; - - // calculate final color - pix2f = _mm256_div_ps(color_prev, alpha_prev); - - // pack float pixel into byte pixel - pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer - pix_out = _mm256_shuffle_epi8(pix2i, mask_pack); - } - - // loop for other pixels - for (int i = 1; i<4; i++) - { - // alpha factor - float alpha2_f = range_table[color_diff[i]]; - float alpha1_f = range_table[color_diff[i + 4]]; - __m256 alpha_f_8x = _mm256_set_ps(alpha1_f, alpha1_f, alpha1_f, alpha1_f, - alpha2_f, alpha2_f, alpha2_f, alpha2_f); - - // load previous line color factor - __m256 alpha_prev = _mm256_load_ps(line_buffer); - // load previous line color - __m256 color_prev = _mm256_load_ps(line_buffer + 8); - - // unpack current source pixel - pix8 = _mm256_srli_si256(pix8, 4); // shift right - __m256i pix2i = _mm256_shuffle_epi8(pix8, mask_unpack); - __m256 pix2f = _mm256_cvtepi32_ps(pix2i); // convert to floats - - // filter - pix2f = _mm256_mul_ps(pix2f, inv_alpha); - alpha_prev = _mm256_fmadd_ps(alpha_prev, alpha_f_8x, inv_alpha); // filter factor - color_prev = _mm256_fmadd_ps(color_prev, alpha_f_8x, pix2f); // filter color - - // store current factor and color as previous for next cycle - _mm256_store_ps(line_buffer, alpha_prev); - _mm256_store_ps(line_buffer + 8, color_prev); - line_buffer += 16; - - // calculate final color - pix2f = _mm256_div_ps(color_prev, alpha_prev); - - // pack float pixel into byte pixel - pix2i = _mm256_cvtps_epi32(pix2f); // convert to integer - pix2i = _mm256_shuffle_epi8(pix2i, mask_pack); - pix_out = _mm256_srli_si256(pix_out, 4); // shift - pix_out = _mm256_or_si256(pix_out, pix2i); // combine - } - - // average result with previous values in destination buffer - __m256i pix8_d = _mm256_load_si256(dst_buf); - pix_out = _mm256_avg_epu8(pix8_d, pix_out); - _mm256_store_si256(dst_buf++, pix_out); - } - } - } -} - -bool CRBFilterAVX2::filter(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) -{ - // basic error checking - if (!m_stage_buffer[0]) - return false; - - if (width < 32 || width > m_reserved_width) - return false; - - if (height < 16 || height > m_reserved_height) - return false; - - if (pitch < width * 4) - return false; - - if (!out_data || !in_data) - return false; - - if (m_inv_alpha_f == 0.f) - return false; - - int thread_count_adjusted = m_thread_count - 1; - - ////////////////////////////////////////////// - // horizontal filter divided in threads - for (int i = 0; i < thread_count_adjusted; i++) - { - m_horizontal_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::horizontalFilter, this, i, in_data, m_stage_buffer[0], width, height, pitch); - } - - // use this thread for last segment - horizontalFilter(thread_count_adjusted, in_data, m_stage_buffer[0], width, height, pitch); - - // wait for result - for (int i = 0; i < thread_count_adjusted; i++) - { - m_horizontal_tasks[i].get(); - } - - ///////////////////////////////////////////// - // vertical filter divided in threads - for (int i = 0; i < thread_count_adjusted; i++) - { - m_vertical_tasks[i] = std::async(std::launch::async, &CRBFilterAVX2::verticalFilter, this, i, m_stage_buffer[0], out_data, width, height, pitch); - } - - // use this thread for last segment - verticalFilter(thread_count_adjusted, m_stage_buffer[0], out_data, width, height, pitch); - - // wait for result - for (int i = 0; i < thread_count_adjusted; i++) - { - m_vertical_tasks[i].get(); - } - - return true; -} - -bool CRBFilterAVX2::filterPipePush(unsigned char* out_data, const unsigned char* in_data, int width, int height, int pitch) -{ - // basic error checking - if (!m_stage_buffer[0]) - return false; - - if (width < 16 || width > m_reserved_width) - return false; - - if (height < 16 || height > m_reserved_height) - return false; - - if (pitch < width * 4) - return false; - - if (m_inv_alpha_f == 0.f) - return false; - - { - std::lock_guard lk(m_mutex_stage1); - - m_new_data_width = width; - m_new_data_height = height; - m_new_data_pitch = pitch; - m_new_data_in_buffer = in_data; - m_new_data_out_buffer = out_data; - m_new_data_stage_buffer = m_stage_buffer[m_submitted_image_count % 2]; - m_submitted_image_count++; - - m_has_new_data_stage1 = true; - } - - m_condition1.notify_one(); - - return true; -} - -void CRBFilterAVX2::filterPipeFlush() -{ - //filterPipePush(nullptr, nullptr, m_image_width, m_image_height, m_image_pitch); - - //if (m_filter_counter > 0) - //{ - // for (int i = 0; i < m_thread_count; i++) - // { - // if(m_vertical_tasks[i].valid()) - // m_vertical_tasks[i].get(); - // } - //} - - //m_filter_counter = 0; -} - -void CRBFilterAVX2::threadFunctionHorizontal() -{ - // Wait until filterPipePush() sends data - { - std::unique_lock lk(m_mutex_stage1); - bool& new_data = m_has_new_data_stage1; - m_condition1.wait(lk, [&new_data] {return new_data; }); - - int width = m_new_data_width; - int height = m_new_data_height; - int pitch = m_new_data_pitch; - const unsigned char* in_buffer = m_new_data_in_buffer; - unsigned char* out_buffer = m_new_data_out_buffer; - unsigned char* stage_buffer = m_new_data_stage_buffer; - - m_has_new_data_stage1 = false; - } - m_condition1.notify_one(); -} - -void CRBFilterAVX2::threadFunctionVertical() -{ - // wait until horizontal pass sumbits work - { - std::unique_lock lk(m_mutex_stage2); - bool& new_data = m_has_new_data_stage2; - m_condition2.wait(lk, [&new_data] {return new_data; }); - - int width = m_stage2_width; - int height = m_stage2_height; - int pitch = m_stage2_pitch; - const unsigned char* in_buffer = m_stage2_in_buffer; - unsigned char* out_buffer = m_stage2_out_buffer; - } - m_condition2.notify_one(); -} \ No newline at end of file