Skip to content

Commit

Permalink
Merge pull request pygame-community#2896 from Starbuck5/add-missing-duff-loop-check
Browse files Browse the repository at this point in the history

Add missing check in SSE2 alpha blitter
  • Loading branch information
itzpr3d4t0r authored Jun 2, 2024
2 parents c178856 + 89065d7 commit bc0902f
Showing 1 changed file with 57 additions and 47 deletions.
104 changes: 57 additions & 47 deletions src_c/simd_blitters_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -589,73 +589,83 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
srcp128 = (__m128i *)srcp32;
dstp128 = (__m128i *)dstp32;

LOOP_UNROLLED4(
{
/* ==== load 4 pixels into SSE registers ==== */
if (n_iters_4) {
LOOP_UNROLLED4(
{
/* ==== load 4 pixels into SSE registers ==== */

/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
pixels_src = _mm_loadu_si128(srcp128);
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
pixels_src = _mm_loadu_si128(srcp128);

/* isolate alpha channels
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);
/* isolate alpha channels
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] ->
* mm_src_alpha*/
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);

/* shift right to position alpha channels for manipulation
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
/* shift right to position alpha channels for manipulation
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] ->
* mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);

/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
pixels_dst = _mm_loadu_si128(dstp128);
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
pixels_dst = _mm_loadu_si128(dstp128);

/* ==== BATCH A (the 2 low pixels) ==== */
/* ==== BATCH A (the 2 low pixels) ==== */

/* shuffle alpha channels to duplicate 16 bit pairs
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
/* shuffle alpha channels to duplicate 16 bit pairs
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] ->
* mm_src_alpha*/
unpacked_alpha =
_mm_shufflelo_epi16(mm_src_alpha, 0b11110101);

/* spread alpha into final config for 16 bit math
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);
/* spread alpha into final config for 16 bit math
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] ->
* unpacked_alpha*/
unpacked_alpha =
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);

/* 0A0R0G0B0A0R0G0B -> src1 */
src1 = _mm_unpacklo_epi8(pixels_src, mm_zero);
/* 0A0R0G0B0A0R0G0B -> src1 */
src1 = _mm_unpacklo_epi8(pixels_src, mm_zero);

/* 0A0R0G0B0A0R0G0B -> dst1 */
dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero);
/* 0A0R0G0B0A0R0G0B -> dst1 */
dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero);

ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

batch_a_dst = sub_dst;
batch_a_dst = sub_dst;

/* ==== BATCH B (the 2 high pixels) ==== */
/* ==== BATCH B (the 2 high pixels) ==== */

/*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);
/*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] ->
* unpacked_alpha*/
unpacked_alpha =
_mm_shufflehi_epi16(mm_src_alpha, 0b11110101);

/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);
/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] ->
* unpacked_alpha*/
unpacked_alpha =
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);

/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);

/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);

ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

/* ==== combine batches and store ==== */
/* ==== combine batches and store ==== */

sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
/* zero out alpha */
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
_mm_storeu_si128(dstp128, sub_dst);
sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
/* zero out alpha */
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
_mm_storeu_si128(dstp128, sub_dst);

srcp128++;
dstp128++;
},
n, n_iters_4);
srcp128++;
dstp128++;
},
n, n_iters_4);
}

srcp32 = (Uint32 *)srcp128;
dstp32 = (Uint32 *)dstp128;
Expand Down

0 comments on commit bc0902f

Please sign in to comment.