Skip to content

Commit

Permalink
Optimize LZ4W decompressor (Stephane-D#393)
Browse files Browse the repository at this point in the history
- Clarified that D0 returns the unpacked size (might not be immediately obvious to some)
- Removed the tst.b d1 instruction (folded into the add.w d1,d1 that was required anyway, which sets the zero flag itself)
- Fixed the unpacked size reported by lz4w_unpack_a, at the cost of 4 extra bytes on the stack
- Relocated .next and .done so they can be reached with .s branches rather than .w
- Accounted for all branches to .next leaving d1 clear
  • Loading branch information
RealMalachi authored Mar 4, 2025
1 parent 2dd23eb commit 847c9cc
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 33 deletions.
Binary file modified lib/libmd.a
Binary file not shown.
Binary file modified lib/libmd_debug.a
Binary file not shown.
68 changes: 35 additions & 33 deletions src/tools_a.s
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ partition:
;// Size optimized (164 bytes) by Franck "hitchhikr" Charlet.
;// More optimizations by r57shell.
;//
;// aplib_decrunch: A0 = Source / A1 = Destination / Returns unpacked size
;// aplib_decrunch: A0 = Source / A1 = Destination / D0 Returns unpacked size
;// u32 aplib_unpack(u8 *src, u8 *dest); /* c prototype */
;//
;// -------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -252,9 +252,9 @@ aplib_decrunch:
;// ---------------------------------------------------------------------------
;// LZ4W unpacker for MC68000
;// by Stephane Dallongeville @2017
;// decomp code tweaked by HpMan
;// decomp code tweaked by HpMan, optimized further by Malachi
;//
;// lz4w_unpack_a: A0 = Source / A1 = Destination / Returns unpacked size
;// lz4w_unpack_a: A0 = Source / A1 = Destination / D0 Returns unpacked size
;// u16 lz4w_unpack(const u8 *src, u8 *dest); /* c prototype */
;// ---------------------------------------------------------------------------

Expand All @@ -271,15 +271,12 @@ aplib_decrunch:
.endm

func lz4w_unpack
move.l 4(%sp),%a0 ;// a0 = src
move.l 8(%sp),%a1 ;// a1 = dst
movem.l 4(%sp),%a0-%a1 ;// a0 = src, // a1 = dst

lz4w_unpack_a:
movem.l a2-a4, -(sp)
movem.l a1-a4, -(sp) ;// save dst for lz4w_unpack_a

lea .jump_table(pc), a3 ;// for LZ4W_NEXT macro

.next:
LZ4W_NEXT

.jump_table:
Expand Down Expand Up @@ -322,7 +319,19 @@ lz4w_unpack_a:
.lm_len_00:
move.w (a2)+, (a1)+
move.w (a2)+, (a1)+
LZ4W_NEXT
;// .next was moved here so that it is within .s branching range.
;// Additionally, all branches to .next arrive with d1 already cleared.
;// The easiest way to take advantage of that is to inline the macro.
moveq #0, d1
.next:
moveq #0, d0
move.b (a0)+, d0 ;// d0 = literal & match length
move.b (a0)+, d1 ;// d1 = match offset

add.w d0, d0
add.w d0, d0
move.l (a3,d0.w), a4
jmp (a4)

.litE_mat0: move.l (a0)+, (a1)+
.litC_mat0: move.l (a0)+, (a1)+
Expand All @@ -331,14 +340,12 @@ lz4w_unpack_a:
.lit6_mat0: move.l (a0)+, (a1)+
.lit4_mat0: move.l (a0)+, (a1)+
.lit2_mat0: move.l (a0)+, (a1)+
tst.b d1 ;// match offset null ?
beq .next ;// not a long match
add.w d1, d1 ;// len = len * 2, match offset null ?
beq.s .next ;// not a long match

.long_match_1:
move.w (a0)+, d0 ;// get long offset (already negated)

add.w d1, d1 ;// len = len * 2

add.w d0, d0 ;// bit 15 contains ROM source info
bcs.s .lm_rom

Expand All @@ -355,15 +362,12 @@ lz4w_unpack_a:
.lit5_mat0: move.l (a0)+, (a1)+
.lit3_mat0: move.l (a0)+, (a1)+
.lit1_mat0: move.w (a0)+, (a1)+

tst.b d1 ;// match offset null ?
beq .next ;// not a long match
add.w d1, d1 ;// len = len * 2, match offset null ?
beq.s .next ;// not a long match

.long_match_2:
move.w (a0)+, d0 ;// get long offset (already negated)

add.w d1, d1 ;// len = len * 2

add.w d0, d0 ;// bit 15 contains ROM source info
bcs.s .lm_rom

Expand All @@ -373,14 +377,12 @@ lz4w_unpack_a:
jmp (a4)

.lit0_mat0: ;// special case of lit=0 and mat=0
tst.b d1 ;// match offset null ?
beq .done ;// not a long match --> done
add.w d1, d1 ;// len = len * 2, match offset null ?
beq.s .done ;// not a long match --> done

.long_match_3:
move.w (a0)+, d0 ;// get long offset (already negated)

add.w d1, d1 ;// len = len * 2

add.w d0, d0 ;// bit 15 contains ROM source info
bcs.s .lm_rom

Expand All @@ -395,6 +397,17 @@ lz4w_unpack_a:
move.l .lmr_jump_table(pc,d1.w), a4
jmp (a4)

.done:
move.w (a0)+, d0 ;// need to copy a last byte ?
bpl.s .no_byte
move.b d0, (a1)+ ;// copy last byte
.no_byte:
move.l a1, d0
sub.l (sp)+, d0 ;// return op - dest

movem.l (sp)+, a2-a4
rts

.lmr_jump_table:
.long .lmr_len_00-0x00, .lmr_len_01-0x00, .lmr_len_00-0x02, .lmr_len_01-0x02, .lmr_len_00-0x04, .lmr_len_01-0x04, .lmr_len_00-0x06, .lmr_len_01-0x06, .lmr_len_00-0x08, .lmr_len_01-0x08, .lmr_len_00-0x0a, .lmr_len_01-0x0a, .lmr_len_00-0x0c, .lmr_len_01-0x0c, .lmr_len_00-0x0e, .lmr_len_01-0x0e
.long .lmr_len_00-0x10, .lmr_len_01-0x10, .lmr_len_00-0x12, .lmr_len_01-0x12, .lmr_len_00-0x14, .lmr_len_01-0x14, .lmr_len_00-0x16, .lmr_len_01-0x16, .lmr_len_00-0x18, .lmr_len_01-0x18, .lmr_len_00-0x1a, .lmr_len_01-0x1a, .lmr_len_00-0x1c, .lmr_len_01-0x1c, .lmr_len_00-0x1e, .lmr_len_01-0x1e
Expand Down Expand Up @@ -724,14 +737,3 @@ lz4w_unpack_a:
.lit3_matF: move.l (a0)+, (a1)+
.lit1_matF: move.w (a0)+, (a1)+
COPY_MATCH 15

.done:
move.w (a0)+, d0 ;// need to copy a last byte ?
bpl.s .no_byte
move.b d0, (a1)+ ;// copy last byte
.no_byte:
move.l a1, d0
sub.l 20(sp), d0 ;// return op - dest

movem.l (sp)+, a2-a4
rts

0 comments on commit 847c9cc

Please sign in to comment.