; arith.a


!source "zeropage.a"

; --------------------------------------------------
; load16BitImmediate stores the 16 bit constant .val at the memory location
; .addr in little endian order (low byte at .addr, high byte at .addr+1).
;
; Clobbers: A, N and Z flags (A holds the high byte of .val afterwards).
; --------------------------------------------------
!macro load16BitImmediate .val, .addr {
    lda #<.val                     ; low byte of the constant
    sta .addr
    lda #>.val                     ; high byte of the constant
    sta .addr+1
}

; --------------------------------------------------
; asl16Bit multiplies the 16 bit value stored at .memAddr by 2 (shift left
; by one bit). Bit 15 of the original value ends up in the carry flag.
; --------------------------------------------------
!macro asl16Bit .memAddr {
    asl .memAddr                   ; low byte: bit 7 goes to carry, 0 enters bit 0
    rol .memAddr+1                 ; high byte: carry enters bit 0
}


; --------------------------------------------------
; rol16Bit rotates the 16 bit value stored at .memAddr one bit to the left.
; The incoming carry flag becomes bit 0, the former bit 15 ends up in the carry.
; --------------------------------------------------
!macro rol16Bit .memAddr {
    rol .memAddr                   ; low byte first, carry in -> bit 0
    rol .memAddr+1                 ; bit 7 of low byte -> bit 0 of high byte
}


; --------------------------------------------------
; lsr16Bit divides the 16 bit value stored at .memAddr by 2 (shift right
; by one bit). Bit 0 of the original value ends up in the carry flag.
; --------------------------------------------------
!macro lsr16Bit .memAddr {
    lsr .memAddr+1                 ; high byte first: 0 enters bit 7, bit 0 goes to carry
    ror .memAddr                   ; carry enters bit 7 of the low byte
}

; --------------------------------------------------
; ror16Bit rotates the 16 bit value stored at .memAddr one bit to the right.
; The incoming carry flag becomes bit 15, the former bit 0 ends up in the carry.
; --------------------------------------------------
!macro ror16Bit .memAddr {
    ror .memAddr+1                 ; high byte first, carry in -> bit 7
    ror .memAddr                   ; bit 0 of high byte -> bit 7 of low byte
}


; --------------------------------------------------
; add16Bit implements a 16 bit add of the values stored at .memAddr1 and .memAddr2.
; The result is stored in .memAddr2, i.e. *memAddr2 <- *memAddr1 + *memAddr2.
;
; The carry flag is cleared before the addition and contains the carry out
; of bit 15 afterwards. Clobbers A.
; --------------------------------------------------
!macro add16Bit .memAddr1, .memAddr2 {
    clc
    ; add lo bytes
    lda .memAddr1
    adc .memAddr2
    sta .memAddr2
    ; add hi bytes, including the carry of the lo byte addition
    lda .memAddr1+1
    adc .memAddr2+1
    sta .memAddr2+1
}


; --------------------------------------------------
; sub16Bit subtracts the value stored at .memAddr1 from the value stored at the
; address .memAddr2. The result is stored in .memAddr2, i.e.
; *memAddr2 <- *memAddr2 - *memAddr1.
;
; The carry flag is set before the subtraction. Afterwards a clear carry
; signals a borrow (result wrapped around). Clobbers A.
; --------------------------------------------------
!macro sub16Bit .memAddr1, .memAddr2 {
    sec                            ; no borrow going in
    ; subtract lo bytes
    lda .memAddr2
    sbc .memAddr1
    sta .memAddr2
    ; subtract hi bytes, including the borrow of the lo byte subtraction
    lda .memAddr2+1
    sbc .memAddr1+1
    sta .memAddr2+1
}


; --------------------------------------------------
; move16Bit copies the 16 bit value stored at .memAddr1 to .memAddr2.
;
; Clobbers: A, N and Z flags.
; --------------------------------------------------
!macro move16Bit .memAddr1, .memAddr2 {
    ; copy lo byte
    lda .memAddr1
    sta .memAddr2
    ; copy hi byte
    lda .memAddr1+1
    sta .memAddr2+1
}


; --------------------------------------------------
; cmp16Bit compares the 16 bit values stored at .memAddr1 and .memAddr2
; as unsigned numbers.
;
; Z flag is set in case these values are equal. The carry flag is set if
; *memAddr1 >= *memAddr2 and clear otherwise. Clobbers A.
; --------------------------------------------------
!macro cmp16Bit .memAddr1, .memAddr2 {
    lda .memAddr1+1
    cmp .memAddr2+1
    bne .unequal                   ; hi bytes differ => flags of this compare are the result
    lda .memAddr1                  ; hi bytes are equal => lo bytes decide
    cmp .memAddr2
.unequal
}


; --------------------------------------------------
; cmp16BitImmediate compares the immediate 16 bit value given in .value with
; the 16 bit value stored at .memAddr (unsigned comparison).
; 
; Z flag is set in case these values are equal. Carry is set
; if .value is greater than or equal to the value stored at .memAddr.
; Clobbers A.
; --------------------------------------------------
!macro cmp16BitImmediate .value, .memAddr {
    lda #>.value
    cmp .memAddr+1
    bne .unequal2                  ; hi bytes differ => flags of this compare are the result
    lda #<.value                   ; hi bytes are equal => lo bytes decide
    cmp .memAddr
.unequal2
}

; --------------------------------------------------
; callFunc calls the subroutine .func with two operands: the address .addrL is
; stored in the pointer ARITH_SCRATCH1 (and ARITH_SCRATCH1+1) and the address
; .addrR is stored in the pointer ARITH_SCRATCH3 (and ARITH_SCRATCH3+1).
;
; Clobbers A before the call; everything else depends on .func.
; --------------------------------------------------
!macro callFunc .func, .addrL, .addrR {
    +load16BitImmediate .addrL, ARITH_SCRATCH1
    +load16BitImmediate .addrR, ARITH_SCRATCH3
    jsr .func
}

; --------------------------------------------------
; callFuncMono calls the subroutine .func with one operand: the address .addrL
; is stored in the pointer ARITH_SCRATCH1 (and ARITH_SCRATCH1+1).
;
; Clobbers A before the call; everything else depends on .func.
; --------------------------------------------------
!macro callFuncMono .func, .addrL {
    +load16BitImmediate .addrL, ARITH_SCRATCH1
    jsr .func
}

!ifdef FAST_MUL {
; Table based multiplication uses the identity xy = (x^2 + y^2 - (x-y)^2)/2
; The following tables contain the LSB and MSB of i^2 where i=0, ..., 255
;
; SQ_TAB_LSB: 256 entries, entry i holds the low byte of i^2 (i^2 mod 256).
; The second half of the table repeats the first half.
SQ_TAB_LSB
!byte $00, $01, $04, $09, $10, $19, $24, $31, $40, $51, $64, $79, $90, $A9, $C4, $E1
!byte $00, $21, $44, $69, $90, $B9, $E4, $11, $40, $71, $A4, $D9, $10, $49, $84, $C1
!byte $00, $41, $84, $C9, $10, $59, $A4, $F1, $40, $91, $E4, $39, $90, $E9, $44, $A1
!byte $00, $61, $C4, $29, $90, $F9, $64, $D1, $40, $B1, $24, $99, $10, $89, $04, $81
!byte $00, $81, $04, $89, $10, $99, $24, $B1, $40, $D1, $64, $F9, $90, $29, $C4, $61
!byte $00, $A1, $44, $E9, $90, $39, $E4, $91, $40, $F1, $A4, $59, $10, $C9, $84, $41
!byte $00, $C1, $84, $49, $10, $D9, $A4, $71, $40, $11, $E4, $B9, $90, $69, $44, $21
!byte $00, $E1, $C4, $A9, $90, $79, $64, $51, $40, $31, $24, $19, $10, $09, $04, $01
!byte $00, $01, $04, $09, $10, $19, $24, $31, $40, $51, $64, $79, $90, $A9, $C4, $E1
!byte $00, $21, $44, $69, $90, $B9, $E4, $11, $40, $71, $A4, $D9, $10, $49, $84, $C1
!byte $00, $41, $84, $C9, $10, $59, $A4, $F1, $40, $91, $E4, $39, $90, $E9, $44, $A1
!byte $00, $61, $C4, $29, $90, $F9, $64, $D1, $40, $B1, $24, $99, $10, $89, $04, $81
!byte $00, $81, $04, $89, $10, $99, $24, $B1, $40, $D1, $64, $F9, $90, $29, $C4, $61
!byte $00, $A1, $44, $E9, $90, $39, $E4, $91, $40, $F1, $A4, $59, $10, $C9, $84, $41
!byte $00, $C1, $84, $49, $10, $D9, $A4, $71, $40, $11, $E4, $B9, $90, $69, $44, $21
!byte $00, $E1, $C4, $A9, $90, $79, $64, $51, $40, $31, $24, $19, $10, $09, $04, $01

; SQ_TAB_MSB: 256 entries, entry i holds the high byte of i^2 (i^2 / 256).
SQ_TAB_MSB
!byte $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00, $00
!byte $01, $01, $01, $01, $01, $01, $01, $02, $02, $02, $02, $02, $03, $03, $03, $03
!byte $04, $04, $04, $04, $05, $05, $05, $05, $06, $06, $06, $07, $07, $07, $08, $08
!byte $09, $09, $09, $0A, $0A, $0A, $0B, $0B, $0C, $0C, $0D, $0D, $0E, $0E, $0F, $0F
!byte $10, $10, $11, $11, $12, $12, $13, $13, $14, $14, $15, $15, $16, $17, $17, $18
!byte $19, $19, $1A, $1A, $1B, $1C, $1C, $1D, $1E, $1E, $1F, $20, $21, $21, $22, $23
!byte $24, $24, $25, $26, $27, $27, $28, $29, $2A, $2B, $2B, $2C, $2D, $2E, $2F, $30
!byte $31, $31, $32, $33, $34, $35, $36, $37, $38, $39, $3A, $3B, $3C, $3D, $3E, $3F
!byte $40, $41, $42, $43, $44, $45, $46, $47, $48, $49, $4A, $4B, $4C, $4D, $4E, $4F
!byte $51, $52, $53, $54, $55, $56, $57, $59, $5A, $5B, $5C, $5D, $5F, $60, $61, $62
!byte $64, $65, $66, $67, $69, $6A, $6B, $6C, $6E, $6F, $70, $72, $73, $74, $76, $77
!byte $79, $7A, $7B, $7D, $7E, $7F, $81, $82, $84, $85, $87, $88, $8A, $8B, $8D, $8E
!byte $90, $91, $93, $94, $96, $97, $99, $9A, $9C, $9D, $9F, $A0, $A2, $A4, $A5, $A7
!byte $A9, $AA, $AC, $AD, $AF, $B1, $B2, $B4, $B6, $B7, $B9, $BB, $BD, $BE, $C0, $C2
!byte $C4, $C5, $C7, $C9, $CB, $CC, $CE, $D0, $D2, $D4, $D5, $D7, $D9, $DB, $DD, $DF
!byte $E1, $E2, $E4, $E6, $E8, $EA, $EC, $EE, $F0, $F2, $F4, $F6, $F8, $FA, $FC, $FE


; --------------------------------------------------
; mul16BitLookup multiplies the two unsigned bytes contained in accu and x register.
; The high byte of the 16 bit result is returned in accu, the lo byte in the x register.
; The three macro parameters specify three bytes of temporary memory used by
; the calculation.
;
; The basis of the speedup is the formula xy = (x^2 + y^2 - (x-y)^2)/2
; where the squares are read from the lookup tables SQ_TAB_LSB/SQ_TAB_MSB. According
; to the original author this routine seems to be twice as fast as the simple
; multiplication routine mul16BitShiftAdd which uses shift and add.
;
; Clobbers: Y, flags, .addr1, .addr2, .addr3.
; --------------------------------------------------
!macro mul16BitLookup .addr1, .addr2, .addr3 {
.multiply:
	sta .addr1
	cpx .addr1
	bcc .sorted                    ; X < A => A already holds the larger operand
	txa                            ; otherwise swap so that A >= X
	ldx .addr1
.sorted:
	sta .addr3                     ; .addr3 = larger operand
	stx .addr1                     ; .addr1 = smaller operand
	sec
	sbc .addr1                     ; A = larger - smaller, never borrows => carry stays set
	tay                            ; Y = difference of the operands
	ldx .addr3
	lda SQ_TAB_LSB,x               ; 16 bit subtraction: larger^2 - difference^2
	sbc SQ_TAB_LSB,y               ; carry is still set from the subtraction above
	sta .addr2
	lda SQ_TAB_MSB,x
	sbc SQ_TAB_MSB,y
	sta .addr3
	clc
	ldx .addr1
	lda .addr2                     ; 16 bit addition: + smaller^2
	adc SQ_TAB_LSB,x
	sta .addr2
	lda .addr3
	adc SQ_TAB_MSB,x               ; the sum is 2xy, its 17th bit is now in the carry
	ror                            ; divide the 17 bit value by 2: carry -> bit 7 of hi byte
	ror .addr2
	ldx .addr2                     ; return lo byte in X, hi byte is already in A
}

; --------------------------------------------------
; This subroutine (table lookup variant, assembled when FAST_MUL is defined) takes
; its two 8 bit operands in X and A and returns their 16 bit product.
; The high byte of the result is returned in accu, the lo byte in the X register.
;
; Uses LOOKUP_SCRATCH1, LOOKUP_SCRATCH2 and LOOKUP_SCRATCH3 (defined outside of
; this section) as temporary memory. Clobbers Y.
; --------------------------------------------------
mul16Bit
    +mul16BitLookup LOOKUP_SCRATCH1, LOOKUP_SCRATCH2, LOOKUP_SCRATCH3
    rts

} else {

; Work area of mul16BitShiftAdd: SCRATCH/SCRATCH+1 hold operand 1 as a 16 bit value
; that is shifted left in every round, SCRATCH+2 holds operand 2.
SCRATCH
!byte 0,0,0
; 16 bit result buffer passed to mul16BitShiftAdd by mul16Bit
SCRATCH_RES
!byte 0,0
; --------------------------------------------------
; mul16BitShiftAdd multiplies the two unsigned bytes contained in accu and x register
; using shift and add.
; The high byte of the result is returned in accu, the lo byte in the x register.
; The macro parameter .memRes specifies two bytes of memory in which the 16 bit
; result is accumulated.
;
; Clobbers: Y (counts the eight rounds), flags, SCRATCH .. SCRATCH+2.
; --------------------------------------------------
!macro mul16BitShiftAdd .memRes {
    ldy #0
    sty .memRes              ; clear result
    sty .memRes+1             
    sta SCRATCH              ; copy operand 1
    sty SCRATCH+1            ; clear hibyte
    stx SCRATCH+2            ; copy operand 2

.loop
    lsr SCRATCH+2            ; move next bit of operand 2 into the carry
    bcc .shift               ; bit is clear => nothing to add in this round
    +add16Bit SCRATCH, .memRes
.shift
    +asl16Bit SCRATCH        ; operand 1 * 2 for the next bit position
    iny
    cpy #8                   ; one round for each bit of operand 2
    bne .loop
    lda .memRes+1            ; return hi byte in accu
    ldx .memRes              ; and lo byte in X
}

; --------------------------------------------------
; This subroutine (shift and add variant, assembled when FAST_MUL is not defined)
; takes its two 8 bit operands in X and A and returns their 16 bit product.
; The high byte of the result is returned in accu, the lo byte in the X register.
;
; Uses SCRATCH and SCRATCH_RES as temporary memory. Clobbers Y.
; --------------------------------------------------
mul16Bit
    +mul16BitShiftAdd SCRATCH_RES
    rts
}


; --------------------------------------------------
; This subroutine takes its two 8 bit operands in the accu and the x register and divides
; them. The division result of A / X is returned in A and the remainder is returned in
; X. Code originally taken from
; 
; https://www.retro-programming.de/programming/assembler/fuer-fortgeschrittene/mul-div-ganzzahl/
;
; and modified by the author of this file, including a bug fix for all cases where the
; dividend is smaller than the divisor. In the original code the remainder was zero and
; not ZP_DIVIDEND in these cases.
;
; A division by zero executes a brk instruction.
; Uses ZP_DIVIDEND, ZP_DIVISOR, ZP_RESULT and ZP_HELP (defined outside of this
; section) as working memory. ZP_DIVIDEND is shifted out during the division.
; --------------------------------------------------
divMod8Bit
    ; setup state
    sta ZP_DIVIDEND
    stx ZP_DIVISOR
    lda #$00                       ; start with a zero
    sta ZP_RESULT                  ; in the result
    sta ZP_HELP                    ; and in the helper variable (running remainder)

    ; check for division by zero
    lda ZP_DIVISOR                 ; check whether we are asked to divide by zero
    beq .stopPrg                   ; In case divisor is zero stop program

    ;check if dividend < divisor
    lda ZP_DIVIDEND
    cmp ZP_DIVISOR                 ; dividend < divisor?
    bcc .dividendLtDivisor         ; yes => The result is obvious (A = 0, X = ZP_DIVIDEND)

    ; perform division
    ldx #$08                       ; eight loop iterations, one per bit
.loop
    asl ZP_DIVIDEND                ; shift the most significant bit into the carry flag
    rol ZP_HELP                    ; rotate the carry flag into the least significant bit
    lda ZP_HELP                    ; load ZP_HELP into the accu
    cmp ZP_DIVISOR                 ; check whether ZP_HELP is greater than or equal to the divisor
    bcc .skip                      ; if not -> continue at .skip
    sbc ZP_DIVISOR                 ; otherwise subtract the divisor from the accu (carry is set here)
    sta ZP_HELP                    ; and store the accu in ZP_HELP
.skip
    rol ZP_RESULT                  ; rotate the carry flag (the new quotient bit) into the result
    dex                            ; decrement loop counter
    bne .loop                      ; as long as it is greater than zero, again -> .loop
    lda ZP_RESULT                  ; return quotient in A
    ldx ZP_HELP                    ; and remainder in X
    rts
.dividendLtDivisor
    lda #0                         ; quotient is zero
    ldx ZP_DIVIDEND                ; remainder is the unchanged dividend
    rts
.stopPrg
    brk                            ; division by zero; also the branch target of divMod16Bit


; Operand and result buffers of divMod16Bit. All values are 16 bit, lo byte first.
div16_DIVIDEND
!byte 0,0
div16_DIVISOR
!byte 0,0
div16_RESULT
!byte 0,0
div16_REMAINDER
!byte 0,0
div16_HELP                         ; running remainder during the division
!byte 0,0

; --------------------------------------------------
; This subroutine takes its two 16 bit operands from the addresses div16_DIVIDEND and div16_DIVISOR 
; and divides them. The division result is returned in div16_RESULT and the remainder is returned in
; div16_REMAINDER.
;
; A division by zero branches to .stopPrg, which in this file is the brk instruction
; at the end of divMod8Bit. When the division loop is executed div16_DIVIDEND is
; shifted out and therefore destroyed. Clobbers A and X.
; --------------------------------------------------
divMod16Bit
    ; setup state
    +load16BitImmediate 0, div16_RESULT
    +load16BitImmediate 0, div16_HELP

    ; check for division by zero
    +cmp16BitImmediate 0, div16_DIVISOR
    beq .stopPrg

    ;check if dividend < divisor
    +cmp16Bit div16_DIVIDEND, div16_DIVISOR
    bcc .dividendTooSmall                ; yes => result stays 0, remainder is the dividend

    ; perform division
    ldx #16                              ; one iteration per bit of the dividend
.loop16
    +asl16Bit div16_DIVIDEND             ; most significant bit of the dividend -> carry
    +rol16Bit div16_HELP                 ; carry -> least significant bit of the running remainder
    +cmp16Bit div16_HELP, div16_DIVISOR
    bcc .skip16                          ; running remainder < divisor => quotient bit is 0
    +sub16Bit div16_DIVISOR, div16_HELP  ; leaves the carry set, as no borrow occurs
.skip16
    +rol16Bit div16_RESULT               ; rotate the quotient bit (carry) into the result
    dex                                 
    bne .loop16                         
    +move16Bit div16_HELP, div16_REMAINDER                  
    rts
.dividendTooSmall
    +move16Bit div16_DIVIDEND, div16_REMAINDER
    rts 

; --------------------------------------------------
; The 32 bit integer routines represent a number as 5 bytes. The first byte
; is the sign where 1 indicates a negative sign and 0 a positive sign. The sign byte
; is followed by the 32 bits of the absolute value. The lowest value byte
; contains the eight least significant bits followed by the next
; more significant bits and so on.
;
; I.e. the memory layout is as follows: 
;
;  sign byte | byte 0 | byte 1 | byte 2 | byte 3
; --------------------------------------------------

; --------------------------------------------------
; This subroutine expects pointers to its operands in ARITH_SCRATCH1/2 (opL) and
; ARITH_SCRATCH3/4 (opR) and adds them. This routine ignores the sign byte.
;
; The result is returned in the second operand, i.e. *opR <- *opL + *opR.
; On return the carry flag holds the carry out of the most significant byte.
; Clobbers A and Y (Y = 4 on return).
; --------------------------------------------------
add32BitUnsigned
    ldy #1                         ; skip over sign byte
    clc
    lda (ARITH_SCRATCH1),y         ; byte 0 (least significant)
    adc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y
    iny                            ; iny does not touch the carry flag
    lda (ARITH_SCRATCH1),y         ; byte 1
    adc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH1),y         ; byte 2
    adc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH1),y         ; byte 3 (most significant)
    adc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    

    rts

; --------------------------------------------------
; This subroutine expects pointers to its operands in ARITH_SCRATCH1/2 (opL) and
; ARITH_SCRATCH3/4 (opR) and subtracts them. The caller has to ensure that
; *opL >= *opR. This routine does not read or write the sign bytes.
;
; The result is returned in the second operand, i.e. *opR <- *opL - *opR.
; Clobbers A and Y (Y = 4 on return).
; --------------------------------------------------
sub32BitUnsigned
    ldy #1                         ; skip over sign byte
    sec                            ; no borrow going in
    lda (ARITH_SCRATCH1),y         ; byte 0 (least significant)
    sbc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y
    iny                            ; iny does not touch the carry flag
    lda (ARITH_SCRATCH1),y         ; byte 1
    sbc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH1),y         ; byte 2
    sbc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH1),y         ; byte 3 (most significant)
    sbc (ARITH_SCRATCH3),y
    sta (ARITH_SCRATCH3),y    
    rts

; --------------------------------------------------
; This subroutine expects pointers to its operands in ARITH_SCRATCH1/2 (opL) and
; ARITH_SCRATCH3/4 (opR) and subtracts them with switched roles. The caller has to
; ensure that *opR >= *opL. This routine ignores the sign bytes.
;
; The result is returned in the second operand, i.e. *opR <- *opR - *opL.
; Clobbers A and Y (Y = 4 on return).
; --------------------------------------------------
sub32SwitchedUnsigned
    ldy #1                         ; skip over sign byte
    sec                            ; no borrow going in
    lda (ARITH_SCRATCH3),y         ; byte 0 (least significant)
    sbc (ARITH_SCRATCH1),y
    sta (ARITH_SCRATCH3),y
    iny                            ; iny does not touch the carry flag
    lda (ARITH_SCRATCH3),y         ; byte 1
    sbc (ARITH_SCRATCH1),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH3),y         ; byte 2
    sbc (ARITH_SCRATCH1),y
    sta (ARITH_SCRATCH3),y    
    iny
    lda (ARITH_SCRATCH3),y         ; byte 3 (most significant)
    sbc (ARITH_SCRATCH1),y
    sta (ARITH_SCRATCH3),y    
    rts


; 64 bit result buffer of mul32BitUnsigned and square32BitUnsigned, least
; significant byte first.
TEMP_MUL
!byte 0,0,0,0,0,0,0,0

; --------------------------------------------------
; This subroutine expects pointers to its operands in ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4.
; It multiplies its operands as if they were unsigned 32 bit integers, the sign
; bytes are skipped.
;
; The method is a schoolbook multiplication with bytes as digits: every byte of
; the left operand is multiplied with every byte of the right operand via mul16Bit
; and the 16 bit partial product is added to TEMP_MUL at byte position
; COUNT_L + COUNT_R.
;
; The result is returned in the eight byte buffer starting at TEMP_MUL. 
; Uses COUNT_L, COUNT_R and HELP_MUL (defined outside of this section) as
; working memory. Clobbers A, X and Y.
; --------------------------------------------------
mul32BitUnsigned
    lda #0
    sta COUNT_L                   ; clear counter for digits of left operand
    sta COUNT_R                   ; clear counter for digits of right operand

    ldx #7                        ; clear temp buffer
    lda #0
.clear                            
    sta TEMP_MUL,X
    dex
    bpl .clear

.loopMul
    ldy COUNT_L
    iny                           ; skip sign byte
    lda (ARITH_SCRATCH1), y       ; load COUNT_L digit of left operand 
    beq .noExtraCarry             ; digit is zero => partial product is zero, nothing to add
    tax                           ; and store it in X register
    ldy COUNT_R                   ; load COUNT_R digit of right operand in accu
    iny                           ; skip sign byte
    lda (ARITH_SCRATCH3), y
    beq .noExtraCarry             ; digit is zero => partial product is zero, nothing to add
    jsr mul16Bit                  ; multiply these 8 bit values. Result MSB in accu, LSB in x register
    sta HELP_MUL                  ; store MSB of multiplication result in temp variable
    lda COUNT_L                   ; calculate index of position where to add the 16 bit result
    clc
    adc COUNT_R                   ; This position is COUNT_L + COUNT_R
    tay                           ; move calculated index to y register
    txa                           ; move LSB of multiplication result to accu
    ; no clc needed: Carry is always clear due to the addition of COUNT_L and COUNT_R
    adc TEMP_MUL, y               ; add LSB to intermediate result
    sta TEMP_MUL, y
    iny
    lda HELP_MUL                  ; add MSB to intermediate result
    adc TEMP_MUL, y
    sta TEMP_MUL, y
    bcc .noExtraCarry
.carryLoop                        ; propagate the carry into the higher bytes of the result
    iny
    lda #0                        ; add carry to intermediate result
    adc TEMP_MUL, y
    sta TEMP_MUL, y
    bcs .carryLoop
.noExtraCarry
    inc COUNT_L                   ; Move processing to next digit of left operand
    lda COUNT_L
    cmp #4
    bne .loopMul
    lda #0                        ; Move processing to next digit of right operand
    sta COUNT_L
    inc COUNT_R
    lda COUNT_R
    cmp #4
    bne .loopMul

    rts


; --------------------------------------------------
; This subroutine expects a pointer to its operand in ARITH_SCRATCH1/2.
; It squares its operand as if it was an unsigned 32 bit integer, the sign byte
; is skipped.
;
; Only digit pairs with COUNT_L <= COUNT_R are processed. The partial product of a
; pair with COUNT_L < COUNT_R is added twice to the result, the partial product of a
; pair with COUNT_L = COUNT_R is added once. The low byte of ARITH_SCRATCH3 is
; overwritten as it is used to control how often a partial product is added.
;
; The result is returned in the eight byte buffer starting at TEMP_MUL. 
; Uses COUNT_L, COUNT_R and HELP_MUL (defined outside of this section) as
; working memory. Clobbers A, X and Y.
; --------------------------------------------------
square32BitUnsigned
    lda #0
    sta COUNT_L                   ; clear counter for digits of left operand
    sta COUNT_R                   ; clear counter for digits of right operand
    ldx #7                        ; clear temp buffer
    lda #0
.squareClear                            
    sta TEMP_MUL,X
    dex
    bpl .squareClear

.loopSquare
    lda #1                        ; 1 => partial product has to be added twice
    sta ARITH_SCRATCH3

    ldy COUNT_L
    cpy COUNT_R
    bne .squareNormal
    ; COUNT_L = COUNT_R: the partial product is the square of a single digit
!ifdef FAST_MUL {    
    iny                           ; skip sign byte
    lda (ARITH_SCRATCH1), y       ; load COUNT_L digit of left operand 
    beq .noExtraCarrySquare       ; digit is zero => nothing to add
    tay
    lda SQ_TAB_LSB, y             ; read the square directly from the lookup tables
    tax                           ; LSB in x register
    lda SQ_TAB_MSB, y             ; MSB in accu
    dec ARITH_SCRATCH3            ; 0 => partial product has to be added only once
    jmp .processResult
} else {
    dec ARITH_SCRATCH3            ; 0 => partial product has to be added only once
}
.squareNormal
    iny                           ; skip sign byte
    lda (ARITH_SCRATCH1), y       ; load COUNT_L digit of left operand 
    beq .noExtraCarrySquare       ; digit is zero => nothing to add
    tax                           ; and store it in X register
    ldy COUNT_R                   ; load COUNT_R digit of the same operand in accu
    iny                           ; skip sign byte
    lda (ARITH_SCRATCH1), y
    beq .noExtraCarrySquare       ; digit is zero => nothing to add
    jsr mul16Bit                  ; multiply these 8 bit values. Result MSB in accu, LSB in x register
.processResult
    sta HELP_MUL                  ; store MSB of multiplication result in temp variable
.addTwice    
    lda COUNT_L                   ; calculate index of position where to add the 16 bit result
    clc
    adc COUNT_R                   ; This position is COUNT_L + COUNT_R
    tay                           ; move calculated index to y register
    txa                           ; move LSB of multiplication result to accu (X is preserved)
    ; no clc needed: Carry is always clear due to the addition of COUNT_L and COUNT_R
    adc TEMP_MUL, y               ; add LSB to intermediate result
    sta TEMP_MUL, y
    iny
    lda HELP_MUL                  ; add MSB to intermediate result
    adc TEMP_MUL, y
    sta TEMP_MUL, y
    bcc .checkDoubleAdd
.carryLoopSquare                  ; propagate the carry into the higher bytes of the result
    iny
    lda #0                         ; add carry to intermediate result
    adc TEMP_MUL, y
    sta TEMP_MUL, y
    bcs .carryLoopSquare
.checkDoubleAdd
    dec ARITH_SCRATCH3            ; 1 -> 0: one more add is due, 0 -> $FF: we are done
    bne .noExtraCarrySquare
    jmp .addTwice
.noExtraCarrySquare
    inc COUNT_L                   ; Move processing to next digit of left operand
    lda COUNT_L
    cmp COUNT_R
    beq .loopSquare               ; continue as long as COUNT_L <= COUNT_R
    bcc .loopSquare
.nextDigit
    lda #0                        ; Move processing to next digit of right operand
    sta COUNT_L
    inc COUNT_R
    lda COUNT_R
    cmp #4
    beq .squareDone
    jmp .loopSquare
.squareDone
    rts


; Buffers of divMod32BitUnsigned. All of them use the five byte layout
; sign byte | byte 0 | byte 1 | byte 2 | byte 3
div32_RES_DIV                      ; quotient
!byte 0,0,0,0,0
div32_RES_REM                      ; remainder
!byte 0,0,0,0,0
div32_HELP                         ; running remainder during the division
!byte 0,0,0,0,0
div32_DIVIDEND                     ; private copy of the dividend, shifted out by the division
!byte 0,0,0,0,0
div32_DIVISOR                      ; private copy of the divisor
!byte 0,0,0,0,0

ADDR_TEMP1                         ; saved pointer to the dividend
!byte 0, 0
ADDR_TEMP2                         ; saved pointer to the divisor
!byte 0, 0

; --------------------------------------------------
; This subroutine expects operand one in ARITH_SCRATCH1/2 and operand two in ARITH_SCRATCH3/4 
; and determines the division result and remainder when dividing operand one by operand two. This 
; routine ignores the sign byte and treats its operands as positive integers. 
; 
; The result is returned in the buffers div32_RES_DIV and div32_RES_REM. The operands
; themselves are not modified as the routine works on private copies. A division by
; zero executes a brk instruction.
;
; The pointers ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4 are overwritten. Clobbers A, X and Y.
;
; Note: the sign byte of div32_RES_REM is 0 when the division loop was executed. When
; the dividend is smaller than the divisor it is a copy of the dividend's sign byte.
; --------------------------------------------------
divMod32BitUnsigned
    ; save pointers to input data
    +move16Bit ARITH_SCRATCH1, ADDR_TEMP1                        ; save address of dividend
    +move16Bit ARITH_SCRATCH3, ADDR_TEMP2                        ; save address of divisor

    ; copy dividend to div32_DIVIDEND (ARITH_SCRATCH1 still points to the dividend)
    +load16BitImmediate div32_DIVIDEND, ARITH_SCRATCH3
    jsr move32Bit

    ; copy divisor to div32_DIVISOR
    +move16Bit ADDR_TEMP2, ARITH_SCRATCH1
    +load16BitImmediate div32_DIVISOR, ARITH_SCRATCH3
    jsr move32Bit

    ; setup state
    +callFuncMono clear32Bit, div32_RES_DIV
    +callFuncMono clear32Bit, div32_HELP

    ; check for division by zero
    +callFuncMono isZero32Bit, div32_DIVISOR
    bne .noDivZero
    brk
.noDivZero
    ; check if dividend < divisor. This has to be an unsigned comparison: the
    ; sign bytes were copied along with the values and the sign aware cmp32Bit
    ; would report every negative dividend as being smaller than a positive divisor.
    +callFunc cmp32BitUnsigned, div32_DIVIDEND, div32_DIVISOR
    bcs .goOn32
    jmp .dividendTooShort                                        ; target is out of branch range
.goOn32
    ; perform division
    ldx #32                                                      ; one iteration per bit of the dividend
.loop32
    ; inlined double32Bit: most significant bit of the dividend -> carry
    asl div32_DIVIDEND+1
    rol div32_DIVIDEND+2
    rol div32_DIVIDEND+3
    rol div32_DIVIDEND+4
    ; inlined rotateLeft32Bit: carry -> least significant bit of the running remainder
    rol div32_HELP+1
    rol div32_HELP+2
    rol div32_HELP+3
    rol div32_HELP+4
    ; inlined unsigned comparison of div32_HELP and div32_DIVISOR, starting at the
    ; most significant byte. The first pair of bytes that differs decides.
    lda div32_HELP+4
    cmp div32_DIVISOR+4
    bne .endCmp32
    lda div32_HELP+3
    cmp div32_DIVISOR+3
    bne .endCmp32
    lda div32_HELP+2
    cmp div32_DIVISOR+2
    bne .endCmp32
    lda div32_HELP+1
    cmp div32_DIVISOR+1
.endCmp32    
    bcc .skip32                                                  ; running remainder < divisor => quotient bit is 0
    ; inlined subtraction div32_HELP <- div32_HELP - div32_DIVISOR. It never borrows,
    ; therefore the carry is set afterwards and becomes the quotient bit.
    sec
    lda div32_HELP+1
    sbc div32_DIVISOR+1
    sta div32_HELP+1
    lda div32_HELP+2
    sbc div32_DIVISOR+2
    sta div32_HELP+2
    lda div32_HELP+3
    sbc div32_DIVISOR+3
    sta div32_HELP+3
    lda div32_HELP+4
    sbc div32_DIVISOR+4
    sta div32_HELP+4
.skip32
    ; inlined rotateLeft32Bit: rotate the quotient bit (carry) into the result
    rol div32_RES_DIV+1
    rol div32_RES_DIV+2
    rol div32_RES_DIV+3
    rol div32_RES_DIV+4 
    dex                                 
    bne .loop32                         
    +callFunc move32Bit, div32_HELP, div32_RES_REM                  
    rts    
.dividendTooShort
    ; quotient was already cleared above, the remainder is the dividend itself
    +callFunc move32Bit, div32_DIVIDEND, div32_RES_REM
    rts


; --------------------------------------------------
; Unsigned comparison of two 32 bit numbers. Pointers to the operands are expected
; in ARITH_SCRATCH1/2 (opL) and ARITH_SCRATCH3/4 (opR). The sign bytes are not
; looked at, only the absolute values are compared.
;
; Result: carry flag is set if *opL >= *opR. The zero flag is set if both
; values are equal.
;
; The bytes are compared from most to least significant. The first pair
; that differs ends the comparison with the flags of that cmp.
; --------------------------------------------------
cmp32BitUnsigned
    ldy #4                       ; byte 3, the most significant one
    lda (ARITH_SCRATCH1), y
    cmp (ARITH_SCRATCH3), y
    bne .endCmp                  ; different => flags already hold the result
    dey                          ; byte 2
    lda (ARITH_SCRATCH1), y
    cmp (ARITH_SCRATCH3), y
    bne .endCmp
    dey                          ; byte 1
    lda (ARITH_SCRATCH1), y
    cmp (ARITH_SCRATCH3), y
    bne .endCmp
    dey                          ; byte 0: all higher bytes were equal
    lda (ARITH_SCRATCH1), y
    cmp (ARITH_SCRATCH3), y      ; these flags are the result, even when equal
.endCmp
    rts


; --------------------------------------------------
; This subroutine expects its operands in the zero page pointers ARITH_SCRATCH1 and ARITH_SCRATCH3
; and checks if they are equal. This routine takes the sign into account, but +0 and -0 are considered
; to be equal. The special case where zero can have a positive or negative sign byte requires separate
; logic in the code below.
;
; The result is returned in the zero flag. It is set if *opL == *opR.
; Clobbers A and Y.
; --------------------------------------------------
isEqual32Bit
    jsr cmp32BitUnsigned
    bne .equalDone                ; absolute values are different => numbers are unequal, zero flag is already cleared
    ; numbers are equal in their absolute value
    jsr isZero32Bit               ; check if *opL  is zero
    beq .equalDone                ; both values are zero => they are equal => return zero flag as result
.notZero
    ; Absolute values of operands are equal and they are nonzero => check sign bytes
    ldy #0
    lda (ARITH_SCRATCH1), y
    eor (ARITH_SCRATCH3), y      ; zero flag is set if the sign bytes are identical as in that case the XOR result is zero
.equalDone
    rts


; --------------------------------------------------
; This subroutine expects its operand in the zero page pointer ARITH_SCRATCH1 and checks whether the 
; value referenced by it is equal to +0 or -0, i.e. the sign byte is not looked at.
;
; The result is returned in the zero flag. It is set if *opL == +0 or -0.
; Clobbers A and Y.
; --------------------------------------------------
isZero32Bit
    ldy #4                        ; count from 4 to 1, ignore sign byte
.checkNextByte2
    lda (ARITH_SCRATCH1), y
    bne .notZero2                 ; There is at least one non zero byte, therefore the value is not zero. Zero flag remains clear
    dey
    bne .checkNextByte2           ; All bytes are zero => return with zero flag set (dey has set it when Y reached 0)
.notZero2
    rts


; --------------------------------------------------
; This subroutine expects its operand in the zero page pointer ARITH_SCRATCH1 and clears
; it, i.e. sets its value to +0. All five bytes, including the sign byte, are set to zero.
;
; Clobbers A (0 on return) and Y ($FF on return).
; --------------------------------------------------
clear32Bit
    ldy #4                        ; start at the most significant byte
    lda #0
.loopClear
    sta (ARITH_SCRATCH1), y
    dey
    bpl .loopClear                ; runs down to and including index 0, the sign byte
    rts

; --------------------------------------------------
; Signed comparison of two 32 bit numbers. Pointers to the operands are expected
; in ARITH_SCRATCH1/2 (opL) and ARITH_SCRATCH3/4 (opR). In contrast to
; cmp32BitUnsigned the sign bytes are taken into account.
;
; Result: carry flag is set if *opL >= *opR. The zero flag is set if the
; values are equal and clear otherwise.
; --------------------------------------------------
cmp32Bit
    jsr isEqual32Bit
    beq .cmpEqual                  ; zero flag set => values are equal (this covers +0 and -0)
    ;
    ; From here on the values are known to be different. The zero flag
    ; has to be clear on return (see .done), only the carry is left to determine.
    ldy #0
    lda (ARITH_SCRATCH1), y
    eor (ARITH_SCRATCH3), y
    bne .signsDiffer
    ;
    ; Both numbers have the same sign => the absolute values decide
    lda (ARITH_SCRATCH1), y
    beq .bothSignsPositive
    ;
    ; Both numbers are negative: the order is the reverse of the order of
    ; the absolute values, i.e. the carry has to be inverted.
    jsr cmp32BitUnsigned
    rol                            ; carry -> bit 0 of accu
    eor #1                         ; invert it
    ror                            ; bit 0 of accu -> carry
    jmp .done
.signsDiffer
    ; Signs are different => the sign of the left number decides: a positive
    ; left number (sign byte 0) is the greater one, a negative one is the smaller one.
    lda #0
    cmp (ARITH_SCRATCH1), y        ; carry is set if and only if the left sign byte is 0
    jmp .done
.bothSignsPositive
    ; Both numbers are positive: the comparison of the absolute values is the result
    jsr cmp32BitUnsigned
.done
    lda #1                         ; clear zero flag, the carry is not changed
    rts
.cmpEqual
    sec                            ; equal also means *opL >= *opR, zero flag is still set
    rts


; --------------------------------------------------
; This subroutine expects pointers to its operands in ARITH_SCRATCH1/2 (opL) and
; ARITH_SCRATCH3/4 (opR). The routine copies the value of opL to opR. All five bytes,
; including the sign byte, are copied.
;
; The result is returned in the second operand, i.e. *opR <- *opL 
; Clobbers A and Y ($FF on return).
; --------------------------------------------------
move32Bit
    ldy #4                         ; start at the most significant byte
.loopMove
    lda (ARITH_SCRATCH1),y
    sta (ARITH_SCRATCH3),y
    dey
    bpl .loopMove                  ; runs down to and including index 0, the sign byte

    rts

; --------------------------------------------------
; move32BitInline copies the five bytes (sign byte and 32 bit value) stored at
; the address .src to the address .target. In contrast to move32Bit the addresses
; are used directly and not via the zero page pointers.
;
; Clobbers A and Y ($FF afterwards).
; --------------------------------------------------
!macro move32BitInline .src, .target {
    ldy #4                         ; start at the most significant byte
.loopMove
    lda .src,y
    sta .target,y
    dey
    bpl .loopMove                  ; runs down to and including index 0, the sign byte
}

; --------------------------------------------------
; This subroutine expects a pointer to its operand in ARITH_SCRATCH1/2.
; The routine then doubles the value of its operand by simply performing a left shift. It
; ignores the sign byte.
;
; The operand is modified, i.e. *op <- 2 * *op 
; On return the carry flag holds the bit that was shifted out of the most
; significant byte. Clobbers A and Y (Y = 4 on return).
; --------------------------------------------------
double32Bit
    ldy #1                        ; skip sign value
    lda (ARITH_SCRATCH1),y        ; byte 0: 0 enters bit 0
    asl
    sta (ARITH_SCRATCH1),y

    iny                           ; neither sta nor iny touch the carry
    lda (ARITH_SCRATCH1),y        ; byte 1
    rol
    sta (ARITH_SCRATCH1),y

    iny
    lda (ARITH_SCRATCH1),y        ; byte 2
    rol
    sta (ARITH_SCRATCH1),y

    iny
    lda (ARITH_SCRATCH1),y        ; byte 3
    rol
    sta (ARITH_SCRATCH1),y

    rts


; --------------------------------------------------
; This subroutine expects a pointer to its operand in ARITH_SCRATCH1/2.
; The routine then rotates the value of its operand by performing a left shift which
; takes the carry into account: the carry flag at the time of the call becomes
; bit 0 of the value. It ignores the sign byte.
;
; The operand is modified, i.e. *op <- rol *op 
; On return the carry flag holds the bit that was shifted out of the most
; significant byte. Clobbers A and Y (Y = 4 on return).
; --------------------------------------------------
rotateLeft32Bit
    ldy #1                        ; skip sign value (ldy does not touch the carry)
    lda (ARITH_SCRATCH1),y        ; byte 0: incoming carry enters bit 0
    rol
    sta (ARITH_SCRATCH1),y

    iny
    lda (ARITH_SCRATCH1),y        ; byte 1
    rol
    sta (ARITH_SCRATCH1),y

    iny
    lda (ARITH_SCRATCH1),y        ; byte 2
    rol
    sta (ARITH_SCRATCH1),y

    iny
    lda (ARITH_SCRATCH1),y        ; byte 3
    rol
    sta (ARITH_SCRATCH1),y

    rts


; --------------------------------------------------
; This subroutine expects a pointer to its operand in ARITH_SCRATCH1/2.
; The routine then halves the value of its operand by simply performing a right shift. It
; ignores the sign byte.
;
; The operand is modified, i.e. *op <- *op / 2 
; On return the carry flag holds the former least significant bit of the value.
; Clobbers A and Y (Y = 1 on return).
; --------------------------------------------------
halve32Bit
    ldy #4 
    lda (ARITH_SCRATCH1),y        ; byte 3: 0 enters bit 7
    lsr 
    sta (ARITH_SCRATCH1),y
    dey                           ; neither sta nor dey touch the carry

    lda (ARITH_SCRATCH1),y        ; byte 2
    ror 
    sta (ARITH_SCRATCH1),y
    dey

    lda (ARITH_SCRATCH1),y        ; byte 1
    ror 
    sta (ARITH_SCRATCH1),y
    dey

    lda (ARITH_SCRATCH1),y        ; byte 0
    ror 
    sta (ARITH_SCRATCH1),y

    rts

; Flag written by prepareAddSub: 1 if the absolute value of the left operand is
; greater than or equal to the absolute value of the right operand, 0 otherwise.
LEFT_GREATER_EQUAL_RIGHT
!byte 0

; --------------------------------------------------
; prepareAddSub compares the absolute values of the operands referenced by
; ARITH_SCRATCH1/2 (opL) and ARITH_SCRATCH3/4 (opR) and records the result
; in LEFT_GREATER_EQUAL_RIGHT.
;
; Clobbers A and Y (via cmp32BitUnsigned).
; --------------------------------------------------
!macro prepareAddSub {
    lda #0
    sta LEFT_GREATER_EQUAL_RIGHT
    jsr cmp32BitUnsigned
    bcc .leftLessThanRight         ; carry clear => |*opL| < |*opR|, flag stays 0
    inc LEFT_GREATER_EQUAL_RIGHT
.leftLessThanRight
}

; --------------------------------------------------
; Signed addition of two 32 bit numbers. Pointers to the operands are expected in
; ARITH_SCRATCH1/2 (opL) and ARITH_SCRATCH3/4 (opR).
;
; The result replaces the second operand, i.e. *opR <- *opL + *opR 
;
; Depending on the signs the work is delegated to one of the unsigned
; routines, which also perform the return to the caller.
; --------------------------------------------------
add32Bit
    +prepareAddSub                         ; record whether |*opL| >= |*opR|
    ldy #0
    lda (ARITH_SCRATCH1), y
    eor (ARITH_SCRATCH3), y
    bne .addMixedSigns
    ; Equal signs: add the absolute values, the sign of opR stays as it is
    jmp add32BitUnsigned
.addMixedSigns
    ; Different signs: subtract the smaller absolute value from the greater one
    lda LEFT_GREATER_EQUAL_RIGHT
    beq .addRightDominates
    ; |*opL| >= |*opR|: the result has the sign of opL and
    ; the absolute value |*opL| - |*opR|
    lda (ARITH_SCRATCH1), y
    sta (ARITH_SCRATCH3), y
    jmp sub32BitUnsigned
.addRightDominates
    ; |*opL| < |*opR|: the result keeps the sign of opR and
    ; has the absolute value |*opR| - |*opL|
    jmp sub32SwitchedUnsigned


; --------------------------------------------------
; sub32Bit expects pointers to its operands in ARITH_SCRATCH1/2 (opL) and
; ARITH_SCRATCH3/4 (opR). The byte at offset 0 of each operand is its sign byte.
; The routine subtracts the two operands and handles their signs correctly:
;
;   sign bytes differ                                  => add32BitUnsigned, opR gets the sign of opL
;   sign bytes equal, LEFT_GREATER_EQUAL_RIGHT is not 0 => sub32BitUnsigned, sign of opR is kept
;   sign bytes equal, LEFT_GREATER_EQUAL_RIGHT is 0    => sub32SwitchedUnsigned, bit 0 of the
;                                                         sign byte of opR is flipped
;
; The unsigned routine is entered by a jump, its rts returns directly to the caller
; of sub32Bit.
;
; The result is returned in the second operand, i.e. *opR <- *opL - *opR
; --------------------------------------------------
sub32Bit
    +prepareAddSub                        ; fills LEFT_GREATER_EQUAL_RIGHT
    ldy #0                                ; offset of the sign byte
    lda (ARITH_SCRATCH1), y
    eor (ARITH_SCRATCH3), y               ; non zero <=> sign bytes differ
    bne .simpleAdd2
    lda LEFT_GREATER_EQUAL_RIGHT
    bne .normalSub2
    ; signs are equal and the flag is 0: switched subtraction,
    ; the sign of the result is the flipped sign of opR
    lda #1
    eor (ARITH_SCRATCH3), y               ; y is still 0
    sta (ARITH_SCRATCH3), y
    jmp sub32SwitchedUnsigned
.simpleAdd2
    ; signs differ: add the values, the result carries the sign of opL
    lda (ARITH_SCRATCH1), y               ; y is still 0
    sta (ARITH_SCRATCH3), y
    jmp add32BitUnsigned
.normalSub2
    ; signs are equal and the flag is set: normal subtraction,
    ; the sign byte of opR stays as it is
    jmp sub32BitUnsigned

; --------------------------------------------------
; neg32Bit expects a pointer to its operand in ARITH_SCRATCH1/2.
; The routine flips the sign of its operand by inverting bit 0 of the sign byte
; at offset 0. The value bytes are not touched.
;
; The operand is modified, i.e. *op <- -*op
; On return y is 0 and the accumulator holds the new sign byte.
; --------------------------------------------------
neg32Bit
    ldy #0                        ; offset of the sign byte
    lda #1                        ; mask for bit 0
    eor (ARITH_SCRATCH1), y       ; old sign byte with bit 0 inverted
    sta (ARITH_SCRATCH1), y
    rts

; --------------------------------------------------
; neg32Inline inverts bit 0 of the byte stored at the address .addr. The
; accumulator holds the new value of that byte afterwards.
; NOTE(review): .addr is presumably the address of the sign byte of a 32 bit
; number - confirm against the callers.
; --------------------------------------------------
!macro neg32Inline .addr {
    lda #1                        ; mask for bit 0
    eor .addr
    sta .addr
}

; --------------------------------------------------
; normalize copies the four bytes stored at TEMP_MUL+.offset to TEMP_MUL+.offset+3
; to the offsets 1 to 4 of the number the zero page pointer .zp points to. The
; byte at offset 0 of the target is not written.
; NOTE(review): TEMP_MUL presumably holds the product calculated by the preceding
; unsigned multiplication with the least significant byte first - confirm.
;
; On exit y is 5 and the accumulator holds the last byte copied.
; --------------------------------------------------
!macro normalize .zp, .offset {
    ldy #1                        ; first target offset behind the sign byte
.copyByte
    lda TEMP_MUL+.offset-1, y     ; y = 1 reads the byte at TEMP_MUL+.offset
    sta (.zp), y
    iny
    cpy #5                        ; stop after offset 4 has been written
    bne .copyByte

}

; --------------------------------------------------
; mulCall stores the exclusive or of the sign bytes (offset 0) of the numbers
; .zp1 and .zp2 point to in the sign byte of the second number and then calls
; mul32BitUnsigned. y is 0 and the accumulator holds the new sign byte when
; mul32BitUnsigned is entered.
; --------------------------------------------------
!macro mulCall .zp1, .zp2 {
    ldy #0                      ; offset of the sign byte
    lda (.zp2),y
    eor (.zp1),y                ; sign of result: set if exactly one sign is set
    sta (.zp2),y

    jsr mul32BitUnsigned
}

; --------------------------------------------------
; squareCall writes 0 to the sign byte (offset 0) of the number .zp1 points to
; and then calls square32BitUnsigned. The accumulator and y are both 0 when
; square32BitUnsigned is entered.
; --------------------------------------------------
!macro squareCall .zp1 {
    ; The sign of the result is always positive
    ldy #0                        ; offset of the sign byte
    tya                           ; accumulator <- 0
    sta (.zp1), y

    jsr square32BitUnsigned

}

; --------------------------------------------------
; This subroutine expects its operands in ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4.
; The routine multiplies the two operands and handles their signs correctly.
;
; The sign byte of the second operand is set to the exclusive or of both sign bytes
; before mul32BitUnsigned is called. After that the four bytes starting at TEMP_MUL
; are copied to the value bytes (offsets 1 to 4) of the second operand.
;
; The result is returned in the second operand, i.e. *opR <- *opL * *opR 
; --------------------------------------------------
mul32Bit
    +mulCall ARITH_SCRATCH1, ARITH_SCRATCH3     ; set sign of result and multiply
    +normalize ARITH_SCRATCH3, 0                ; copy TEMP_MUL+0 .. TEMP_MUL+3 to opR

    rts

; --------------------------------------------------
; This subroutine expects its operand in ARITH_SCRATCH1/2.
; The routine squares its operand and handles the sign correctly.
;
; The sign byte of the operand is set to 0 before square32BitUnsigned is called.
; After that the four bytes starting at TEMP_MUL are copied to the value bytes
; (offsets 1 to 4) of the operand.
;
; The result is returned in the operand, i.e. *opL <- *opL * *opL 
; --------------------------------------------------
square32Bit
    +squareCall ARITH_SCRATCH1                  ; clear sign and square
    +normalize ARITH_SCRATCH1, 0                ; copy TEMP_MUL+0 .. TEMP_MUL+3 to opL

    rts    

; The routines in this block are only assembled if the symbol FIXED8 is defined.
!ifdef FIXED8 {
; --------------------------------------------------
; This subroutine expects its operands in ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4.
; The routine multiplies the two operands and handles their signs correctly. It is
; assumed that the most significant byte holds the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+3 are copied to the value bytes of the result.
;
; The result is returned in the second operand, i.e. *opR <- *opL * *opR 
; --------------------------------------------------
mulFixedPoint8
    +mulCall ARITH_SCRATCH1, ARITH_SCRATCH3     ; set sign of result and multiply
    +normalize ARITH_SCRATCH3, 3                ; copy TEMP_MUL+3 .. TEMP_MUL+6 to opR

    rts

; --------------------------------------------------
; This subroutine expects its operand in ARITH_SCRATCH1/2.
; The routine squares its operand and handles the sign correctly. It is
; assumed that the most significant byte holds the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+3 are copied to the value bytes of the result.
;
; The result is returned in the operand, i.e. *opL <- *opL * *opL 
; --------------------------------------------------
squareFixedPoint8
    +squareCall ARITH_SCRATCH1                  ; clear sign and square
    +normalize ARITH_SCRATCH1, 3                ; copy TEMP_MUL+3 .. TEMP_MUL+6 to opL

    rts 

}

; The routines in this block are only assembled if the symbol FIXED16 is defined.
!ifdef FIXED16 {
; --------------------------------------------------
; This subroutine expects its operands in ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4.
; The routine multiplies the two operands and handles their signs correctly. It is
; assumed that the two most significant bytes hold the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+2 are copied to the value bytes of the result.
;
; The result is returned in the second operand, i.e. *opR <- *opL * *opR 
; --------------------------------------------------
mulFixedPoint16
    +mulCall ARITH_SCRATCH1, ARITH_SCRATCH3     ; set sign of result and multiply
    +normalize ARITH_SCRATCH3, 2                ; copy TEMP_MUL+2 .. TEMP_MUL+5 to opR

    rts

; --------------------------------------------------
; This subroutine expects its operand in ARITH_SCRATCH1/2.
; The routine squares its operand and handles the sign correctly. It is
; assumed that the two most significant bytes hold the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+2 are copied to the value bytes of the result.
;
; The result is returned in the operand, i.e. *opL <- *opL * *opL 
; --------------------------------------------------
squareFixedPoint16
    +squareCall ARITH_SCRATCH1                  ; clear sign and square
    +normalize ARITH_SCRATCH1, 2                ; copy TEMP_MUL+2 .. TEMP_MUL+5 to opL

    rts 

}

; The routines in this block are only assembled if the symbol FIXED24 is defined.
!ifdef FIXED24 {
; --------------------------------------------------
; This subroutine expects its operands in ARITH_SCRATCH1/2 and ARITH_SCRATCH3/4.
; The routine multiplies the two operands and handles their signs correctly.  It is
; assumed that the three most significant bytes hold the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+1 are copied to the value bytes of the result.
;
; The result is returned in the second operand, i.e. *opR <- *opL * *opR 
; --------------------------------------------------
mulFixedPoint24
    +mulCall ARITH_SCRATCH1, ARITH_SCRATCH3     ; set sign of result and multiply
    +normalize ARITH_SCRATCH3, 1                ; copy TEMP_MUL+1 .. TEMP_MUL+4 to opR

    rts

; --------------------------------------------------
; This subroutine expects its operand in ARITH_SCRATCH1/2.
; The routine squares its operand and handles the sign correctly. It is
; assumed that the three most significant bytes hold the integer part. Normalization
; is performed in such a way that the resulting number also fits this format, i.e.
; the four bytes starting at TEMP_MUL+1 are copied to the value bytes of the result.
;
; The result is returned in the operand, i.e. *opL <- *opL * *opL 
; --------------------------------------------------
squareFixedPoint24
    +squareCall ARITH_SCRATCH1                  ; clear sign and square
    +normalize ARITH_SCRATCH1, 1                ; copy TEMP_MUL+1 .. TEMP_MUL+4 to opL

    rts 

}