; SPDX-License-Identifier: MPL-2.0
; SPDX-FileCopyrightText: (c) 2025 A.M. Rowsell
; ============================================================
; Z80 Soft Float Library (4-byte) + Print + Parse (vasm syntax)
; ============================================================
; Float format in memory (big-endian, 4 bytes):
;   byte0: EXP   (8-bit biased exponent, 0 = zero)
;   byte1: S|F22..F16   (bit7 = sign, bits6..0 = top 7 fraction bits)
;   byte2: F15..F8
;   byte3: F7..F0
;
; For EXP != 0:
;   value = (-1)^S * (1.F) * 2^(EXP - FP_BIAS)
;   FP_BIAS = 127
;
; Calling convention (in-place ops):
;   HL -> A (4 bytes)
;   DE -> B (4 bytes)
;   fp_add: A = A + B  (stored back at HL)
;   fp_sub: A = A - B
;   fp_mul: A = A * B
;   fp_div: A = A / B
;
; Extra:
;   fp_print: print float at (HL) using external os_print_vec (A=ASCII)
;   fp_parse: parse null-terminated string at (DE) into float at (HL)
;
; Limitations:
;   - No NaN/Inf/denormals
;   - Truncation (no rounding)
;   - fp_print prints fixed decimals with a lightweight fraction path
;   - fp_parse supports optional +/- and '.' up to MAX_FRAC digits, no exponent notation
; ============================================================

; Exponent bias: a stored EXP byte equal to FP_BIAS represents 2^0.
.equ FP_BIAS,127
; Number of digits fp_print emits after the decimal point.
.equ FRAC_DIGITS,6
; Fraction-digit limit loaded by fp_parse. pow10_table in this file holds
; entries for k = 0..6, so this value must not exceed 6.
.equ MAX_FRAC,6

.extern os_print_vec
; ============================================================
; CODE
; ============================================================
.section "zone","acrx"
; ------------------------------------------------------------
; External routine you provide:
;   os_print_vec: prints ASCII character in A
; ------------------------------------------------------------
; os_print_vec is external, not defined here.

; ============================================================
; Public API: fp_add / fp_sub / fp_mul / fp_div
; ============================================================

; ------------------------------------------------------------
; fp_add: A = A + B
;   In:  HL -> A (4 bytes, receives the result)
;        DE -> B (4 bytes)
;   Clobbers: AF, BC, DE, HL (the unpack/pack helpers advance DE and HL)
; ------------------------------------------------------------
fp_add:
    push hl                     ; destination pointer is needed again for packing
    push de
    call fp_unpackA
    pop de
    call fp_unpackB
    pop hl

    ; A == 0  =>  result is B (tail call into the packer)
    ld a,(A_exp)
    or a
    jp z,fp_pack_from_B_into_A
fp_add_checkB:
    ; B == 0  =>  A + 0: stored value stays as it is
    ld a,(B_exp)
    or a
    ret z

    ; Dispatch on sign agreement: equal signs add magnitudes,
    ; different signs subtract the smaller magnitude from the larger.
    ld a,(B_sign)
    ld b,a
    ld a,(A_sign)
    xor b
    jp nz,fp_add_diff_sign
    jp fp_add_same_sign


; ------------------------------------------------------------
; fp_sub: A = A - B
;   In:  HL -> A (4 bytes, receives the result)
;        DE -> B (4 bytes, must be writable; restored before return)
;   Out: DE -> B (unchanged pointer), B's bytes unchanged
;   Method: flip B's sign bit in memory, run fp_add, flip it back.
;   fp_add advances DE while unpacking B, so the pointer is kept on the
;   stack across the call; restoring through the advanced pointer would
;   corrupt the byte after B and leave B negated.
; ------------------------------------------------------------
fp_sub:
    ; A - A is exactly zero. The sign-flip trick cannot be used when both
    ; pointers alias, because flipping B would flip A as well.
    ld a,h
    cp d
    jr nz,fp_sub_distinct
    ld a,l
    cp e
    jp z,fp_store_zero_A
fp_sub_distinct:
    ; Negate B in place (sign lives in bit 7 of byte 1)
    inc de
    ld a,(de)
    xor 0x80
    ld (de),a
    dec de

    push de                     ; fp_add does not preserve DE
    call fp_add
    pop de

    ; Undo the negation so the caller's B is unchanged
    inc de
    ld a,(de)
    xor 0x80
    ld (de),a
    dec de
    ret


; ------------------------------------------------------------
; fp_mul: A = A * B
;   In:  HL -> A (4 bytes, receives the result)
;        DE -> B (4 bytes)
;   Clobbers: AF, BC, DE, HL
;   NOTE: the result exponent is computed in 8 bits; overflow/underflow
;   wraps around and is not detected.
; ------------------------------------------------------------
fp_mul:
    push hl
    push de
    call fp_unpackA
    pop de
    call fp_unpackB
    pop hl

    ; if A==0 or B==0 => 0
    ld a,(A_exp)
    or a
    jp z,fp_store_zero_A
    ld a,(B_exp)
    or a
    jp z,fp_store_zero_A

    ; sign = A_sign XOR B_sign
    ld a,(A_sign)
    ld b,a
    ld a,(B_sign)
    xor b
    ld (A_sign),a

    ; exponent = A_exp + B_exp - BIAS
    ld a,(A_exp)
    ld b,a
    ld a,(B_exp)
    add a,b
    sub FP_BIAS
    ld (A_exp),a

    ; product = A_mant * B_mant (24x24 => 48).
    ; The multiply returns its partial products in HL (via mul8u), so the
    ; destination pointer has to be saved around it.
    push hl
    call mul24x24_schoolbook
    pop hl

    ; normalize product into A mantissa (may bump the exponent by one)
    call norm_product_to_A

    ; pack back into (HL)
    call fp_packA
    ret


; ------------------------------------------------------------
; fp_div: A = A / B
;   In:  HL -> A (4 bytes, receives the result)
;        DE -> B (4 bytes)
;   A zero dividend or a zero divisor both store 0 (division by zero is
;   reported as 0, there is no error channel).
; ------------------------------------------------------------
fp_div:
    push hl
    push de
    call fp_unpackA
    pop de
    call fp_unpackB
    pop hl

    ; Either operand zero => result 0
    ld a,(B_exp)
    or a
    jp z,fp_store_zero_A
    ld a,(A_exp)
    or a
    jp z,fp_store_zero_A

    ; Result sign: signs are stored as 0/1, XOR combines them
    ld a,(B_sign)
    ld b,a
    ld a,(A_sign)
    xor b
    ld (A_sign),a

    ; Result exponent: A_exp - B_exp + BIAS (8-bit arithmetic)
    ld a,(B_exp)
    ld c,a
    ld a,(A_exp)
    sub c
    add a,FP_BIAS
    ld (A_exp),a

    ; Quotient of the mantissas, then shift the leading 1 back to bit 23
    call div_mantissas_to_A
    call normalize_A_mant

    jp fp_packA                 ; tail call: pack into (HL) and return


; ============================================================
; Add/Sub core (unpacked)
; ============================================================

; fp_add_same_sign: magnitude addition for operands whose signs agree.
;   In:  unpacked A and B in the workspace, HL -> destination float
;   The result keeps the common sign already stored in A_sign.
fp_add_same_sign:
    call align_exponents_A_B
    call add24_A_plus_B

    ; A carry out means the sum needs 25 bits. Shift right one place and
    ; put the carried-out bit back as the new mantissa MSB (shr24_A_1
    ; shifts a 0 into bit 23), then account for the shift in the exponent.
    jr nc,fp_add_same_sign_noCarry
    call shr24_A_1
    ld a,(A_m2)
    or 0x80
    ld (A_m2),a
    ld a,(A_exp)
    inc a
    ld (A_exp),a
fp_add_same_sign_noCarry:
    call normalize_A_mant
    call fp_packA
    ret


; fp_add_diff_sign: magnitude subtraction for operands with opposite signs.
;   In:  unpacked A and B in the workspace, HL -> destination float
;   The larger magnitude is moved into A first, so the result takes the
;   sign of the larger operand and the subtraction never goes negative.
fp_add_diff_sign:
    call compare_mag_A_B        ; carry set when |A| >= |B|
    call nc,swap_A_B_unpacked   ; otherwise make A the larger one
fp_add_diff_sign_A_ge_B:
    call align_exponents_A_B
    call sub24_A_minus_B
    call is_A_mant_zero
    jp z,fp_store_zero_A        ; exact cancellation
    call normalize_A_mant
    jp fp_packA                 ; tail call: pack into (HL) and return


; ============================================================
; Unpack / Pack helpers
; ============================================================

; Unpack the float at (HL) into A_exp / A_sign / A_m2..A_m0.
;   EXP == 0 means zero: sign and mantissa are cleared and HL stays on the
;   exponent byte. Otherwise the hidden 1 is inserted as mantissa bit 23
;   and HL ends on the last byte of the float.
;   Clobbers: AF, B, HL
fp_unpackA:
    ld a,(hl)
    ld (A_exp),a
    or a
    jr z,fp_unpackA_zeroA

    inc hl
    ld b,(hl)                   ; B = S|F22..F16
    ld a,b
    rlca                        ; sign bit -> bit 0
    and 0x01
    ld (A_sign),a               ; 0 = positive, 1 = negative

    ld a,b
    or 0x80                     ; sign position becomes the hidden 1
    ld (A_m2),a
    inc hl
    ld a,(hl)
    ld (A_m1),a
    inc hl
    ld a,(hl)
    ld (A_m0),a
    ret

fp_unpackA_zeroA:
    ; A is 0 here (that is how we got to this branch)
    ld (A_sign),a
    ld (A_m2),a
    ld (A_m1),a
    ld (A_m0),a
    ret


; Unpack the float at (DE) into B_exp / B_sign / B_m2..B_m0.
;   EXP == 0 means zero: sign and mantissa are cleared and DE stays on the
;   exponent byte. Otherwise the hidden 1 is inserted as mantissa bit 23
;   and DE ends on the last byte of the float.
;   Clobbers: AF, B, DE
fp_unpackB:
    ld a,(de)
    ld (B_exp),a
    or a
    jr z,fp_unpackB_zeroB

    inc de
    ld a,(de)
    ld b,a                      ; B = S|F22..F16
    rlca                        ; sign bit -> bit 0
    and 0x01
    ld (B_sign),a               ; 0 = positive, 1 = negative

    ld a,b
    or 0x80                     ; sign position becomes the hidden 1
    ld (B_m2),a
    inc de
    ld a,(de)
    ld (B_m1),a
    inc de
    ld a,(de)
    ld (B_m0),a
    ret

fp_unpackB_zeroB:
    ; A is 0 here (that is how we got to this branch)
    ld (B_sign),a
    ld (B_m2),a
    ld (B_m1),a
    ld (B_m0),a
    ret


; Pack the unpacked A workspace into the 4-byte float at (HL).
;   A_exp == 0 writes the canonical zero (four 0 bytes).
;   Otherwise: byte0 = A_exp, byte1 = sign bit | top 7 fraction bits
;   (hidden 1 removed), byte2 = A_m1, byte3 = A_m0.
;   HL is left pointing at the last byte written. Clobbers: AF, B
fp_packA:
    ld a,(A_exp)
    or a
    jr z,fp_packA_zero

    ld (hl),a                   ; exponent byte
    inc hl

    ld a,(A_m2)
    and 0x7F                    ; drop the hidden 1
    ld b,a
    ld a,(A_sign)
    or a                        ; Z = positive
    ld a,b                      ; (ld does not disturb the flags)
    jr z,fp_packA_storeB1
    or 0x80                     ; negative: set the sign bit
fp_packA_storeB1:
    ld (hl),a
    inc hl
    ld a,(A_m1)
    ld (hl),a
    inc hl
    ld a,(A_m0)
    ld (hl),a
    ret

fp_packA_zero:
    ; A is 0 here
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    ret


; Pack the unpacked B workspace into the float at (HL), i.e. store B as
; the result of an operation whose destination is A.
;   byte0 = B_exp, byte1 = sign bit | top 7 fraction bits, then B_m1, B_m0.
;   HL is left pointing at the last byte written. Clobbers: AF, B
fp_pack_from_B_into_A:
    ld a,(B_exp)
    ld (hl),a
    inc hl

    ld a,(B_m2)
    and 0x7F                    ; drop the hidden 1
    ld b,a
    ld a,(B_sign)
    or a                        ; Z = positive
    ld a,b                      ; (ld does not disturb the flags)
    jr z,fp_pack_from_B_bs1
    or 0x80                     ; negative: set the sign bit
fp_pack_from_B_bs1:
    ld (hl),a
    inc hl
    ld a,(B_m1)
    ld (hl),a
    inc hl
    ld a,(B_m0)
    ld (hl),a
    ret


; fp_store_zero_A: make the result zero.
;   Clears the whole unpacked A workspace and lets fp_packA write the
;   canonical zero to the float at (HL).
fp_store_zero_A:
    xor a
    ld (A_m0),a
    ld (A_m1),a
    ld (A_m2),a
    ld (A_sign),a
    ld (A_exp),a
    jp fp_packA


; ============================================================
; Exponent alignment / compare / swap
; ============================================================

; align_exponents_A_B: bring both unpacked operands to a common exponent.
;   If B has the larger exponent the operands are swapped first, so on
;   return A_exp >= original exponents and B's mantissa has been shifted
;   right by the exponent difference; B_exp is set equal to A_exp.
;   Clobbers: AF, BC
align_exponents_A_B:
    ld a,(A_exp)
    ld b,a
    ld a,(B_exp)
    cp b                        ; B_exp - A_exp
    ret z                       ; already aligned
    call nc,swap_A_B_unpacked   ; B_exp > A_exp: make A the larger exponent

    ld a,(B_exp)
    ld c,a
    ld a,(A_exp)
    sub c                       ; A = exponent difference (> 0)
    call shr24_B_by_A
    ld a,(A_exp)
    ld (B_exp),a
    ret


; compare_mag_A_B: compare the magnitudes of the unpacked operands.
;   Out: carry set   if |A| >= |B|
;        carry clear if |A| <  |B|
;   Fields are compared most significant first (exponent, then m2, m1, m0).
;   Each step computes (B field) - (A field): when the fields differ, the
;   borrow is set exactly when A's field is the larger one, which already
;   is the answer, so the routine can return straight from the compare.
;   Clobbers: AF, B
compare_mag_A_B:
    ld a,(A_exp)
    ld b,a
    ld a,(B_exp)
    cp b
    ret nz
compare_mag_A_B_cmpMant:
    ld a,(A_m2)
    ld b,a
    ld a,(B_m2)
    cp b
    ret nz
    ld a,(A_m1)
    ld b,a
    ld a,(B_m1)
    cp b
    ret nz
    ld a,(A_m0)
    ld b,a
    ld a,(B_m0)
    cp b
    ret nz                      ; differs in the last byte: carry is the verdict
    scf                         ; all fields equal => |A| >= |B|
    ret


; swap_A_B_unpacked: exchange the unpacked operands field by field
; (exponent, sign, and the three mantissa bytes).
;   Clobbers: A, B. Flags, C, DE and HL are untouched.
swap_A_B_unpacked:
    ; exponent
    ld a,(B_exp)
    ld b,a
    ld a,(A_exp)
    ld (B_exp),a
    ld a,b
    ld (A_exp),a
    ; sign
    ld a,(B_sign)
    ld b,a
    ld a,(A_sign)
    ld (B_sign),a
    ld a,b
    ld (A_sign),a
    ; mantissa, most significant byte first
    ld a,(B_m2)
    ld b,a
    ld a,(A_m2)
    ld (B_m2),a
    ld a,b
    ld (A_m2),a
    ld a,(B_m1)
    ld b,a
    ld a,(A_m1)
    ld (B_m1),a
    ld a,b
    ld (A_m1),a
    ld a,(B_m0)
    ld b,a
    ld a,(A_m0)
    ld (B_m0),a
    ld a,b
    ld (A_m0),a
    ret


; ============================================================
; 24-bit mantissa ops
; ============================================================

; add24_A_plus_B: A mantissa += B mantissa (24-bit, m0 is the low byte).
;   Out: carry flag = carry out of bit 23 (the stores after the last adc
;        do not touch the flags).
;   Clobbers: A, B
add24_A_plus_B:
    ld a,(B_m0)
    ld b,a
    ld a,(A_m0)
    add a,b                     ; low byte starts the carry chain
    ld (A_m0),a
    ld a,(B_m1)
    ld b,a
    ld a,(A_m1)
    adc a,b
    ld (A_m1),a
    ld a,(B_m2)
    ld b,a
    ld a,(A_m2)
    adc a,b
    ld (A_m2),a
    ret  ; carry = 25th bit of the sum, consumed by the caller


; sub24_A_minus_B: A mantissa -= B mantissa (24-bit, m0 is the low byte).
;   Out: carry flag = borrow out of the top byte.
;   Clobbers: A, B
sub24_A_minus_B:
    ld a,(B_m0)
    ld b,a
    ld a,(A_m0)
    sub b                       ; low byte starts the borrow chain
    ld (A_m0),a
    ld a,(B_m1)
    ld b,a
    ld a,(A_m1)
    sbc a,b
    ld (A_m1),a
    ld a,(B_m2)
    ld b,a
    ld a,(A_m2)
    sbc a,b
    ld (A_m2),a
    ret


; is_A_mant_zero: test whether the 24-bit A mantissa is all zero.
;   Out: Z set when A_m2 | A_m1 | A_m0 == 0; A holds that OR.
;   Clobbers: A, B
is_A_mant_zero:
    ld a,(A_m0)
    ld b,a
    ld a,(A_m1)
    or b
    ld b,a
    ld a,(A_m2)
    or b
    ret


; shr24_A_1: logical shift of the A mantissa right by one bit.
;   A 0 enters bit 23 (srl); the bit leaving each byte travels to the next
;   lower byte through the carry flag (rr). The loads/stores in between do
;   not affect flags, which is what keeps the chain intact.
;   Out: carry = bit shifted out of A_m0. Clobbers: A
shr24_A_1:
    ld a,(A_m2)
    srl a
    ld (A_m2),a
    ld a,(A_m1)
    rr  a
    ld (A_m1),a
    ld a,(A_m0)
    rr  a
    ld (A_m0),a
    ret


; shl24_A_1: shift the A mantissa left by one bit.
;   add a,a doubles the low byte and produces the carry; adc a,a doubles
;   each higher byte while pulling that carry in.
;   Out: carry = bit shifted out of A_m2. Clobbers: A
shl24_A_1:
    ld a,(A_m0)
    add a,a
    ld (A_m0),a
    ld a,(A_m1)
    adc a,a
    ld (A_m1),a
    ld a,(A_m2)
    adc a,a
    ld (A_m2),a
    ret


; shr24_B_by_A: logical right shift of the B mantissa by A bits (A = 0..255).
;   A count of 24 or more clears the mantissa outright; a count of 0 is a
;   no-op. The running count is kept in SHCNT. Clobbers: AF
shr24_B_by_A:
    ld (SHCNT),a
    cp 24
    jr nc,shr24_B_by_A_flush    ; everything would be shifted out
    or a
    ret z
shr24_B_by_A_loop:
    ld a,(B_m2)
    srl a
    ld (B_m2),a
    ld a,(B_m1)
    rr  a
    ld (B_m1),a
    ld a,(B_m0)
    rr  a
    ld (B_m0),a
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a
    jr nz,shr24_B_by_A_loop
    ret
shr24_B_by_A_flush:
    xor a
    ld (B_m2),a
    ld (B_m1),a
    ld (B_m0),a
    ret


; normalize_A_mant: shift the A mantissa left until bit 23 is set,
; decrementing A_exp once per shift. A zero mantissa sets A_exp = 0
; (the encoding of zero) instead.
;   Clobbers: AF, B
normalize_A_mant:
    call is_A_mant_zero
    jr z,normalize_A_mant_zero
normalize_A_mant_test:
    ld a,(A_m2)
    bit 7,a
    ret nz                      ; leading 1 is in place
    call shl24_A_1
    ld a,(A_exp)
    dec a
    ld (A_exp),a
    jr normalize_A_mant_test
normalize_A_mant_zero:
    xor a
    ld (A_exp),a
    ret


; ============================================================
; 8x8 -> 16 multiply (unsigned), shift-add
; in:  A = multiplicand, C = multiplier
; out: HL = 16-bit product
; clobbers: B, C, DE, HL, flags
;
; The multiplicand is widened to 16 bits in DE and doubled there each
; round. Doubling it in an 8-bit register would drop its upper bits
; (e.g. 0x80 * 2 would come out as 0).
; ============================================================
mul8u:
    ld hl,0                     ; running product
    ld e,a
    ld d,h                      ; DE = multiplicand, zero-extended
    ld b,8                      ; one round per multiplier bit
mul8u_m8:
    srl c                       ; next multiplier bit -> carry
    jr nc,mul8u_noadd
    add hl,de
mul8u_noadd:
    sla e                       ; DE <<= 1
    rl d
    djnz mul8u_m8
    ret


; ============================================================
; 24x24 schoolbook multiply into P0..P5 (P0 LSB)
;   P = A mantissa * B mantissa, built from nine 8x8 partial products.
;   Partial product (i,j) = A_mi * B_mj is added at byte offset i+j.
;   Clobbers: AF, BC, DE, HL (mul8u returns each product in HL)
; ============================================================
mul24x24_schoolbook:
    xor a
    ld (P5),a
    ld (P4),a
    ld (P3),a
    ld (P2),a
    ld (P1),a
    ld (P0),a

    ; A_m0 * B_m0 -> offset 0
    ld hl,B_m0
    ld c,(hl)
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at0

    ; A_m0 * B_m1 -> offset 1
    ld hl,B_m1
    ld c,(hl)
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at1

    ; A_m0 * B_m2 -> offset 2
    ld hl,B_m2
    ld c,(hl)
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at2

    ; A_m1 * B_m0 -> offset 1
    ld hl,B_m0
    ld c,(hl)
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at1

    ; A_m1 * B_m1 -> offset 2
    ld hl,B_m1
    ld c,(hl)
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at2

    ; A_m1 * B_m2 -> offset 3
    ld hl,B_m2
    ld c,(hl)
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at3

    ; A_m2 * B_m0 -> offset 2
    ld hl,B_m0
    ld c,(hl)
    ld a,(A_m2)
    call mul8u
    call add16_to_P_at2

    ; A_m2 * B_m1 -> offset 3
    ld hl,B_m1
    ld c,(hl)
    ld a,(A_m2)
    call mul8u
    call add16_to_P_at3

    ; A_m2 * B_m2 -> offset 4 (tail call, returns to our caller)
    ld hl,B_m2
    ld c,(hl)
    ld a,(A_m2)
    call mul8u
    jp add16_to_P_at4


; add16_to_P_atN: add the 16-bit value in HL to the 48-bit accumulator
; P0..P5 at byte offset N (L is added to P[N], H to P[N+1]).
;   A carry out of P[N+1] ripples into the higher bytes through the
;   add16_carry_P* chain below. Dropping it would lose 2^(8*(N+2)) from
;   the product whenever an intermediate sum overflows.
;   Clobbers: AF
add16_to_P_at0:
    ld a,(P0)
    add a,l
    ld (P0),a
    ld a,(P1)
    adc a,h
    ld (P1),a
    ret nc
    jr add16_carry_P2
add16_to_P_at1:
    ld a,(P1)
    add a,l
    ld (P1),a
    ld a,(P2)
    adc a,h
    ld (P2),a
    ret nc
    jr add16_carry_P3
add16_to_P_at2:
    ld a,(P2)
    add a,l
    ld (P2),a
    ld a,(P3)
    adc a,h
    ld (P3),a
    ret nc
    jr add16_carry_P4
add16_to_P_at3:
    ld a,(P3)
    add a,l
    ld (P3),a
    ld a,(P4)
    adc a,h
    ld (P4),a
    ret nc
    jr add16_carry_P5
add16_to_P_at4:
    ; P5 is the top byte; a 24x24 product cannot carry out of it
    ld a,(P4)
    add a,l
    ld (P4),a
    ld a,(P5)
    adc a,h
    ld (P5),a
    ret

; Carry ripple: increment one byte; only if it wrapped to 0 (Z set by inc)
; does the carry continue into the next byte by falling through.
add16_carry_P2:
    ld a,(P2)
    inc a
    ld (P2),a
    ret nz
add16_carry_P3:
    ld a,(P3)
    inc a
    ld (P3),a
    ret nz
add16_carry_P4:
    ld a,(P4)
    inc a
    ld (P4),a
    ret nz
add16_carry_P5:
    ld a,(P5)
    inc a
    ld (P5),a
    ret


; ============================================================
; Normalize product P into A mantissa
; P is 48-bit, P0 LSB .. P5 MSB
;   If bit 47 of the product is set, the top 24 bits are bits 47..24:
;   shift right by 24 and add one to A_exp. Otherwise the leading bit is
;   bit 46: shift right by 23. The low three bytes of the shifted product
;   become the new A mantissa (lower product bits are truncated).
; ============================================================
norm_product_to_A:
    ld a,(P5)
    bit 7,a
    jr nz,norm_product_shift24
    ld a,23
    call shr48_P_by_A
    jr norm_product_take
norm_product_shift24:
    ld a,24
    call shr48_P_by_A
    ld a,(A_exp)
    inc a
    ld (A_exp),a
norm_product_take:
    ld a,(P2)
    ld (A_m2),a
    ld a,(P1)
    ld (A_m1),a
    ld a,(P0)
    ld (A_m0),a
    ret


; shr48_P_by_A: logical right shift of the 48-bit value P5..P0 by A bits.
;   A = 0 returns immediately. The remaining count lives in SHCNT.
;   Clobbers: AF
shr48_P_by_A:
    ld (SHCNT),a
    or a
    ret z
shr48_P_by_A_loop:
    ; one-bit shift: 0 enters P5, each byte hands its low bit down via carry
    ld a,(P5)
    srl a
    ld (P5),a
    ld a,(P4)
    rr  a
    ld (P4),a
    ld a,(P3)
    rr  a
    ld (P3),a
    ld a,(P2)
    rr  a
    ld (P2),a
    ld a,(P1)
    rr  a
    ld (P1),a
    ld a,(P0)
    rr  a
    ld (P0),a
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a
    jr nz,shr48_P_by_A_loop
    ret


; ============================================================
; Mantissa division (restoring-style)
; A_m = (A_m << 23) / B_m
;   P5..P3 acts as the running remainder; 24 quotient bits are shifted
;   into A_m2..A_m0, most significant first.
;   Clobbers: AF, BC. HL is not touched.
;
;   Two details matter for correctness:
;   - The loop counter is saved on the stack, because the subtract and
;     restore helpers use B as scratch.
;   - Doubling the remainder can overflow the 24-bit window. When
;     shl48_P_1 reports a carry out of P5 the true remainder is
;     2^24 + P5..P3, which is always >= the divisor, so the subtraction
;     is taken unconditionally (its borrow cancels the lost bit).
; ============================================================
div_mantissas_to_A:
    ; P = A_m as 48-bit, then shift left 23
    xor a
    ld (P3),a
    ld (P4),a
    ld (P5),a
    ld a,(A_m0)
    ld (P0),a
    ld a,(A_m1)
    ld (P1),a
    ld a,(A_m2)
    ld (P2),a

    ld a,23
    call shl48_P_by_A

    ; clear quotient
    xor a
    ld (A_m2),a
    ld (A_m1),a
    ld (A_m0),a

    ld b,24
div_mantissas_loop:
    push bc                     ; helpers below clobber B
    call shl24_A_1              ; make room for the next quotient bit
    call shl48_P_1              ; remainder <<= 1, carry = overflowed bit
    jr c,div_mantissas_force

    ; subtract divisor from high 24 bits of P (P5..P3)
    call sub24_Phigh_minus_B
    jr nc,div_mantissas_setbit
    ; borrow => divisor did not fit: undo the subtraction, bit stays 0
    call add24_Phigh_plus_B
    jr div_mantissas_next
div_mantissas_force:
    ; 25-bit remainder: the divisor always fits
    call sub24_Phigh_minus_B
div_mantissas_setbit:
    ld a,(A_m0)
    or 0x1
    ld (A_m0),a
div_mantissas_next:
    pop bc
    djnz div_mantissas_loop
    ret


; shl48_P_by_A: shift the 48-bit value P5..P0 left by A bits.
;   A = 0 returns immediately. The remaining count lives in SHCNT.
;   Clobbers: AF
shl48_P_by_A:
    ld (SHCNT),a
    or a
    ret z
shl48_P_by_A_loop:
    call shl48_P_1
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a
    jr nz,shl48_P_by_A_loop
    ret


; shl48_P_1: shift the 48-bit value P5..P0 left by one bit.
;   add a,a doubles P0 and produces the carry; each adc a,a doubles the
;   next byte while pulling the carry in. The stores do not change flags.
;   Out: carry = bit shifted out of P5. Clobbers: A
shl48_P_1:
    ld a,(P0)
    add a,a
    ld (P0),a
    ld a,(P1)
    adc a,a
    ld (P1),a
    ld a,(P2)
    adc a,a
    ld (P2),a
    ld a,(P3)
    adc a,a
    ld (P3),a
    ld a,(P4)
    adc a,a
    ld (P4),a
    ld a,(P5)
    adc a,a
    ld (P5),a
    ret


; sub24_Phigh_minus_B: P5..P3 -= B mantissa (24-bit, P3 / B_m0 are the
; low bytes). The result is stored even when it borrows; the caller is
; expected to undo it with add24_Phigh_plus_B in that case.
;   Out: carry = borrow. Clobbers: A, B
sub24_Phigh_minus_B:
    ld a,(B_m0)
    ld b,a
    ld a,(P3)
    sub b
    ld (P3),a
    ld a,(B_m1)
    ld b,a
    ld a,(P4)
    sbc a,b
    ld (P4),a
    ld a,(B_m2)
    ld b,a
    ld a,(P5)
    sbc a,b
    ld (P5),a
    ret    ; carry set indicates borrow


; add24_Phigh_plus_B: P5..P3 += B mantissa (24-bit, P3 / B_m0 are the
; low bytes). Used by the divider to restore the remainder after a
; subtraction that borrowed.
;   Clobbers: A, B, flags
add24_Phigh_plus_B:
    ld a,(B_m0)
    ld b,a
    ld a,(P3)
    add a,b
    ld (P3),a
    ld a,(B_m1)
    ld b,a
    ld a,(P4)
    adc a,b
    ld (P4),a
    ld a,(B_m2)
    ld b,a
    ld a,(P5)
    adc a,b
    ld (P5),a
    ret


; ============================================================
; fp_print: fixed format printing
; Prints: [-]I.FFFFFF (FRAC_DIGITS digits, truncated, no rounding)
; In:   HL -> float (4 bytes)
; Uses: os_print_vec (A=char). BC is saved around the calls made from
;       the digit loops, so the printer is free to clobber it.
;
; Method: value = M * 2^(EXP - FP_BIAS - 23), M = 24-bit mantissa with
; the hidden 1. M is loaded into the 32-bit integer PR_INT with a 32-bit
; binary fraction PR_R (PR_R3 = most significant byte) below it. The
; pair is shifted left or right by the exponent, PR_INT is printed as
; the integer part, and each fraction digit is the byte that overflows
; out of PR_R when it is multiplied by 10.
;
; Magnitudes of 2^32 and above do not fit PR_INT and are printed as
; 4294967295.
; ============================================================
fp_print:
    ; zero?
    ld a,(hl)
    or a
    jr nz,fp_print_nz
    ld a,'0'
    call os_print_vec
    ld a,'.'
    call os_print_vec
    ld b,FRAC_DIGITS
fp_print_zf:
    push bc
    ld a,'0'
    call os_print_vec
    pop bc
    djnz fp_print_zf
    ret

fp_print_nz:
    ; PR_E = biased exponent (A still holds byte 0)
    ld (PR_E),a
    inc hl

    ; sign + top fraction bits
    ld a,(hl)
    ld b,a
    rlca                        ; sign bit -> bit 0
    and 0x01
    ld (PR_SIGN),a

    ; mantissa with hidden 1 inserted
    ld a,b
    or 0x80
    ld (PR_M2),a
    inc hl
    ld a,(hl)
    ld (PR_M1),a
    inc hl
    ld a,(hl)
    ld (PR_M0),a

    ; print '-'
    ld a,(PR_SIGN)
    or a
    jr z,fp_print_mag
    ld a,'-'
    call os_print_vec
fp_print_mag:
    ; PR_INT = M (as a 24-bit integer), PR_R = 0
    xor a
    ld (PR_INT3),a
    ld (PR_R0),a
    ld (PR_R1),a
    ld (PR_R2),a
    ld (PR_R3),a
    ld a,(PR_M0)
    ld (PR_INT0),a
    ld a,(PR_M1)
    ld (PR_INT1),a
    ld a,(PR_M2)
    ld (PR_INT2),a

    ; S = EXP - (FP_BIAS + 23), decided on the biased value so that the
    ; full exponent range is handled without signed 8-bit overflow.
    ld a,(PR_E)
    sub FP_BIAS+23
    jr c,fp_print_S_neg

    ; S >= 0: pure integer, INT = M << S. Only S <= 8 fits in 32 bits.
    cp 9
    jr nc,fp_print_saturate
    ld b,a
    call shl32_INT_by_B
    jr fp_print_print_int_and_frac

fp_print_saturate:
    ld a,0xFF
    ld (PR_INT0),a
    ld (PR_INT1),a
    ld (PR_INT2),a
    ld (PR_INT3),a
    jr fp_print_print_int_and_frac

fp_print_S_neg:
    ; S < 0: shift right by -S, shifted-out bits become the fraction
    neg
    ld b,a                      ; B = shift count (1..149)
    call shr32_INT_to_INT_with_remainder

fp_print_print_int_and_frac:
    call print_u32_dec
    ld a,'.'
    call os_print_vec
    ld b,FRAC_DIGITS
fp_print_fr:
    push bc
    call mul_remainder_by_10    ; A = next decimal digit (0..9)
    add a,'0'
    call os_print_vec
    pop bc
    djnz fp_print_fr
    ret


; Shift PR_INT right by B bits (B = 0..255). The bits shifted out are
; collected in the 32-bit fraction PR_R3..PR_R0 (PR_R3 most significant),
; i.e. the 64-bit value PR_INT:PR_R is shifted right as a whole after
; PR_R has been cleared. Fraction bits below 2^-32 are discarded.
;   Clobbers: AF, B
shr32_INT_to_INT_with_remainder:
    xor a
    ld (PR_R0),a
    ld (PR_R1),a
    ld (PR_R2),a
    ld (PR_R3),a
    ld a,b
    or a
    ret z
shr32_INT_to_INT_with_remainder_loop:
    ld a,(PR_INT3)
    srl a
    ld (PR_INT3),a
    ld a,(PR_INT2)
    rr  a
    ld (PR_INT2),a
    ld a,(PR_INT1)
    rr  a
    ld (PR_INT1),a
    ld a,(PR_INT0)
    rr  a
    ld (PR_INT0),a
    ; carry holds the bit that left the integer: rotate it into the
    ; top of the fraction (loads/stores leave the carry untouched)
    ld a,(PR_R3)
    rr  a
    ld (PR_R3),a
    ld a,(PR_R2)
    rr  a
    ld (PR_R2),a
    ld a,(PR_R1)
    rr  a
    ld (PR_R1),a
    ld a,(PR_R0)
    rr  a
    ld (PR_R0),a
    djnz shr32_INT_to_INT_with_remainder_loop
    ret


; Shift PR_INT (32-bit, PR_INT0 = low byte) left by B bits.
; B = 0 returns immediately. Clobbers: AF, B
shl32_INT_by_B:
    ld a,b
    or a
    ret z
shl32_INT_by_B_loop:
    ld a,(PR_INT0)
    add a,a
    ld (PR_INT0),a
    ld a,(PR_INT1)
    adc a,a
    ld (PR_INT1),a
    ld a,(PR_INT2)
    adc a,a
    ld (PR_INT2),a
    ld a,(PR_INT3)
    adc a,a
    ld (PR_INT3),a
    djnz shl32_INT_by_B_loop
    ret


; Multiply the 32-bit binary fraction PR_R3..PR_R0 by 10.
;   Out: PR_R = low 32 bits of the product,
;        A    = the byte that overflowed out of the top, which is the
;               next decimal digit of the fraction (0..9).
;   Clobbers: AF, C, DE, HL. B is preserved.
mul_remainder_by_10:
    ld c,0                      ; carry between bytes
    ld hl,PR_R0
    call mul_remainder_by_10_byte
    ld hl,PR_R1
    call mul_remainder_by_10_byte
    ld hl,PR_R2
    call mul_remainder_by_10_byte
    ld hl,PR_R3
    call mul_remainder_by_10_byte
    ld a,c
    ret

; One byte of the multiply: (HL) = low8((HL)*10 + C), C = high byte.
; The 16-bit intermediate is at most 255*10 + 9 = 2559, so C stays 0..9.
mul_remainder_by_10_byte:
    push hl
    ld l,(hl)
    ld h,0
    add hl,hl                   ; x2
    ld d,h
    ld e,l
    add hl,hl                   ; x4
    add hl,hl                   ; x8
    add hl,de                   ; x10
    ld e,c
    ld d,0
    add hl,de                   ; + carry from the byte below
    ld c,h
    ld a,l
    pop hl
    ld (hl),a
    ret


; Print PR_INT (u32) as decimal through os_print_vec.
;   PR_INT is consumed (left at 0). Digits are produced least significant
;   first by repeated division by 10 and staged on the stack (at most 10
;   for a 32-bit value), then popped and printed most significant first.
;   A value of 0 prints a single '0' because the divide runs at least once.
;   Clobbers: AF, BC, DE, HL
print_u32_dec:
    ld c,0                      ; C = number of digits staged
print_u32_dec_dloop:
    push bc
    call u32_div10_inplace      ; A = remainder, quotient back in PR_INT
    pop bc
    add a,'0'
    push af                     ; stage the digit character
    inc c
    ; quotient == 0 ?
    ld a,(PR_INT0)
    ld b,a
    ld a,(PR_INT1)
    or b
    ld b,a
    ld a,(PR_INT2)
    or b
    ld b,a
    ld a,(PR_INT3)
    or b
    jr nz,print_u32_dec_dloop

    ; print in reverse order of generation
print_u32_dec_pr:
    pop af
    push bc                     ; the printer may clobber BC
    call os_print_vec
    pop bc
    dec c
    jr nz,print_u32_dec_pr
    ret


; Divide PR_INT (u32) by 10 in place, return remainder in A (0..9).
;   Long division one byte at a time, starting at the most significant
;   byte PR_INT3 and ending at PR_INT0. Each byte is addressed by name so
;   the walk cannot stray into neighbouring variables.
;   Clobbers: AF, BC, DE, HL
u32_div10_inplace:
    ld b,0          ; remainder
    ld hl,PR_INT3
    call u32_div10_step
    ld hl,PR_INT2
    call u32_div10_step
    ld hl,PR_INT1
    call u32_div10_step
    ld hl,PR_INT0
    call u32_div10_step
    ld a,b
    ret

; One step: divide (B*256 + (HL)) by 10 by repeated subtraction.
;   In:  B = remainder carried in (0..9), HL -> byte
;   Out: (HL) = quotient byte, B = new remainder (0..9)
;   The dividend is below 2560, so the quotient always fits in C.
u32_div10_step:
    ; DE = remainder*256 + byte
    ld a,b
    ld d,a
    ld e,(hl)
    ld c,0          ; quotient byte
u32_div10_div:
    ld a,d
    or a
    jr nz,u32_div10_sub         ; DE >= 256, certainly >= 10
    ld a,e
    cp 10
    jr c,u32_div10_done         ; DE < 10: finished
u32_div10_sub:
    ld a,e
    sub 10
    ld e,a
    ld a,d
    sbc a,0
    ld d,a
    inc c
    jr u32_div10_div
u32_div10_done:
    ld (hl),c
    ld b,e
    ret


; ============================================================
; fp_parse: parse decimal string -> float
; DE -> "[+|-]ddd[.ddd]\0"
; HL -> output float (4 bytes)
;
; All digits are accumulated into one 32-bit integer P_S (the value
; scaled by 10^k, k = number of fraction digits taken). P_S is converted
; to a float and then divided by 10^k from pow10_table.
;   - At most MAX_FRAC fraction digits are used; parsing stops there.
;   - The digit count is tracked in P_FRACN rather than in a register,
;     because the multiply helper uses B as scratch.
;   - The output pointer is kept on the stack: the conversion and divide
;     helpers advance HL while storing.
;   - P_S is 32 bits wide; longer digit strings overflow silently.
; ============================================================
fp_parse:
    push hl                     ; output pointer, reloaded before each use
    xor a
    ld (P_SIGN),a
    ld (P_FRACN),a
    ld (P_S0),a
    ld (P_S1),a
    ld (P_S2),a
    ld (P_S3),a

    ; optional sign
    ld a,(de)
    cp '-'
    jr nz,fp_parse_chkplus
    ld a,1
    ld (P_SIGN),a
    inc de
    jr fp_parse_intpart
fp_parse_chkplus:
    cp '+'
    jr nz,fp_parse_intpart
    inc de

fp_parse_intpart:
    ld a,(de)
    call is_digit
    jr nc,fp_parse_maybe_dot
fp_parse_il:
    ld a,(de)
    sub '0'
    ld c,a                      ; C = digit value, survives the multiply
    call u32_mul10_scaled
    call u32_add8_scaled
    inc de
    ld a,(de)
    call is_digit
    jr c,fp_parse_il

fp_parse_maybe_dot:
    ld a,(de)
    cp '.'
    jr nz,fp_parse_finish_scaled
    inc de

fp_parse_fl:
    ; stop once MAX_FRAC digits have been consumed
    ld a,(P_FRACN)
    cp MAX_FRAC
    jr nc,fp_parse_finish_scaled
    ld a,(de)
    call is_digit
    jr nc,fp_parse_finish_scaled
    ld a,(de)
    sub '0'
    ld c,a
    call u32_mul10_scaled
    call u32_add8_scaled
    ld a,(P_FRACN)
    inc a
    ld (P_FRACN),a
    inc de
    jr fp_parse_fl

fp_parse_finish_scaled:
    ; convert scaled u32 to float into (HL)
    pop hl
    push hl
    call fp_from_u32_scaled_to_A

    ; divide by 10^k if needed
    ld a,(P_FRACN)
    or a
    jr z,fp_parse_apply_sign

    ; DE = &pow10_table[k]  (4 bytes per entry)
    ld l,a
    ld h,0
    add hl,hl
    add hl,hl
    ld de,pow10_table
    add hl,de
    ex de,hl
    pop hl
    push hl
    call fp_div

fp_parse_apply_sign:
    pop hl                      ; HL -> output float again
    ld a,(P_SIGN)
    or a
    ret z
    ld a,(hl)
    or a
    ret z                       ; keep zero in its canonical all-zero form
    inc hl
    ld a,(hl)
    xor 0x80
    ld (hl),a
    ret


; is_digit: classify the character in A.
;   Out: carry set if A is '0'..'9', carry clear otherwise. A is preserved.
is_digit:
    cp '0'
    jr c,is_digit_no            ; below '0'
    cp '9'+1
    jr c,is_digit_yes           ; within '0'..'9'
is_digit_no:
    or a                        ; clears carry
    ret
is_digit_yes:
    scf
    ret


; u32_mul10_scaled: P_S = P_S * 10 (32-bit, overflow discarded).
;   Computed as P_S*8 + P_S*2, with PR_R holding the x8 term and PR_INT
;   the x2 term; both scratch areas are overwritten.
;   Clobbers: AF, B. C, DE and HL are preserved.
u32_mul10_scaled:
    ; PR_R = P_S << 3
    ld a,(P_S0)
    ld (PR_R0),a
    ld a,(P_S1)
    ld (PR_R1),a
    ld a,(P_S2)
    ld (PR_R2),a
    ld a,(P_S3)
    ld (PR_R3),a
    ld b,3
    call shl32_R_by_B

    ; PR_INT = P_S << 1
    ld a,(P_S0)
    ld (PR_INT0),a
    ld a,(P_S1)
    ld (PR_INT1),a
    ld a,(P_S2)
    ld (PR_INT2),a
    ld a,(P_S3)
    ld (PR_INT3),a
    ld b,1
    call shl32_INT_by_B

    ; P_S = PR_INT + PR_R, low byte first so the carry ripples upward
    ld a,(PR_R0)
    ld b,a
    ld a,(PR_INT0)
    add a,b
    ld (P_S0),a
    ld a,(PR_R1)
    ld b,a
    ld a,(PR_INT1)
    adc a,b
    ld (P_S1),a
    ld a,(PR_R2)
    ld b,a
    ld a,(PR_INT2)
    adc a,b
    ld (P_S2),a
    ld a,(PR_R3)
    ld b,a
    ld a,(PR_INT3)
    adc a,b
    ld (P_S3),a
    ret


; shl32_R_by_B: shift PR_R (32-bit, PR_R0 = low byte) left by B bits.
;   B = 0 returns immediately. Each pass doubles PR_R0 with add a,a and
;   the higher bytes with adc a,a so the carry ripples upward; bits
;   leaving PR_R3 are discarded.
;   Clobbers: AF, B (B is 0 on return after a non-zero shift)
shl32_R_by_B:
    ld a,b
    or a
    ret z
shl32_R_by_B_loop:
    ld a,(PR_R0)
    add a,a
    ld (PR_R0),a
    ld a,(PR_R1)
    adc a,a
    ld (PR_R1),a
    ld a,(PR_R2)
    adc a,a
    ld (PR_R2),a
    ld a,(PR_R3)
    adc a,a
    ld (PR_R3),a
    djnz shl32_R_by_B_loop
    ret


; u32_add8_scaled: P_S += C (32-bit accumulator, P_S0 = low byte).
;   The callers in this file pass a decimal digit value (0..9) in C.
;   The carry from the low byte is propagated through the upper three
;   bytes with adc a,0; a carry out of P_S3 is discarded.
;   Clobbers: AF
u32_add8_scaled:
    ld a,(P_S0)
    add a,c
    ld (P_S0),a
    ld a,(P_S1)
    adc a,0
    ld (P_S1),a
    ld a,(P_S2)
    adc a,0
    ld (P_S2),a
    ld a,(P_S3)
    adc a,0
    ld (P_S3),a
    ret


; Convert P_S (u32) to float at (HL). Positive only; sign handled by caller.
;   In:  HL -> output float (4 bytes)
;   Out: float stored; HL is left on the last byte written
;   P_S == 0 stores the canonical zero. Otherwise the index B of the
;   highest set bit gives EXP = FP_BIAS + B. The value is shifted left
;   until that bit sits at bit 31, so the 24-bit mantissa is PR_INT3..
;   PR_INT1 and the hidden 1 is bit 7 of PR_INT3. Bits that fall into
;   PR_INT0 are truncated.
;   Clobbers: AF, BC, HL, PR_INT
fp_from_u32_scaled_to_A:
    ld a,(P_S0)
    ld b,a
    ld a,(P_S1)
    or b
    ld b,a
    ld a,(P_S2)
    or b
    ld b,a
    ld a,(P_S3)
    or b
    jr nz,fp_from_u32_scaled_to_A_nz
    ld (hl),0
    inc hl
    ld (hl),0
    inc hl
    ld (hl),0
    inc hl
    ld (hl),0
    ret

fp_from_u32_scaled_to_A_nz:
    ; find MSB index in B (0..31): pick the highest non-zero byte, then
    ; walk down from its bit 7
    ld b,31
    ld a,(P_S3)
    ld c,a
    or a
    jr nz,fp_from_u32_scaled_to_A_scan
    ld b,23
    ld a,(P_S2)
    ld c,a
    or a
    jr nz,fp_from_u32_scaled_to_A_scan
    ld b,15
    ld a,(P_S1)
    ld c,a
    or a
    jr nz,fp_from_u32_scaled_to_A_scan
    ld b,7
    ld a,(P_S0)
    ld c,a
fp_from_u32_scaled_to_A_scan:
fp_from_u32_scaled_to_A_find:
    bit 7,c
    jr nz,fp_from_u32_scaled_to_A_found
    ld a,c
    add a,a
    ld c,a
    dec b
    jr fp_from_u32_scaled_to_A_find
fp_from_u32_scaled_to_A_found:
    ; EXP = FP_BIAS + B
    ld a,b
    add a,FP_BIAS
    ld (hl),a
    inc hl

    ; shift count that moves bit B up to bit 31 (0..31, never negative)
    ld a,31
    sub b
    ld b,a

    ; PR_INT = P_S
    ld a,(P_S0)
    ld (PR_INT0),a
    ld a,(P_S1)
    ld (PR_INT1),a
    ld a,(P_S2)
    ld (PR_INT2),a
    ld a,(P_S3)
    ld (PR_INT3),a
    call shl32_INT_by_B         ; returns at once when B == 0

    ; store sign=0, fraction = top 23 bits below the hidden 1
    ld a,(PR_INT3)
    and 0x7F
    ld (hl),a
    inc hl
    ld a,(PR_INT2)
    ld (hl),a
    inc hl
    ld a,(PR_INT1)
    ld (hl),a
    ret


; ============================================================
; BSS / WORKSPACE
; ============================================================
.balign 16
.bss

; Unpacked A (mantissa bytes: m2 = most significant, includes hidden 1)
A_exp: .space 1
A_sign: .space 1
A_m2: .space 1
A_m1: .space 1
A_m0: .space 1

; Unpacked B
B_exp: .space 1
B_sign: .space 1
B_m2: .space 1
B_m1: .space 1
B_m0: .space 1

; 48-bit workspace (P0 LSB .. P5 MSB)
P0: .space 1
P1: .space 1
P2: .space 1
P3: .space 1
P4: .space 1
P5: .space 1

; Shift counter used by the multi-bit shift helpers
SHCNT: .space 1

; Print temps (PR_INT0 / PR_R0 are the low bytes of their 32-bit values)
PR_SIGN: .space 1
PR_E: .space 1
PR_M2: .space 1
PR_M1: .space 1
PR_M0: .space 1
PR_INT0: .space 1
PR_INT1: .space 1
PR_INT2: .space 1
PR_INT3: .space 1
PR_R0: .space 1
PR_R1: .space 1
PR_R2: .space 1
PR_R3: .space 1

; Parse temps (P_S0 is the low byte of the 32-bit accumulator)
P_SIGN: .space 1
P_FRACN: .space 1
P_S0: .space 1
P_S1: .space 1
P_S2: .space 1
P_S3: .space 1

; Digit buffer: a 32-bit value has up to 10 decimal digits
; (4294967295), so 10 bytes are reserved ahead of the length byte.
DIGBUF: .space 10
DIGLEN: .space 1

; ============================================================
; pow10_table: 10^k constants (k=0..6) in THIS float encoding
; Verified:
;   1.0      = 127 00 00 00
;   10.0     = 130 20 00 00
;   100.0    = 133 48 00 00
;   1000.0   = 136 7A 00 00
;   10000.0  = 140 1C 40 00
;   100000.0 = 143 43 50 00
;   1000000.0= 146 74 24 00
; ============================================================
.section "zone","acrx"
; Seven 4-byte floats, entry k = 10^k. fp_parse reaches entry k at
; pow10_table + 4*k and passes it to fp_div as the divisor.
; Layout per entry: EXP, S|F22..F16, F15..F8, F7..F0 (all positive).
pow10_table:
    .byte 127, 0x00, 0x00, 0x00   ; 10^0 = 1        = 1.0        * 2^0
    .byte 130, 0x20, 0x00, 0x00   ; 10^1 = 10       = 1.25       * 2^3
    .byte 133, 0x48, 0x00, 0x00   ; 10^2 = 100      = 1.5625     * 2^6
    .byte 136, 0x7A, 0x00, 0x00   ; 10^3 = 1000     = 1.953125   * 2^9
    .byte 140, 0x1C, 0x40, 0x00   ; 10^4 = 10000    = 1.2207...  * 2^13
    .byte 143, 0x43, 0x50, 0x00   ; 10^5 = 100000   = 1.5258...  * 2^16
    .byte 146, 0x74, 0x24, 0x00   ; 10^6 = 1000000  = 1.9073...  * 2^19