; zone/float.asm

; ============================================================
; Z80 Soft Float Library (4-byte) + Print + Parse (vasm syntax)
; ============================================================
; Float format in memory (big-endian, 4 bytes):
;   byte0: EXP   (8-bit biased exponent, 0 = zero)
;   byte1: S|F22..F16   (bit7 = sign, bits6..0 = top 7 fraction bits)
;   byte2: F15..F8
;   byte3: F7..F0
;
; For EXP != 0:
;   value = (-1)^S * (1.F) * 2^(EXP - FP_BIAS)
;   FP_BIAS = 127
;
; Calling convention (in-place ops):
;   HL -> A (4 bytes)
;   DE -> B (4 bytes)
;   fp_add: A = A + B  (stored back at HL)
;   fp_sub: A = A - B
;   fp_mul: A = A * B
;   fp_div: A = A / B
;
; Extra:
;   fp_print: print float at (HL) using external printChar (A=ASCII)
;   fp_parse: parse null-terminated string at (DE) into float at (HL)
;
; Limitations:
;   - No NaN/Inf/denormals
;   - Truncation (no rounding)
;   - fp_print prints fixed decimals with a lightweight fraction path
;   - fp_parse supports optional +/- and '.' up to MAX_FRAC digits, no exponent notation
; ============================================================

; Exponent bias: a stored EXP byte of FP_BIAS represents 2^0.
.equ FP_BIAS,127
; Number of digits fp_print emits after the decimal point.
.equ FRAC_DIGITS,6
; Loop bound for fractional digits in fp_parse; pow10_table below has
; entries for 10^0 .. 10^6, so this should not exceed 6.
.equ MAX_FRAC,6

; printChar is supplied by another module (see the note further down).
.global printChar
; ============================================================
; CODE
; ============================================================
.text

; ------------------------------------------------------------
; External routine you provide:
;   printChar: prints ASCII character in A
; ------------------------------------------------------------
; printChar is external, not defined here.

; ============================================================
; Public API: fp_add / fp_sub / fp_mul / fp_div
; ============================================================

; ------------------------------------------------------------
; fp_add: A = A + B
;   in:  HL -> A (4 bytes, receives the result), DE -> B (4 bytes)
;   Unpacks both operands into the A_*/B_* workspace, handles the
;   zero operands directly and otherwise continues in
;   fp_add_same_sign / fp_add_diff_sign, which pack the result.
;   HL is restored to the destination before any packing happens.
; ------------------------------------------------------------
fp_add:
    push hl
    push de
    call fp_unpackA          ; advances HL, hence the push/pop pair
    pop de
    call fp_unpackB
    pop hl

    ; A == 0  =>  result is B (also covers 0 + 0)
    ld a,(A_exp)
    or a
    jr nz,.checkB
    jp fp_pack_from_B_into_A ; tail call: its RET returns to our caller
.checkB:
    ; B == 0  =>  A in memory is already the result
    ld a,(B_exp)
    or a
    ret z

    ; equal signs -> add magnitudes, different signs -> subtract
    ld a,(A_sign)
    ld b,a
    ld a,(B_sign)
    xor b
    ; JP, not JR: the target lies beyond fp_sub/fp_mul/fp_div, which is
    ; further than JR's signed 8-bit displacement can reach.
    jp z,fp_add_same_sign
    jp fp_add_diff_sign


; ------------------------------------------------------------
; fp_sub: A = A - B
;   in:  HL -> A (receives the result), DE -> B
;   Implemented as A + (-B): the sign bit of B (byte1, bit 7) is
;   toggled in memory, fp_add is called, and the bit is toggled back.
;   DE is returned unchanged.
;   NOTE: B is modified temporarily, so it must be writable, and A and
;   B must not be the same 4 bytes.
; ------------------------------------------------------------
fp_sub:
    ; negate B in place
    push de
    inc de
    ld a,(de)
    xor 080h
    ld (de),a
    pop de

    ; fp_add moves DE forward while unpacking B; keep our own copy so
    ; the restore below touches the same byte that was toggled above.
    push de
    call fp_add
    pop de

    ; restore B's original sign
    push de
    inc de
    ld a,(de)
    xor 080h
    ld (de),a
    pop de
    ret


; ------------------------------------------------------------
; fp_mul: A = A * B
;   in:  HL -> A (receives the result), DE -> B
;   Either operand zero => result zero.
;   The exponent sum is computed in 8 bits; overflow/underflow of the
;   exponent is not detected (see "Limitations" in the file header).
; ------------------------------------------------------------
fp_mul:
    push hl
    push de
    call fp_unpackA
    pop de
    call fp_unpackB
    pop hl

    ; if A==0 or B==0 => 0
    ; (JP: fp_store_zero_A is out of JR range from here)
    ld a,(A_exp)
    or a
    jp z,fp_store_zero_A
    ld a,(B_exp)
    or a
    jp z,fp_store_zero_A

    ; sign = A_sign XOR B_sign
    ld a,(A_sign)
    ld b,a
    ld a,(B_sign)
    xor b
    ld (A_sign),a

    ; exponent = A_exp + B_exp - BIAS
    ld a,(A_exp)
    ld b,a
    ld a,(B_exp)
    add a,b
    sub FP_BIAS
    ld (A_exp),a

    ; The multiply helpers return partial products in HL, so the
    ; destination pointer has to be saved around them.
    push hl
    call mul24x24_schoolbook ; P5..P0 = A_mant * B_mant (48 bits)
    call norm_product_to_A   ; top 24 bits -> A mantissa, may bump A_exp
    pop hl

    ; pack back into (HL)
    jp fp_packA


; ------------------------------------------------------------
; fp_div: A = A / B
;   in:  HL -> A (receives the result), DE -> B
;   A == 0 => 0.  B == 0 => 0 as well (there is no Inf/NaN encoding,
;   so division by zero is reported as zero).
;   The exponent difference is computed in 8 bits without range checks.
; ------------------------------------------------------------
fp_div:
    push hl
    push de
    call fp_unpackA
    pop de
    call fp_unpackB
    pop hl

    ; A==0 => 0
    ; (JP: fp_store_zero_A is out of JR range from here)
    ld a,(A_exp)
    or a
    jp z,fp_store_zero_A

    ; B==0 => return 0 (simple "error" behavior)
    ld a,(B_exp)
    or a
    jp z,fp_store_zero_A

    ; sign = A_sign XOR B_sign
    ld a,(A_sign)
    ld b,a
    ld a,(B_sign)
    xor b
    ld (A_sign),a

    ; exponent = A_exp - B_exp + BIAS
    ld a,(B_exp)
    ld c,a
    ld a,(A_exp)
    sub c
    add a,FP_BIAS
    ld (A_exp),a

    ; mantissa quotient, then shift it up if its top bit is clear
    call div_mantissas_to_A
    call normalize_A_mant

    jp fp_packA


; ============================================================
; Add/Sub core (unpacked)
; ============================================================

; fp_add_same_sign: magnitude addition for operands of equal sign.
;   in:  unpacked A and B (both non-zero), HL -> destination
;   out: result packed at (HL); the sign is the common sign.
fp_add_same_sign:
    call align_exponents_A_B ; A ends up with the larger exponent
    call add24_A_plus_B      ; CF = carry out of bit 23

    jr nc,.noCarry
    ; The sum needs 25 bits: shift right one place, then put the
    ; carried-out bit back as the new top (hidden-1) bit, because
    ; shr24_A_1 always shifts a 0 into bit 23.
    call shr24_A_1
    ld a,(A_m2)
    or 080h
    ld (A_m2),a
    ld a,(A_exp)
    inc a
    ld (A_exp),a
.noCarry:
    call normalize_A_mant
    jp fp_packA


; fp_add_diff_sign: magnitude subtraction for operands of opposite sign.
;   in:  unpacked A and B (both non-zero), HL -> destination
;   out: (larger magnitude) - (smaller magnitude) packed at (HL),
;        carrying the sign of the larger operand; exact cancellation
;        stores zero.
fp_add_diff_sign:
    call compare_mag_A_B     ; CF set when |A| >= |B|
    jr c,.A_ge_B
    call swap_A_B_unpacked   ; make A the larger one (its sign comes along)
.A_ge_B:
    call align_exponents_A_B
    call sub24_A_minus_B
    call is_A_mant_zero
    ; JP: fp_store_zero_A lies beyond the unpack/pack helpers, too far
    ; for a JR displacement.
    jp z,fp_store_zero_A
    call normalize_A_mant
    jp fp_packA


; ============================================================
; Unpack / Pack helpers
; ============================================================

; fp_unpackA: split the packed float at (HL) into A_exp / A_sign /
; A_m2..A_m0.
;   A_sign becomes 0 or 1; the mantissa gets the hidden 1 in bit 7 of
;   A_m2.  An exponent byte of 0 means zero: sign and mantissa are
;   cleared and HL is left untouched.  Otherwise HL ends on byte3.
;   Clobbers A, B, flags.
fp_unpackA:
    ld a,(hl)
    ld (A_exp),a
    or a
    jr z,.is_zero

    inc hl
    ld a,(hl)
    ld b,a                   ; B = S|F22..F16
    rlca                     ; sign bit -> bit 0
    and 001h
    ld (A_sign),a

    ld a,b
    or 080h                  ; replaces the sign bit with the hidden 1
    ld (A_m2),a
    inc hl
    ld a,(hl)
    ld (A_m1),a
    inc hl
    ld a,(hl)
    ld (A_m0),a
    ret

.is_zero:
    ; A is 0 here (that is why we branched)
    ld (A_sign),a
    ld (A_m2),a
    ld (A_m1),a
    ld (A_m0),a
    ret


; fp_unpackB: split the packed float at (DE) into B_exp / B_sign /
; B_m2..B_m0.
;   B_sign becomes 0 or 1; the mantissa gets the hidden 1 in bit 7 of
;   B_m2.  An exponent byte of 0 means zero: sign and mantissa are
;   cleared and DE is left untouched.  Otherwise DE ends on byte3.
;   Clobbers A, B, flags.
fp_unpackB:
    ld a,(de)
    ld (B_exp),a
    or a
    jr z,.is_zero

    inc de
    ld a,(de)
    ld b,a                   ; B = S|F22..F16
    rlca                     ; sign bit -> bit 0
    and 001h
    ld (B_sign),a

    ld a,b
    or 080h                  ; replaces the sign bit with the hidden 1
    ld (B_m2),a
    inc de
    ld a,(de)
    ld (B_m1),a
    inc de
    ld a,(de)
    ld (B_m0),a
    ret

.is_zero:
    ; A is 0 here (that is why we branched)
    ld (B_sign),a
    ld (B_m2),a
    ld (B_m1),a
    ld (B_m0),a
    ret


; fp_packA: write the unpacked A (A_exp, A_sign, A_m2..A_m0) as a
; packed 4-byte float at (HL).
;   A_exp == 0 stores four zero bytes.  Otherwise byte1 is the top
;   mantissa byte with the hidden 1 removed and bit 7 set when A_sign
;   is non-zero.  HL ends on byte3.  Clobbers A, B, flags.
fp_packA:
    ld a,(A_exp)
    or a
    jr z,.zero

    ld (hl),a                ; byte0 = exponent
    inc hl

    ld a,(A_m2)
    and 07Fh                 ; drop hidden 1
    ld b,a
    ld a,(A_sign)
    or a                     ; Z = positive
    ld a,b
    jr z,.emit
    or 080h                  ; negative: set sign bit
.emit:
    ld (hl),a                ; byte1
    inc hl
    ld a,(A_m1)
    ld (hl),a                ; byte2
    inc hl
    ld a,(A_m0)
    ld (hl),a                ; byte3
    ret

.zero:
    ; A is 0 here
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    ret


; fp_pack_from_B_into_A: write the unpacked B (B_exp, B_sign,
; B_m2..B_m0) as a packed float at (HL), i.e. into operand A's memory.
;   No special case for zero: an unpacked zero B has all fields 0 and
;   therefore packs to four zero bytes anyway.
;   HL ends on byte3.  Clobbers A, B, flags.
fp_pack_from_B_into_A:
    ld a,(B_exp)
    ld (hl),a                ; byte0 = exponent
    inc hl

    ld a,(B_m2)
    and 07Fh                 ; drop hidden 1
    ld b,a
    ld a,(B_sign)
    or a                     ; Z = positive
    ld a,b
    jr z,.emit
    or 080h                  ; negative: set sign bit
.emit:
    ld (hl),a                ; byte1
    inc hl
    ld a,(B_m1)
    ld (hl),a                ; byte2
    inc hl
    ld a,(B_m0)
    ld (hl),a                ; byte3
    ret


; fp_store_zero_A: make the unpacked A zero and store it at (HL).
;   Reached by jump from the arithmetic routines; finishes through
;   fp_packA, whose RET returns to the original caller.
fp_store_zero_A:
    xor a
    ld (A_m0),a
    ld (A_m1),a
    ld (A_m2),a
    ld (A_sign),a
    ld (A_exp),a
    jp fp_packA


; ============================================================
; Exponent alignment / compare / swap
; ============================================================

; align_exponents_A_B: bring both unpacked operands to one exponent.
;   If the exponents differ, the operand with the larger exponent is
;   made A (swapping the unpacked values when necessary), B's mantissa
;   is shifted right by the difference and B_exp is set to A_exp.
;   Equal exponents: nothing changes.
;   Clobbers A, B, C, flags.
align_exponents_A_B:
    ld a,(B_exp)
    ld c,a
    ld a,(A_exp)
    cp c
    ret z                    ; already aligned
    jr nc,.shift             ; A_exp > B_exp: A and C are ready to use

    call swap_A_B_unpacked   ; B had the larger exponent
    ld a,(B_exp)
    ld c,a
    ld a,(A_exp)
.shift:
    sub c                    ; A = A_exp - B_exp
    call shr24_B_by_A
    ld a,(A_exp)
    ld (B_exp),a
    ret


; compare_mag_A_B: compare the magnitudes of unpacked A and B.
;   out: CF set   if |A| >= |B|
;        CF clear if |A| <  |B|
;   Compares exponent first, then the mantissa bytes from most to
;   least significant; the first differing byte decides.
;   Clobbers A, B, flags.
compare_mag_A_B:
    ld a,(B_exp)
    ld b,a
    ld a,(A_exp)
    cp b
    jr nz,.differ

    ld a,(B_m2)
    ld b,a
    ld a,(A_m2)
    cp b
    jr nz,.differ

    ld a,(B_m1)
    ld b,a
    ld a,(A_m1)
    cp b
    jr nz,.differ

    ld a,(B_m0)
    ld b,a
    ld a,(A_m0)
    cp b
    jr nz,.differ

    scf                      ; all fields equal: |A| == |B|
    ret
.differ:
    ; CP computed A-field minus B-field: CF=1 means A < B here.
    ; The contract is the opposite polarity, so invert it.
    ccf
    ret


; swap_A_B_unpacked: exchange the unpacked operands field by field
; (exponent, sign, three mantissa bytes).
;   Clobbers A, B.  Flags are not modified (only loads are used).
swap_A_B_unpacked:
    ; exponent
    ld a,(B_exp)
    ld b,a
    ld a,(A_exp)
    ld (B_exp),a
    ld a,b
    ld (A_exp),a
    ; sign
    ld a,(B_sign)
    ld b,a
    ld a,(A_sign)
    ld (B_sign),a
    ld a,b
    ld (A_sign),a
    ; mantissa, most significant byte first
    ld a,(B_m2)
    ld b,a
    ld a,(A_m2)
    ld (B_m2),a
    ld a,b
    ld (A_m2),a
    ld a,(B_m1)
    ld b,a
    ld a,(A_m1)
    ld (B_m1),a
    ld a,b
    ld (A_m1),a
    ld a,(B_m0)
    ld b,a
    ld a,(A_m0)
    ld (B_m0),a
    ld a,b
    ld (A_m0),a
    ret


; ============================================================
; 24-bit mantissa ops
; ============================================================

; add24_A_plus_B: A mantissa += B mantissa (24-bit).
;   out: CF = carry out of the top byte.
;   The Z80 cannot add a byte at an absolute address directly, so each
;   B byte goes through register C.  The loads in between do not touch
;   the flags, which keeps the ADD/ADC carry chain intact.
;   Clobbers A, C, flags.  HL, DE, B are preserved.
add24_A_plus_B:
    ld a,(B_m0)
    ld c,a
    ld a,(A_m0)
    add a,c
    ld (A_m0),a

    ld a,(B_m1)
    ld c,a
    ld a,(A_m1)
    adc a,c
    ld (A_m1),a

    ld a,(B_m2)
    ld c,a
    ld a,(A_m2)
    adc a,c
    ld (A_m2),a
    ret                      ; CF still holds the carry of the top byte


; sub24_A_minus_B: A mantissa -= B mantissa (24-bit).
;   out: CF = borrow out of the top byte.
;   Each B byte is fetched into register C first because SUB/SBC have
;   no absolute-address operand form; the loads leave the flags alone,
;   so the borrow chain is preserved.
;   Clobbers A, C, flags.  HL, DE, B are preserved.
sub24_A_minus_B:
    ld a,(B_m0)
    ld c,a
    ld a,(A_m0)
    sub c
    ld (A_m0),a

    ld a,(B_m1)
    ld c,a
    ld a,(A_m1)
    sbc a,c
    ld (A_m1),a

    ld a,(B_m2)
    ld c,a
    ld a,(A_m2)
    sbc a,c
    ld (A_m2),a
    ret


; is_A_mant_zero: test whether the 24-bit A mantissa is zero.
;   out: Z set if A_m2|A_m1|A_m0 == 0, A = that OR value.
;   OR has no absolute-address operand form, so the bytes are combined
;   through register C.
;   Clobbers A, C, flags.  HL, DE, B are preserved.
is_A_mant_zero:
    ld a,(A_m2)
    ld c,a
    ld a,(A_m1)
    or c
    ld c,a
    ld a,(A_m0)
    or c
    ret


; shr24_A_1: logical shift right of the 24-bit A mantissa by one bit.
;   A 0 is shifted into bit 7 of A_m2; the bit leaving A_m0 ends up in
;   CF.  The loads/stores between the shifts do not change the flags,
;   which is what carries each byte's low bit into the next byte.
;   Clobbers A, flags.
shr24_A_1:
    ld a,(A_m2)
    srl a                    ; 0 -> bit 7, bit 0 -> CF
    ld (A_m2),a
    ld a,(A_m1)
    rr  a                    ; CF -> bit 7, bit 0 -> CF
    ld (A_m1),a
    ld a,(A_m0)
    rr  a
    ld (A_m0),a
    ret


; shr24_B_by_A: logical shift right of the 24-bit B mantissa.
;   in:  A = number of bit positions (0..255)
;   A count of 24 or more clears the mantissa outright; a count of 0
;   returns immediately.  The remaining count lives in SHCNT.
;   Clobbers A, flags.
shr24_B_by_A:
    ld (SHCNT),a
    cp 24
    jr nc,.flush             ; every bit would be shifted out
    or a
    ret z                    ; nothing to do
.loop:
    ld a,(B_m2)
    srl a
    ld (B_m2),a
    ld a,(B_m1)
    rr  a
    ld (B_m1),a
    ld a,(B_m0)
    rr  a
    ld (B_m0),a
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a             ; store does not disturb Z from DEC
    jr nz,.loop
    ret
.flush:
    xor a
    ld (B_m2),a
    ld (B_m1),a
    ld (B_m0),a
    ret


; normalize_A_mant: shift the A mantissa left until bit 7 of A_m2 (the
; hidden-1 position) is set, decrementing A_exp once per shift.
;   - A zero mantissa sets A_exp = 0 (the value is zero).
;   - If the exponent runs out before the mantissa is normalized, the
;     value is flushed to zero by leaving A_exp = 0; the format has no
;     denormals, and fp_packA stores 0 whenever A_exp is 0.
;   Clobbers A, flags, and whatever is_A_mant_zero clobbers.
normalize_A_mant:
    call is_A_mant_zero
    jr nz,.test_top
    xor a
    ld (A_exp),a
    ret

.test_top:
    ld a,(A_m2)
    bit 7,a
    ret nz                   ; normalized

    ; mantissa <<= 1 (carry chain m0 -> m1 -> m2)
    ld a,(A_m0)
    add a,a
    ld (A_m0),a
    ld a,(A_m1)
    adc a,a
    ld (A_m1),a
    ld a,(A_m2)
    adc a,a
    ld (A_m2),a

    ; exponent--, stopping at 0 instead of wrapping round to 255
    ld a,(A_exp)
    or a
    ret z                    ; already 0 on entry to this step
    dec a
    ld (A_exp),a
    ret z                    ; underflow: result becomes zero
    jr .test_top


; ============================================================
; 8x8 -> 16 multiply (unsigned), shift-add
; in:  A = multiplicand, C = multiplier
; out: HL = 16-bit product
; ============================================================
; mul8u: HL = A * C (unsigned 8x8 -> 16), shift-and-add.
;   The multiplicand is widened to 16 bits in DE before it is shifted
;   left, so bits moving past bit 7 still contribute to the product.
;   Clobbers A (unchanged in value), B, C, DE, HL, flags.
mul8u:
    ld e,a
    ld d,0                   ; DE = multiplicand
    ld h,d
    ld l,d                   ; HL = 0 (running product)
    ld b,8                   ; one pass per multiplier bit
.m8:
    srl c                    ; next multiplier bit -> CF
    jr nc,.noadd
    add hl,de
.noadd:
    sla e                    ; DE <<= 1
    rl d
    djnz .m8
    ret


; ============================================================
; 24x24 schoolbook multiply into P0..P5 (P0 LSB)
; ============================================================
; mul24x24_schoolbook: P5..P0 = A mantissa * B mantissa (24x24 -> 48).
;   Nine 8x8 partial products A_mi * B_mj are formed with mul8u and
;   added into P at byte offset i+j.
;   The multiplier byte is loaded via A into C because the Z80 has no
;   "ld c,(nn)" instruction.
;   Clobbers A, B, C, DE, HL, flags (HL is the partial product
;   register; callers holding a pointer in HL must save it).
mul24x24_schoolbook:
    xor a
    ld (P0),a
    ld (P1),a
    ld (P2),a
    ld (P3),a
    ld (P4),a
    ld (P5),a

    ; A_m0 * B_m0 -> offset 0
    ld a,(B_m0)
    ld c,a
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at0

    ; A_m0 * B_m1 -> offset 1
    ld a,(B_m1)
    ld c,a
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at1

    ; A_m0 * B_m2 -> offset 2
    ld a,(B_m2)
    ld c,a
    ld a,(A_m0)
    call mul8u
    call add16_to_P_at2

    ; A_m1 * B_m0 -> offset 1
    ld a,(B_m0)
    ld c,a
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at1

    ; A_m1 * B_m1 -> offset 2
    ld a,(B_m1)
    ld c,a
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at2

    ; A_m1 * B_m2 -> offset 3
    ld a,(B_m2)
    ld c,a
    ld a,(A_m1)
    call mul8u
    call add16_to_P_at3

    ; A_m2 * B_m0 -> offset 2
    ld a,(B_m0)
    ld c,a
    ld a,(A_m2)
    call mul8u
    call add16_to_P_at2

    ; A_m2 * B_m1 -> offset 3
    ld a,(B_m1)
    ld c,a
    ld a,(A_m2)
    call mul8u
    call add16_to_P_at3

    ; A_m2 * B_m2 -> offset 4
    ld a,(B_m2)
    ld c,a
    ld a,(A_m2)
    call mul8u
    call add16_to_P_at4

    ret


; add16_to_P_at0 .. add16_to_P_at4
;   Add the 16-bit value in HL into the 48-bit accumulator P at byte
;   offset 0..4 (L goes to P[n], H to P[n+1]).  A carry out of P[n+1]
;   is rippled into the higher bytes through the P_carry_into_* chain;
;   without that, accumulated partial products silently lose carries.
;   Clobbers A, flags.
add16_to_P_at0:
    ld a,(P0)
    add a,l
    ld (P0),a
    ld a,(P1)
    adc a,h
    ld (P1),a
    ret nc
    jr P_carry_into_2
add16_to_P_at1:
    ld a,(P1)
    add a,l
    ld (P1),a
    ld a,(P2)
    adc a,h
    ld (P2),a
    ret nc
    jr P_carry_into_3
add16_to_P_at2:
    ld a,(P2)
    add a,l
    ld (P2),a
    ld a,(P3)
    adc a,h
    ld (P3),a
    ret nc
    jr P_carry_into_4
add16_to_P_at3:
    ld a,(P3)
    add a,l
    ld (P3),a
    ld a,(P4)
    adc a,h
    ld (P4),a
    ret nc
    jr P_carry_into_5
add16_to_P_at4:
    ; top two bytes: a 24x24 product fits in 48 bits, so there is
    ; nowhere (and no need) to carry further
    ld a,(P4)
    add a,l
    ld (P4),a
    ld a,(P5)
    adc a,h
    ld (P5),a
    ret

; Carry ripple: add 1 to P[n]; only when that byte wraps to 0 does the
; carry continue into the next byte (fall-through).
P_carry_into_2:
    ld a,(P2)
    inc a
    ld (P2),a
    ret nz
P_carry_into_3:
    ld a,(P3)
    inc a
    ld (P3),a
    ret nz
P_carry_into_4:
    ld a,(P4)
    inc a
    ld (P4),a
    ret nz
P_carry_into_5:
    ld a,(P5)
    inc a
    ld (P5),a
    ret


; ============================================================
; Normalize product P into A mantissa
; P is 48-bit, P0 LSB .. P5 MSB
; ============================================================
; norm_product_to_A: reduce the 48-bit product in P to a 24-bit
; mantissa in A_m2..A_m0.
;   If bit 47 (bit 7 of P5) is set, P is shifted right by 24 and A_exp
;   is incremented; otherwise P is shifted right by 23.  Afterwards
;   the low three bytes of P are copied to the A mantissa.
;   Clobbers A, flags, SHCNT.
norm_product_to_A:
    ld a,(P5)
    rla                      ; CF = bit 47 of the product
    jr nc,.top_at_46

    ld a,24
    call shr48_P_by_A
    ld a,(A_exp)
    inc a
    ld (A_exp),a
    jr .copy
.top_at_46:
    ld a,23
    call shr48_P_by_A
.copy:
    ld a,(P2)
    ld (A_m2),a
    ld a,(P1)
    ld (A_m1),a
    ld a,(P0)
    ld (A_m0),a
    ret


; shr48_P_by_A: logical shift right of the 48-bit value P5..P0.
;   in:  A = number of bit positions; 0 returns immediately.
;   One bit per pass; the remaining count is kept in SHCNT.
;   Clobbers A, flags.
shr48_P_by_A:
    ld (SHCNT),a
    or a
    ret z
.loop:
    ld a,(P5)
    srl a                    ; 0 -> bit 47
    ld (P5),a
    ld a,(P4)
    rr  a                    ; CF links each byte to the next lower one
    ld (P4),a
    ld a,(P3)
    rr  a
    ld (P3),a
    ld a,(P2)
    rr  a
    ld (P2),a
    ld a,(P1)
    rr  a
    ld (P1),a
    ld a,(P0)
    rr  a
    ld (P0),a
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a             ; store keeps Z from DEC
    jr nz,.loop
    ret


; ============================================================
; Mantissa division (restoring-style)
; A_m = (A_m << 23) / B_m
; ============================================================
; div_mantissas_to_A: A mantissa = (A mantissa << 23) / B mantissa,
; bit-by-bit restoring division producing 24 quotient bits.
;   P is used as remainder:dividend register (remainder in P5..P3).
;   The quotient is built in A_m2..A_m0; its top bit is set only when
;   A mantissa >= B mantissa, the caller normalizes afterwards.
;   Relies on shl48_P_1 returning the bit shifted out of P5 in CF and
;   on the sub/add helpers leaving register B (loop counter) alone.
;   Clobbers A, B, flags, P0..P5, SHCNT.
div_mantissas_to_A:
    ; P = A_m as 48-bit, then shift left 23
    xor a
    ld (P3),a
    ld (P4),a
    ld (P5),a
    ld a,(A_m0)
    ld (P0),a
    ld a,(A_m1)
    ld (P1),a
    ld a,(A_m2)
    ld (P2),a

    ld a,23
    call shl48_P_by_A

    ; clear quotient
    xor a
    ld (A_m2),a
    ld (A_m1),a
    ld (A_m0),a

    ld b,24
.div_loop:
    ; quotient <<= 1, done inline (this file defines no shl24_A_1)
    ld a,(A_m0)
    add a,a
    ld (A_m0),a
    ld a,(A_m1)
    adc a,a
    ld (A_m1),a
    ld a,(A_m2)
    adc a,a
    ld (A_m2),a

    call shl48_P_1           ; CF = 25th remainder bit
    jr c,.force_sub

    ; trial subtraction of the divisor from P5..P3
    call sub24_Phigh_minus_B
    jr nc,.set_bit
    call add24_Phigh_plus_B  ; borrow: divisor did not fit, restore
    jr .next

.force_sub:
    ; The remainder is >= 2^24 and therefore larger than any 24-bit
    ; divisor: the subtraction always fits.  The borrow it reports
    ; just cancels the bit that was shifted out, so it is ignored.
    call sub24_Phigh_minus_B
.set_bit:
    ld a,(A_m0)
    or 001h
    ld (A_m0),a
.next:
    djnz .div_loop
    ret


; shl48_P_by_A: shift the 48-bit value P5..P0 left.
;   in:  A = number of bit positions; 0 returns immediately.
;   Calls shl48_P_1 once per position, counting down in SHCNT.
;   Clobbers A, flags.
shl48_P_by_A:
    ld (SHCNT),a
    or a
    ret z
.loop:
    call shl48_P_1
    ld a,(SHCNT)
    dec a
    ld (SHCNT),a             ; store keeps Z from DEC
    jr nz,.loop
    ret


; shl48_P_1: shift the 48-bit value P5..P0 left by one bit.
;   "add a,a" / "adc a,a" double each byte; CF carries the top bit of
;   one byte into the bottom of the next (the loads and stores in
;   between leave the flags untouched).
;   out: CF = bit shifted out of P5 (the final store does not alter it).
;   Clobbers A, flags.
shl48_P_1:
    ld a,(P0)
    add a,a                  ; 0 enters at bit 0
    ld (P0),a
    ld a,(P1)
    adc a,a
    ld (P1),a
    ld a,(P2)
    adc a,a
    ld (P2),a
    ld a,(P3)
    adc a,a
    ld (P3),a
    ld a,(P4)
    adc a,a
    ld (P4),a
    ld a,(P5)
    adc a,a
    ld (P5),a
    ret


; sub24_Phigh_minus_B: P5..P3 -= B mantissa (24-bit).
;   out: CF set indicates borrow.
;   The divisor bytes are fetched into register C because SUB/SBC
;   cannot take an absolute address.  Register B is deliberately not
;   used: div_mantissas_to_A keeps its DJNZ counter there.
;   Clobbers A, C, flags.  B, DE, HL are preserved.
sub24_Phigh_minus_B:
    ld a,(B_m0)
    ld c,a
    ld a,(P3)
    sub c
    ld (P3),a

    ld a,(B_m1)
    ld c,a
    ld a,(P4)
    sbc a,c
    ld (P4),a

    ld a,(B_m2)
    ld c,a
    ld a,(P5)
    sbc a,c
    ld (P5),a
    ret    ; carry set indicates borrow


; add24_Phigh_plus_B: P5..P3 += B mantissa (24-bit); used to undo a
; trial subtraction that borrowed.
;   The divisor bytes are fetched into register C because ADD/ADC
;   cannot take an absolute address.  Register B is deliberately not
;   used: div_mantissas_to_A keeps its DJNZ counter there.
;   Clobbers A, C, flags.  B, DE, HL are preserved.
add24_Phigh_plus_B:
    ld a,(B_m0)
    ld c,a
    ld a,(P3)
    add a,c
    ld (P3),a

    ld a,(B_m1)
    ld c,a
    ld a,(P4)
    adc a,c
    ld (P4),a

    ld a,(B_m2)
    ld c,a
    ld a,(P5)
    adc a,c
    ld (P5),a
    ret


; ============================================================
; fp_print: fixed format printing
; Prints: [-]I.FFFFFF (FRAC_DIGITS digits)
; Uses printChar (A=char)
; ============================================================
; fp_print: print the float at (HL) as [-]I.FFFFFF.
;   in:  HL -> packed float
;   The value is split into a 32-bit integer part (PR_INT3..PR_INT0)
;   and a 32-bit binary fraction (PR_R3..PR_R0, PR_R3 most
;   significant).  The integer is printed in decimal, then FRAC_DIGITS
;   fraction digits are produced by repeatedly multiplying the
;   fraction by 10 and printing the integer overflow.  Digits are
;   truncated, not rounded.
;   Values of 2^32 or more do not fit PR_INT and print incorrectly
;   (the left shift is capped at 31 positions).
;   printChar's register usage is not visible in this file, so BC is
;   saved around every call whose loop state lives in it.
;   Clobbers A, BC, DE, HL, flags and the PR_* workspace.
fp_print:
    ld a,(hl)
    or a
    jr nz,.nonzero

    ; exponent byte 0 => zero: "0." followed by FRAC_DIGITS zeros
    ld a,'0'
    call printChar
    ld a,'.'
    call printChar
    ld b,FRAC_DIGITS
.zero_frac:
    push bc
    ld a,'0'
    call printChar
    pop bc
    djnz .zero_frac
    ret

.nonzero:
    ld c,a                   ; C = biased exponent (1..255)
    sub FP_BIAS
    ld (PR_E),a              ; unbiased exponent (wraps modulo 256)
    inc hl

    ld a,(hl)
    ld b,a                   ; B = S|F22..F16
    or 080h                  ; hidden 1 replaces the sign bit
    ld (PR_M2),a
    inc hl
    ld a,(hl)
    ld (PR_M1),a
    inc hl
    ld a,(hl)
    ld (PR_M0),a

    bit 7,b
    jr z,.magnitude
    push bc
    ld a,'-'
    call printChar
    pop bc

.magnitude:
    ; PR_INT = 24-bit mantissa M, PR_R = 0.  value = M * 2^(E-23)
    ld a,(PR_M0)
    ld (PR_INT0),a
    ld a,(PR_M1)
    ld (PR_INT1),a
    ld a,(PR_M2)
    ld (PR_INT2),a
    xor a
    ld (PR_INT3),a
    ld (PR_R0),a
    ld (PR_R1),a
    ld (PR_R2),a
    ld (PR_R3),a

    ; Shift = EXP - (FP_BIAS+23), evaluated on the biased exponent so
    ; that its sign is simply the borrow of an unsigned subtraction.
    ld a,c
    sub FP_BIAS+23
    jr c,.shift_right

    ; non-negative: integer only, shift M left (cap at 31)
    cp 32
    jr c,.do_left
    ld a,31
.do_left:
    ld b,a
    call shl32_INT_by_B
    jr .emit

.shift_right:
    ; negative: shift M right by -shift (1..149); the bits that fall
    ; off the integer form the fraction
    neg
    ld b,a
    call shr32_INT_to_INT_with_remainder

.emit:
    call print_u32_dec
    ld a,'.'
    call printChar
    ld b,FRAC_DIGITS
.frac:
    push bc
    call mul_remainder_by_10 ; A = next decimal digit (0..9)
    add a,'0'
    call printChar
    pop bc
    djnz .frac
    ret


; shr32_INT_to_INT_with_remainder: PR_INT >>= B, keeping what is
; shifted out.
;   in:  B = shift count (0 returns with the fraction cleared)
;   out: PR_INT3..0 = integer part
;        PR_R3..0   = shifted-out bits as a binary fraction (PR_R3 bit
;                     7 = 1/2); fraction bits beyond 32 are dropped.
;   Each pass is one 64-bit right shift through PR_INT3..PR_INT0 and on
;   into PR_R3..PR_R0; CF links the bytes, and the loads/stores in
;   between do not affect it.
;   Clobbers A, B, flags.
shr32_INT_to_INT_with_remainder:
    xor a
    ld (PR_R0),a
    ld (PR_R1),a
    ld (PR_R2),a
    ld (PR_R3),a
    ld a,b
    or a
    ret z
.loop:
    ld a,(PR_INT3)
    srl a
    ld (PR_INT3),a
    ld a,(PR_INT2)
    rr  a
    ld (PR_INT2),a
    ld a,(PR_INT1)
    rr  a
    ld (PR_INT1),a
    ld a,(PR_INT0)
    rr  a
    ld (PR_INT0),a
    ; CF = bit that left the integer: feed it into the fraction's top
    ld a,(PR_R3)
    rr  a
    ld (PR_R3),a
    ld a,(PR_R2)
    rr  a
    ld (PR_R2),a
    ld a,(PR_R1)
    rr  a
    ld (PR_R1),a
    ld a,(PR_R0)
    rr  a
    ld (PR_R0),a
    djnz .loop
    ret


; shl32_INT_by_B: PR_INT3..PR_INT0 <<= B (B = 0 returns immediately).
;   Bits shifted out of PR_INT3 are lost.
;   Clobbers A, B, flags.
shl32_INT_by_B:
    ld a,b
    or a
    ret z
.loop:
    ld a,(PR_INT0)
    add a,a
    ld (PR_INT0),a
    ld a,(PR_INT1)
    adc a,a
    ld (PR_INT1),a
    ld a,(PR_INT2)
    adc a,a
    ld (PR_INT2),a
    ld a,(PR_INT3)
    adc a,a
    ld (PR_INT3),a
    djnz .loop
    ret


; mul_remainder_by_10: multiply the 32-bit fraction PR_R3..PR_R0 by 10.
;   out: A = integer part of the product (0..9), i.e. the next decimal
;        digit; PR_R3..PR_R0 = remaining fraction.
;   Works byte by byte from the least significant end:
;   byte*10 + carry fits in 16 bits, low byte is stored back, high
;   byte becomes the carry into the next byte.
;   Clobbers A, C, DE, HL, flags.
mul_remainder_by_10:
    ld c,0                   ; carry between bytes
    ld a,(PR_R0)
    call .times10
    ld (PR_R0),a
    ld a,(PR_R1)
    call .times10
    ld (PR_R1),a
    ld a,(PR_R2)
    call .times10
    ld (PR_R2),a
    ld a,(PR_R3)
    call .times10
    ld (PR_R3),a
    ld a,c                   ; what overflowed the fraction = digit
    ret
; in:  A = byte, C = carry in (0..9)
; out: A = low byte of A*10+C, C = high byte
.times10:
    ld l,a
    ld h,0
    ld d,h
    ld e,l                   ; DE = byte
    add hl,hl                ; x2
    add hl,hl                ; x4
    add hl,de                ; x5
    add hl,hl                ; x10
    ld e,c                   ; DE = carry in (D is still 0)
    add hl,de
    ld a,l
    ld c,h
    ret


; print_u32_dec: print PR_INT3..PR_INT0 (unsigned 32-bit) in decimal
; through printChar, without leading zeros ("0" for zero).
;   PR_INT is destroyed (left at 0).
;   Digits come out of the repeated division least-significant first,
;   so they are parked on the stack (at most 10 of them plus one
;   sentinel word) and popped back in printing order.  No register has
;   to survive printChar or u32_div10_inplace this way.
;   Clobbers A, BC, DE, HL, flags.
print_u32_dec:
    ld a,0FFh
    push af                  ; sentinel: never a valid ASCII digit
.next_digit:
    call u32_div10_inplace   ; A = PR_INT mod 10, PR_INT = PR_INT / 10
    add a,'0'
    push af

    ; more digits while the quotient is non-zero
    ld a,(PR_INT0)
    ld c,a
    ld a,(PR_INT1)
    or c
    ld c,a
    ld a,(PR_INT2)
    or c
    ld c,a
    ld a,(PR_INT3)
    or c
    jr nz,.next_digit

.emit:
    pop af
    cp 0FFh
    ret z                    ; sentinel reached: stack is balanced again
    call printChar
    jr .emit


; u32_div10_inplace: PR_INT3..PR_INT0 = PR_INT / 10.
;   out: A = remainder (0..9)
;   Long division one byte at a time, most significant byte first.
;   Each byte is addressed by its own label, so the routine does not
;   depend on how the PR_INT bytes are laid out in memory.
;   Clobbers A, BC, DE, HL, flags.
u32_div10_inplace:
    ld b,0                   ; running remainder
    ld hl,PR_INT3
    call .step
    ld hl,PR_INT2
    call .step
    ld hl,PR_INT1
    call .step
    ld hl,PR_INT0
    call .step
    ld a,b
    ret

; in:  HL -> byte, B = remainder from the byte above (0..9)
; out: (HL) = quotient byte, B = new remainder
.step:
    ; DE = remainder*256 + byte  (at most 9*256+255, so the quotient
    ; always fits in one byte)
    ld d,b
    ld e,(hl)
    ld c,0                   ; quotient byte
.div:
    ld a,d
    or a
    jr nz,.sub               ; DE >= 256: certainly >= 10
    ld a,e
    cp 10
    jr c,.done               ; DE < 10: finished
.sub:
    ld a,e
    sub 10
    ld e,a
    ld a,d
    sbc a,0
    ld d,a
    inc c
    jr .div
.done:
    ld (hl),c
    ld b,e
    ret


; ============================================================
; fp_parse: parse decimal string -> float
; DE -> "[-]ddd[.ddd]\0"
; HL -> output float
; ============================================================
; fp_parse: convert the decimal string at (DE) into a float at (HL).
;   in:  DE -> "[+|-]ddd[.ddd]" ; parsing stops at the first character
;              that does not fit (normally the terminating 0)
;        HL -> 4-byte destination
;   All digits are accumulated into one unsigned 32-bit integer P_S
;   (the number scaled by 10^k, k = fractional digits used, at most
;   MAX_FRAC); further fractional digits are ignored.  P_S is
;   converted to a float and divided by 10^k from pow10_table.
;   The accumulator is not checked for overflow beyond 32 bits.
;   A parsed value of zero is stored as plain zero even after '-'.
;   Clobbers A, BC, DE, HL, flags.
fp_parse:
    xor a
    ld (P_SIGN),a
    ld (P_FRACN),a
    ld (P_S0),a
    ld (P_S1),a
    ld (P_S2),a
    ld (P_S3),a

    ; optional sign
    ld a,(de)
    cp '-'
    jr nz,.chkplus
    ld a,1
    ld (P_SIGN),a
    inc de
    jr .intpart
.chkplus:
    cp '+'
    jr nz,.intpart
    inc de

.intpart:
    ld a,(de)
    call is_digit            ; CF = digit, A preserved
    jr nc,.maybe_dot
    sub '0'
    ld c,a
    call u32_mul10_scaled    ; P_S *= 10
    call u32_add8_scaled     ; P_S += C
    inc de
    jr .intpart

.maybe_dot:
    ld a,(de)
    cp '.'
    jr nz,.finish_scaled
    inc de

.fl:
    ; The digit budget is tracked in P_FRACN rather than in a DJNZ
    ; counter, because u32_mul10_scaled uses register B itself.
    ld a,(P_FRACN)
    cp MAX_FRAC
    jr nc,.finish_scaled
    ld a,(de)
    call is_digit
    jr nc,.finish_scaled
    sub '0'
    ld c,a
    call u32_mul10_scaled
    call u32_add8_scaled
    ld a,(P_FRACN)
    inc a
    ld (P_FRACN),a
    inc de
    jr .fl

.finish_scaled:
    ; convert scaled u32 to float at (HL); the conversion moves HL
    ; along the output bytes, so keep our own copy
    push hl
    call fp_from_u32_scaled_to_A
    pop hl

    ; divide by 10^k if needed
    ld a,(P_FRACN)
    or a
    jr z,.apply_sign

    ; DE = &pow10_table[k]  (4 bytes per entry)
    push hl
    ld e,a
    ld d,0
    ld hl,pow10_table
    add hl,de
    add hl,de
    add hl,de
    add hl,de
    ex de,hl
    pop hl
    push hl                  ; fp_div also advances HL while packing
    call fp_div
    pop hl

.apply_sign:
    ld a,(P_SIGN)
    or a
    ret z
    ld a,(hl)
    or a
    ret z                    ; zero stays 00 00 00 00
    inc hl
    ld a,(hl)
    xor 080h
    ld (hl),a
    ret


; is_digit: classify the character in A.
;   out: CF set   if A is '0'..'9'
;        CF clear otherwise
;   A is preserved; only CF is a meaningful result.
is_digit:
    cp '0'
    jr c,.below              ; A < '0'
    cp '9'+1                 ; CF = 1 exactly when A <= '9'
    ret
.below:
    or a                     ; clear CF
    ret


; u32_mul10_scaled: P_S3..P_S0 = P_S * 10, computed as 2*P_S + 8*P_S.
;   PR_INT3..0 (2x) and PR_R3..0 (8x) serve as scratch.
;   The result is truncated to 32 bits.
;   ADD/ADC cannot read an absolute address, so the 8x bytes pass
;   through register B for the final sum.
;   Clobbers A, B, flags only; fp_parse keeps its digit in C and its
;   pointers in DE/HL across this call.
u32_mul10_scaled:
    ; PR_INT = P_S * 2
    ld a,(P_S0)
    ld (PR_INT0),a
    ld a,(P_S1)
    ld (PR_INT1),a
    ld a,(P_S2)
    ld (PR_INT2),a
    ld a,(P_S3)
    ld (PR_INT3),a
    ld b,1
    call shl32_INT_by_B

    ; PR_R = P_S * 8
    ld a,(P_S0)
    ld (PR_R0),a
    ld a,(P_S1)
    ld (PR_R1),a
    ld a,(P_S2)
    ld (PR_R2),a
    ld a,(P_S3)
    ld (PR_R3),a
    ld b,3
    call shl32_R_by_B

    ; P_S = PR_INT + PR_R (loads do not disturb the carry chain)
    ld a,(PR_R0)
    ld b,a
    ld a,(PR_INT0)
    add a,b
    ld (P_S0),a
    ld a,(PR_R1)
    ld b,a
    ld a,(PR_INT1)
    adc a,b
    ld (P_S1),a
    ld a,(PR_R2)
    ld b,a
    ld a,(PR_INT2)
    adc a,b
    ld (P_S2),a
    ld a,(PR_R3)
    ld b,a
    ld a,(PR_INT3)
    adc a,b
    ld (P_S3),a
    ret


; shl32_R_by_B: PR_R3..PR_R0 <<= B.
;   in:  B = shift count; 0 returns immediately (DJNZ would otherwise
;        run 256 times).
;   Each pass doubles the bytes from PR_R0 upward, CF carrying the top
;   bit of one byte into the next; bits leaving PR_R3 are lost.
;   Clobbers A, B, flags.
shl32_R_by_B:
    ld a,b
    or a
    ret z
.loop:
    ld a,(PR_R0)
    add a,a                  ; 0 enters at bit 0
    ld (PR_R0),a
    ld a,(PR_R1)
    adc a,a
    ld (PR_R1),a
    ld a,(PR_R2)
    adc a,a
    ld (PR_R2),a
    ld a,(PR_R3)
    adc a,a
    ld (PR_R3),a
    djnz .loop
    ret


; u32_add8_scaled: P_S3..P_S0 += C (fp_parse passes a digit 0..9).
;   The only output is P_S; the sum is truncated to 32 bits.
;   Returns as soon as no carry is left to propagate (adding 0 to the
;   remaining bytes would not change them).
;   Clobbers A, flags.  C is preserved.
u32_add8_scaled:
    ld a,(P_S0)
    add a,c
    ld (P_S0),a
    ret nc
    ld a,(P_S1)
    adc a,0
    ld (P_S1),a
    ret nc
    ld a,(P_S2)
    adc a,0
    ld (P_S2),a
    ret nc
    ld a,(P_S3)
    adc a,0
    ld (P_S3),a
    ret


; fp_from_u32_scaled_to_A: convert the unsigned 32-bit integer
; P_S3..P_S0 to a packed float at (HL).  The result is always
; non-negative; the caller applies the sign.
;   in:  HL -> 4-byte destination
;   out: float stored; HL ends on byte3 of the destination
;   Method: find the index of the highest set bit (0..31), use it as
;   the unbiased exponent, shift the value left until that bit sits in
;   bit 31 and take the top 24 bits as mantissa.  For values of 2^24
;   or more the low bits are truncated.
;   Clobbers A, B, C, flags, PR_INT3..PR_INT0.
fp_from_u32_scaled_to_A:
    ; locate the most significant non-zero byte;
    ; B = bit index of that byte's bit 7
    ld b,31
    ld a,(P_S3)
    or a
    jr nz,.scan
    ld b,23
    ld a,(P_S2)
    or a
    jr nz,.scan
    ld b,15
    ld a,(P_S1)
    or a
    jr nz,.scan
    ld b,7
    ld a,(P_S0)
    or a
    jr nz,.scan

    ; every byte is zero (A = 0): store the zero float
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    inc hl
    ld (hl),a
    ret

.scan:
    ld c,a                   ; C = that byte, known to be non-zero
.find:
    bit 7,c
    jr nz,.found
    sla c                    ; next lower bit into bit 7
    dec b
    jr .find
.found:
    ; B = index of the highest set bit.  EXP = FP_BIAS + B
    ld a,b
    add a,FP_BIAS
    ld (hl),a
    inc hl

    ; left-justify: shift by 31-B so the leading 1 lands in bit 31
    ld a,31
    sub b
    ld b,a

    ; PR_INT = P_S
    ld a,(P_S0)
    ld (PR_INT0),a
    ld a,(P_S1)
    ld (PR_INT1),a
    ld a,(P_S2)
    ld (PR_INT2),a
    ld a,(P_S3)
    ld (PR_INT3),a
    call shl32_INT_by_B

    ; sign = 0, fraction = the 23 bits below the leading 1
    ld a,(PR_INT3)
    and 07Fh                 ; drop the hidden 1 (bit 7 doubles as sign)
    ld (hl),a
    inc hl
    ld a,(PR_INT2)
    ld (hl),a
    inc hl
    ld a,(PR_INT1)
    ld (hl),a
    ret

.data
; ============================================================
; pow10_table: 10^k constants (k=0..6) in THIS float encoding
; Verified:
;   1.0      = 127 00 00 00
;   10.0     = 130 20 00 00
;   100.0    = 133 48 00 00
;   1000.0   = 136 7A 00 00
;   10000.0  = 140 1C 40 00
;   100000.0 = 143 43 50 00
;   1000000.0= 146 74 24 00
; ============================================================
; Entry k (4 bytes each) is 10^k as EXP, S|F22..F16, F15..F8, F7..F0.
; The first byte is written in decimal, the fraction bytes in hex.
; fp_parse indexes this table as pow10_table + 4*k.
pow10_table:
    .byte 127, 0x00, 0x00, 0x00   ; 10^0 = 1
    .byte 130, 0x20, 0x00, 0x00   ; 10^1 = 10
    .byte 133, 0x48, 0x00, 0x00   ; 10^2 = 100
    .byte 136, 0x7A, 0x00, 0x00   ; 10^3 = 1000
    .byte 140, 0x1C, 0x40, 0x00   ; 10^4 = 10000
    .byte 143, 0x43, 0x50, 0x00   ; 10^5 = 100000
    .byte 146, 0x74, 0x24, 0x00   ; 10^6 = 1000000


; ============================================================
; BSS / WORKSPACE
; ============================================================
.bss

; Unpacked A: exponent, sign (0/1), 24-bit mantissa (m2 = MSB,
; hidden 1 in bit 7 of m2)
.comm A_exp,1
.comm A_sign,1
.comm A_m2,1
.comm A_m1,1
.comm A_m0,1

; Unpacked B, same layout
.comm B_exp,1
.comm B_sign,1
.comm B_m2,1
.comm B_m1,1
.comm B_m0,1

; 48-bit workspace (P0 LSB .. P5 MSB)
.comm P0,1
.comm P1,1
.comm P2,1
.comm P3,1
.comm P4,1
.comm P5,1

; Remaining-shift counter of the *_by_A shift routines
.comm SHCNT,1

; Print temps
.comm PR_SI,1
; PR_SIGN is the name the printing code refers to; PR_SI is kept in
; case anything else links against it.
.comm PR_SIGN,1
.comm PR_E,1
.comm PR_M2,1
.comm PR_M1,1
.comm PR_M0,1
.comm PR_INT0,1
.comm PR_INT1,1
.comm PR_INT2,1
.comm PR_INT3,1
.comm PR_R0,1
.comm PR_R1,1
.comm PR_R2,1
.comm PR_R3,1

; Parse temps
.comm P_SIGN,1
.comm P_FRACN,1
.comm P_S0,1
.comm P_S1,1
.comm P_S2,1
.comm P_S3,1

; Digit buffer: a 32-bit value has up to 10 decimal digits
.comm DIGBUF,10
.comm DIGLEN,1