From 1353091ddad5170ae74b34d01f71150366ce4310 Mon Sep 17 00:00:00 2001
From: "A.M. Rowsell" <amr@frzn.dev>
Date: Sat, 20 Dec 2025 14:40:01 -0500
Subject: [PATCH] init: creation of initial repo for this project

---
 float.asm | 1545 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1545 insertions(+)
 create mode 100644 float.asm

diff --git a/float.asm b/float.asm
new file mode 100644
index 0000000..67d71a4
--- /dev/null
+++ b/float.asm
@@ -0,0 +1,1545 @@
+; ============================================================
+; Z80 Soft Float Library (4-byte) + Print + Parse (vasm syntax)
+; ============================================================
+; Float format in memory (big-endian, 4 bytes):
+;   byte0: EXP   (8-bit biased exponent, 0 = zero)
+;   byte1: S|F22..F16   (bit7 = sign, bits6..0 = top 7 fraction bits)
+;   byte2: F15..F8
+;   byte3: F7..F0
+;
+; For EXP != 0:
+;   value = (-1)^S * (1.F) * 2^(EXP - FP_BIAS)
+;   FP_BIAS = 127
+;
+; Calling convention (in-place ops):
+;   HL -> A (4 bytes)
+;   DE -> B (4 bytes)
+;   fp_add: A = A + B  (stored back at HL)
+;   fp_sub: A = A - B
+;   fp_mul: A = A * B
+;   fp_div: A = A / B
+;
+; Extra:
+;   fp_print: print float at (HL) using external printChar (A=ASCII)
+;   fp_parse: parse null-terminated string at (DE) into float at (HL)
+;
+; Limitations:
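+;   - No NaN/Inf/denormals; exponent arithmetic in fp_mul/fp_div is 8-bit,
+;     so results outside the representable range wrap instead of saturating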
+;   - Truncation (no rounding)
+;   - fp_print prints fixed decimals with a lightweight fraction path
+;   - fp_parse supports optional +/- and '.' up to MAX_FRAC digits, no exponent notation
+; ============================================================
+
+; Exponent bias: value = 1.F * 2^(EXP - FP_BIAS)
+.equ FP_BIAS,127
+; Number of digits fp_print emits after the decimal point
+.equ FRAC_DIGITS,6
+; Maximum number of fractional digits fp_parse loads into its digit loop
+.equ MAX_FRAC,6
+
+; Character output routine (A = ASCII); not defined in this file
+.global printChar
+; ============================================================
+; CODE
+; ============================================================
+.text
+
+; ------------------------------------------------------------
+; External routine you provide:
+;   printChar: prints ASCII character in A
+; ------------------------------------------------------------
+; printChar is external, not defined here.
+
+; ============================================================
+; Public API: fp_add / fp_sub / fp_mul / fp_div
+; ============================================================
+
+; ------------------------------------------------------------
+; fp_add: A = A + B
+;   in:  HL -> A (4 bytes, receives the sum)
+;        DE -> B (4 bytes)
+;   Zero operands are short-circuited here; otherwise control
+;   passes to fp_add_same_sign or fp_add_diff_sign, which pack
+;   the result through HL.
+;   fp_add_same_sign sits more than 127 bytes ahead (fp_sub,
+;   fp_mul and fp_div lie in between), which is outside the reach
+;   of JR, so absolute JP is used for both dispatch jumps.
+; ------------------------------------------------------------
+fp_add:
+    push hl
+    push de
+    call fp_unpackA         ; advances HL, hence the save above
+    pop de
+    call fp_unpackB         ; advances DE
+    pop hl                  ; HL -> A again
+
+    ; zero short-cuts
+    ld a,(A_exp)
+    or a
+    jr nz,.checkB
+    ; A == 0 => result = B
+    jp fp_pack_from_B_into_A
+.checkB:
+    ld a,(B_exp)
+    or a
+    ret z                   ; B == 0 => A in memory is already the answer
+
+    ; equal signs -> add magnitudes, different signs -> subtract them
+    ld a,(A_sign)
+    ld b,a
+    ld a,(B_sign)
+    xor b
+    jp z,fp_add_same_sign
+    jp fp_add_diff_sign
+
+
+; ------------------------------------------------------------
+; fp_sub: A = A - B
+;   in:  HL -> A (receives the difference)
+;        DE -> B
+;   out: DE -> B, with B's bytes as they were on entry
+;   Computed as A + (-B): B's sign bit is toggled in memory,
+;   fp_add runs, and the bit is toggled back.
+;   DE is saved across fp_add because fp_unpackB advances it;
+;   toggling through the advanced pointer would modify a byte
+;   past B and leave B negated.
+;   NOTE: if DE == HL the toggle also negates A, so this routine
+;   does not compute A - A correctly for aliased operands.
+; ------------------------------------------------------------
+fp_sub:
+    inc de                  ; DE -> B byte1 (sign | top fraction bits)
+    ld a,(de)
+    xor 080h
+    ld (de),a
+    dec de                  ; DE -> B again
+
+    push de
+    call fp_add
+    pop de
+
+    inc de                  ; undo the temporary negation of B
+    ld a,(de)
+    xor 080h
+    ld (de),a
+    dec de
+    ret
+
+
+; ------------------------------------------------------------
+; fp_mul: A = A * B
+;   in:  HL -> A (receives the product)
+;        DE -> B
+;   A zero operand yields zero.
+;   The exponent is computed with 8-bit arithmetic
+;   (A_exp + B_exp - FP_BIAS), so out-of-range results wrap.
+;   fp_store_zero_A is far beyond JR range, so JP is used.
+;   HL is saved around the mantissa multiply because mul8u
+;   returns its product in HL.
+; ------------------------------------------------------------
+fp_mul:
+    push hl
+    push de
+    call fp_unpackA
+    pop de
+    call fp_unpackB
+    pop hl
+
+    ; if A==0 or B==0 => 0
+    ld a,(A_exp)
+    or a
+    jp z,fp_store_zero_A
+    ld a,(B_exp)
+    or a
+    jp z,fp_store_zero_A
+
+    ; sign = A_sign XOR B_sign
+    ld a,(A_sign)
+    ld b,a
+    ld a,(B_sign)
+    xor b
+    ld (A_sign),a
+
+    ; exponent = A_exp + B_exp - BIAS (8-bit, wraps)
+    ld a,(A_exp)
+    ld b,a
+    ld a,(B_exp)
+    add a,b
+    sub FP_BIAS
+    ld (A_exp),a
+
+    push hl                 ; destination pointer must survive the multiply
+    ; product = A_mant * B_mant (24x24 => 48 bits in P0..P5)
+    call mul24x24_schoolbook
+    ; normalize product into A mantissa (may increment A_exp)
+    call norm_product_to_A
+    pop hl
+
+    ; pack back into (HL)
+    jp fp_packA
+
+
+; ------------------------------------------------------------
+; fp_div: A = A / B
+;   in:  HL -> A (receives the quotient)
+;        DE -> B
+;   A == 0 yields 0. B == 0 also yields 0 (the library has no
+;   Inf/NaN, so this is the "error" result).
+;   The exponent is computed with 8-bit arithmetic
+;   (A_exp - B_exp + FP_BIAS), so out-of-range results wrap.
+;   fp_store_zero_A is far beyond JR range, so JP is used.
+; ------------------------------------------------------------
+fp_div:
+    push hl
+    push de
+    call fp_unpackA
+    pop de
+    call fp_unpackB
+    pop hl
+
+    ; A==0 => 0
+    ld a,(A_exp)
+    or a
+    jp z,fp_store_zero_A
+
+    ; B==0 => return 0 (simple "error" behavior)
+    ld a,(B_exp)
+    or a
+    jp z,fp_store_zero_A
+
+    ; sign = A_sign XOR B_sign
+    ld a,(A_sign)
+    ld b,a
+    ld a,(B_sign)
+    xor b
+    ld (A_sign),a
+
+    ; exponent = A_exp - B_exp + BIAS (8-bit, wraps)
+    ld a,(B_exp)
+    ld b,a
+    ld a,(A_exp)
+    sub b
+    add a,FP_BIAS
+    ld (A_exp),a
+
+    ; mantissa division, then renormalize (quotient may lack bit 23)
+    call div_mantissas_to_A
+    call normalize_A_mant
+
+    jp fp_packA
+
+
+; ============================================================
+; Add/Sub core (unpacked)
+; ============================================================
+
+; Magnitude addition for operands of equal sign (unpacked A, B).
+;   in:  HL -> destination for the packed result
+; When the 24-bit add carries out, the true sum has 25 bits: the
+; mantissa is shifted right once, the exponent is incremented and
+; the carried-out bit is re-inserted as bit 23. shr24_A_1 shifts a
+; zero into the top, so without the re-insert 1.0 + 1.0 would
+; collapse to a zero mantissa.
+fp_add_same_sign:
+    call align_exponents_A_B
+    call add24_A_plus_B     ; CF = carry out of bit 23
+
+    jr nc,.noCarry
+    call shr24_A_1
+    ld a,(A_m2)
+    or 080h                 ; restore the bit that was carried out
+    ld (A_m2),a
+    ld a,(A_exp)
+    inc a
+    ld (A_exp),a
+.noCarry:
+    call normalize_A_mant
+    jp fp_packA
+
+
+; Magnitude subtraction for operands of opposite sign (unpacked A, B).
+;   in:  HL -> destination for the packed result
+; The larger magnitude is moved into A (swapping also moves its
+; sign), the smaller is subtracted, and the result keeps A's sign.
+; An exactly cancelling result is stored as zero.
+; fp_store_zero_A is too far ahead for JR, so JP is used.
+fp_add_diff_sign:
+    call compare_mag_A_B    ; CF set when |A| >= |B|
+    jr c,.A_ge_B
+    ; |B| > |A| => swap
+    call swap_A_B_unpacked
+.A_ge_B:
+    call align_exponents_A_B
+    call sub24_A_minus_B
+    call is_A_mant_zero
+    jp z,fp_store_zero_A
+    call normalize_A_mant
+    jp fp_packA
+
+
+; ============================================================
+; Unpack / Pack helpers
+; ============================================================
+
+; fp_unpackA: expand the packed float at (HL) into A_exp, A_sign
+; (0 or 1) and A_m2..A_m0 (mantissa with the hidden 1 in bit 7 of
+; A_m2). EXP == 0 unpacks as all-zero sign and mantissa.
+; Clobbers A and B; HL is advanced by 3 for a non-zero value.
+fp_unpackA:
+    ld a,(hl)
+    ld (A_exp),a
+    or a
+    jr z,.clear
+
+    inc hl
+    ld a,(hl)
+    ld b,a                  ; B = S|F22..F16
+    rlca                    ; move the sign bit down to bit 0
+    and 001h
+    ld (A_sign),a
+
+    ld a,b
+    or 080h                 ; hidden 1 replaces the sign bit
+    ld (A_m2),a
+    inc hl
+    ld a,(hl)
+    ld (A_m1),a
+    inc hl
+    ld a,(hl)
+    ld (A_m0),a
+    ret
+.clear:
+    xor a
+    ld (A_m0),a
+    ld (A_m1),a
+    ld (A_m2),a
+    ld (A_sign),a
+    ret
+
+
+; fp_unpackB: expand the packed float at (DE) into B_exp, B_sign
+; (0 or 1) and B_m2..B_m0 (mantissa with the hidden 1 in bit 7 of
+; B_m2). EXP == 0 unpacks as all-zero sign and mantissa.
+; Clobbers A and B; DE is advanced by 3 for a non-zero value.
+fp_unpackB:
+    ld a,(de)
+    ld (B_exp),a
+    or a
+    jr z,.clear
+
+    inc de
+    ld a,(de)
+    ld b,a                  ; B = S|F22..F16
+    rlca                    ; move the sign bit down to bit 0
+    and 001h
+    ld (B_sign),a
+
+    ld a,b
+    or 080h                 ; hidden 1 replaces the sign bit
+    ld (B_m2),a
+    inc de
+    ld a,(de)
+    ld (B_m1),a
+    inc de
+    ld a,(de)
+    ld (B_m0),a
+    ret
+.clear:
+    xor a
+    ld (B_m0),a
+    ld (B_m1),a
+    ld (B_m2),a
+    ld (B_sign),a
+    ret
+
+
+; fp_packA: write unpacked A to the 4 bytes at (HL).
+; A_exp == 0 stores four zero bytes. Otherwise byte1 is the
+; mantissa top byte with the hidden 1 removed and bit 7 set when
+; A_sign is non-zero. HL is left pointing at byte3.
+; Clobbers A (and B for a non-zero value).
+fp_packA:
+    ld a,(A_exp)
+    ld (hl),a               ; byte0 = EXP (0 for a zero value)
+    inc hl
+    or a
+    jr z,.zero_tail
+
+    ld a,(A_m2)
+    and 07Fh                ; drop the hidden 1
+    ld b,a
+    ld a,(A_sign)
+    or a
+    jr z,.store1
+    set 7,b                 ; negative: sign bit on
+.store1:
+    ld (hl),b
+    inc hl
+    ld a,(A_m1)
+    ld (hl),a
+    inc hl
+    ld a,(A_m0)
+    ld (hl),a
+    ret
+
+.zero_tail:
+    ; A is 0 here; clear the remaining three bytes
+    ld (hl),a
+    inc hl
+    ld (hl),a
+    inc hl
+    ld (hl),a
+    ret
+
+
+; fp_pack_from_B_into_A: write unpacked B to the 4 bytes at (HL),
+; i.e. into A's storage. No zero special case: B_exp is stored as
+; is. HL is left pointing at byte3. Clobbers A and B.
+fp_pack_from_B_into_A:
+    ld a,(B_exp)
+    ld (hl),a
+    inc hl
+
+    ld a,(B_m2)
+    and 07Fh                ; drop the hidden 1
+    ld b,a
+    ld a,(B_sign)
+    or a
+    jr z,.store1
+    set 7,b                 ; negative: sign bit on
+.store1:
+    ld (hl),b
+    inc hl
+    ld a,(B_m1)
+    ld (hl),a
+    inc hl
+    ld a,(B_m0)
+    ld (hl),a
+    ret
+
+
+; fp_store_zero_A: clear unpacked A and tail-jump to fp_packA so
+; the float at (HL) becomes zero.
+fp_store_zero_A:
+    xor a
+    ld (A_m0),a
+    ld (A_m1),a
+    ld (A_m2),a
+    ld (A_sign),a
+    ld (A_exp),a
+    jp fp_packA
+
+
+; ============================================================
+; Exponent alignment / compare / swap
+; ============================================================
+
+; align_exponents_A_B: bring both unpacked operands to a common
+; exponent. If B has the larger exponent the operands are swapped
+; first, so on exit A_exp >= the original exponents; B's mantissa
+; is shifted right by the difference and B_exp is set to A_exp.
+align_exponents_A_B:
+    ld a,(B_exp)
+    ld c,a
+    ld a,(A_exp)
+    sub c                   ; A = A_exp - B_exp
+    ret z                   ; already aligned
+    jr nc,.shift_B          ; A_exp > B_exp
+
+    call swap_A_B_unpacked  ; make A the larger exponent
+    ld a,(B_exp)
+    ld c,a
+    ld a,(A_exp)
+    sub c                   ; difference again, now positive
+.shift_B:
+    call shr24_B_by_A
+    ld a,(A_exp)
+    ld (B_exp),a
+    ret
+
+
+; compare_mag_A_B: compare the magnitudes of unpacked A and B.
+;   out: CF set   if |A| >= |B|
+;        CF clear if |A| <  |B|
+; Fields are compared most-significant first (exponent, then
+; m2, m1, m0). Each CP computes A_field - B_field, so CF after CP
+; means "A is smaller"; the final CCF inverts that. This also
+; covers the low-byte case B_m0 > A_m0, which must report
+; "A is smaller".
+; Clobbers A and B.
+compare_mag_A_B:
+    ld a,(B_exp)
+    ld b,a
+    ld a,(A_exp)
+    cp b
+    jr nz,.decided
+
+    ld a,(B_m2)
+    ld b,a
+    ld a,(A_m2)
+    cp b
+    jr nz,.decided
+
+    ld a,(B_m1)
+    ld b,a
+    ld a,(A_m1)
+    cp b
+    jr nz,.decided
+
+    ld a,(B_m0)
+    ld b,a
+    ld a,(A_m0)
+    cp b                    ; equal => CF clear => reported as A >= B
+.decided:
+    ccf                     ; CF: (A < B)  ->  (A >= B)
+    ret
+
+
+; swap_A_B_unpacked: exchange every unpacked field of A and B
+; (exp, sign, m2, m1, m0). Clobbers A and B.
+swap_A_B_unpacked:
+    ; exponent
+    ld a,(B_exp)
+    ld b,a
+    ld a,(A_exp)
+    ld (B_exp),a
+    ld a,b
+    ld (A_exp),a
+    ; sign
+    ld a,(B_sign)
+    ld b,a
+    ld a,(A_sign)
+    ld (B_sign),a
+    ld a,b
+    ld (A_sign),a
+    ; mantissa, most significant byte first
+    ld a,(B_m2)
+    ld b,a
+    ld a,(A_m2)
+    ld (B_m2),a
+    ld a,b
+    ld (A_m2),a
+    ld a,(B_m1)
+    ld b,a
+    ld a,(A_m1)
+    ld (B_m1),a
+    ld a,b
+    ld (A_m1),a
+    ld a,(B_m0)
+    ld b,a
+    ld a,(A_m0)
+    ld (B_m0),a
+    ld a,b
+    ld (A_m0),a
+    ret
+
+
+; ============================================================
+; 24-bit mantissa ops
+; ============================================================
+
+; add24_A_plus_B: A_m = A_m + B_m (24-bit).
+;   out: CF = carry out of bit 23
+; The Z80 has no "add a,(nn)" form, so each B byte is reached
+; through HL. LD and POP leave the flags alone, which keeps the
+; carry chain intact. Clobbers A only.
+add24_A_plus_B:
+    push hl
+    ld hl,B_m0
+    ld a,(A_m0)
+    add a,(hl)
+    ld (A_m0),a
+    ld hl,B_m1
+    ld a,(A_m1)
+    adc a,(hl)
+    ld (A_m1),a
+    ld hl,B_m2
+    ld a,(A_m2)
+    adc a,(hl)
+    ld (A_m2),a
+    pop hl
+    ret  ; carry meaningful
+
+
+; sub24_A_minus_B: A_m = A_m - B_m (24-bit).
+;   out: CF = borrow out of bit 23
+; The Z80 has no "sub (nn)" form, so each B byte is reached
+; through HL. LD and POP leave the flags alone, which keeps the
+; borrow chain intact. Clobbers A only.
+sub24_A_minus_B:
+    push hl
+    ld hl,B_m0
+    ld a,(A_m0)
+    sub (hl)
+    ld (A_m0),a
+    ld hl,B_m1
+    ld a,(A_m1)
+    sbc a,(hl)
+    ld (A_m1),a
+    ld hl,B_m2
+    ld a,(A_m2)
+    sbc a,(hl)
+    ld (A_m2),a
+    pop hl
+    ret
+
+
+; is_A_mant_zero: Z set when A_m2 | A_m1 | A_m0 == 0.
+; The Z80 has no "or (nn)" form, so the bytes are ORed through
+; HL. POP does not change flags, so Z survives. Clobbers A only.
+is_A_mant_zero:
+    push hl
+    ld a,(A_m2)
+    ld hl,A_m1
+    or (hl)
+    ld hl,A_m0
+    or (hl)
+    pop hl
+    ret
+
+
+; shr24_A_1: logical right shift of A_m2:A_m1:A_m0 by one bit.
+; A zero enters bit 23; CF receives the bit shifted out of A_m0.
+; A returns the new A_m0.
+shr24_A_1:
+    push hl
+    ld hl,A_m2
+    srl (hl)
+    ld hl,A_m1
+    rr (hl)
+    ld hl,A_m0
+    rr (hl)
+    ld a,(hl)               ; LD leaves CF untouched
+    pop hl
+    ret
+
+
+; shr24_B_by_A: logical right shift of B_m2:B_m1:B_m0 by A bits.
+;   in:  A = shift count (0..255)
+; A count of 24 or more clears the mantissa outright; a count of
+; 0 leaves it unchanged. SHCNT is used as the loop counter.
+shr24_B_by_A:
+    ld (SHCNT),a
+    cp 24
+    jr nc,.flush            ; everything would be shifted out
+    or a
+    ret z
+.step:
+    push hl
+    ld hl,B_m2
+    srl (hl)
+    ld hl,B_m1
+    rr (hl)
+    ld hl,B_m0
+    rr (hl)
+    ld hl,SHCNT
+    dec (hl)                ; sets Z on the last step
+    ld a,(hl)
+    pop hl
+    jr nz,.step
+    ret
+.flush:
+    xor a
+    ld (B_m2),a
+    ld (B_m1),a
+    ld (B_m0),a
+    ret
+
+
+; normalize_A_mant: shift A's mantissa left until bit 23 is set,
+; decrementing A_exp once per shift. A zero mantissa sets
+; A_exp = 0 instead. A_exp is decremented without a range check.
+normalize_A_mant:
+    call is_A_mant_zero
+    jr z,.flush
+.check:
+    ld a,(A_m2)
+    bit 7,a
+    ret nz                  ; hidden-1 position occupied: done
+    push hl
+    ld hl,A_m0
+    sla (hl)
+    ld hl,A_m1
+    rl (hl)
+    ld hl,A_m2
+    rl (hl)
+    ld hl,A_exp
+    dec (hl)
+    pop hl
+    jr .check
+.flush:
+    xor a
+    ld (A_exp),a
+    ret
+
+
+; ============================================================
+; 8x8 -> 16 multiply (unsigned), shift-add
+; in:  A = multiplicand, C = multiplier
+; out: HL = 16-bit product
+; clobbers: B, C, DE
+; The multiplicand is widened into DE and shifted as a 16-bit
+; value; shifting it in the 8-bit accumulator would drop its
+; high bits and corrupt every product whose multiplier has a
+; bit set above bit 0.
+; ============================================================
+mul8u:
+    ld e,a
+    ld d,0                  ; DE = multiplicand, 16 bits wide
+    ld hl,0
+    ld b,8
+.m8:
+    srl c                   ; next multiplier bit -> CF
+    jr nc,.noadd
+    add hl,de
+.noadd:
+    sla e                   ; DE <<= 1
+    rl d
+    djnz .m8
+    ret
+
+
+; ============================================================
+; 24x24 schoolbook multiply into P0..P5 (P0 LSB)
+;   P = A_m * B_m, built from nine 8x8 partial products.
+;   Partial product (i,j) = A_mi * B_mj is added at byte
+;   offset i+j.
+; "ld c,(nn)" is not a Z80 instruction, so each B byte goes
+; through the accumulator first.
+; HL is preserved for the caller (mul8u returns its result in
+; HL); A, BC and DE are clobbered.
+; ============================================================
+mul24x24_schoolbook:
+    push hl
+
+    xor a
+    ld (P0),a
+    ld (P1),a
+    ld (P2),a
+    ld (P3),a
+    ld (P4),a
+    ld (P5),a
+
+    ; (0,0) offset 0
+    ld a,(B_m0)
+    ld c,a
+    ld a,(A_m0)
+    call mul8u
+    call add16_to_P_at0
+
+    ; (0,1) offset 1
+    ld a,(B_m1)
+    ld c,a
+    ld a,(A_m0)
+    call mul8u
+    call add16_to_P_at1
+
+    ; (0,2) offset 2
+    ld a,(B_m2)
+    ld c,a
+    ld a,(A_m0)
+    call mul8u
+    call add16_to_P_at2
+
+    ; (1,0) offset 1
+    ld a,(B_m0)
+    ld c,a
+    ld a,(A_m1)
+    call mul8u
+    call add16_to_P_at1
+
+    ; (1,1) offset 2
+    ld a,(B_m1)
+    ld c,a
+    ld a,(A_m1)
+    call mul8u
+    call add16_to_P_at2
+
+    ; (1,2) offset 3
+    ld a,(B_m2)
+    ld c,a
+    ld a,(A_m1)
+    call mul8u
+    call add16_to_P_at3
+
+    ; (2,0) offset 2
+    ld a,(B_m0)
+    ld c,a
+    ld a,(A_m2)
+    call mul8u
+    call add16_to_P_at2
+
+    ; (2,1) offset 3
+    ld a,(B_m1)
+    ld c,a
+    ld a,(A_m2)
+    call mul8u
+    call add16_to_P_at3
+
+    ; (2,2) offset 4
+    ld a,(B_m2)
+    ld c,a
+    ld a,(A_m2)
+    call mul8u
+    call add16_to_P_at4
+
+    pop hl
+    ret
+
+
+; add16_to_P_at0: P += HL at byte offset 0.
+; The carry out of P1 is rippled through P2..P5; dropping it
+; would lose a bit of weight 2^16 from the running product.
+; Clobbers A.
+add16_to_P_at0:
+    ld a,(P0)
+    add a,l
+    ld (P0),a
+    ld a,(P1)
+    adc a,h
+    ld (P1),a
+    ld a,(P2)
+    adc a,0
+    ld (P2),a
+    ld a,(P3)
+    adc a,0
+    ld (P3),a
+    ld a,(P4)
+    adc a,0
+    ld (P4),a
+    ld a,(P5)
+    adc a,0
+    ld (P5),a
+    ret
+; add16_to_P_at1: P += HL at byte offset 1.
+; The carry out of P2 is rippled through P3..P5; dropping it
+; would lose a bit of weight 2^24 from the running product.
+; Clobbers A.
+add16_to_P_at1:
+    ld a,(P1)
+    add a,l
+    ld (P1),a
+    ld a,(P2)
+    adc a,h
+    ld (P2),a
+    ld a,(P3)
+    adc a,0
+    ld (P3),a
+    ld a,(P4)
+    adc a,0
+    ld (P4),a
+    ld a,(P5)
+    adc a,0
+    ld (P5),a
+    ret
+; add16_to_P_at2: P += HL at byte offset 2.
+; The carry out of P3 is rippled through P4 and P5; dropping it
+; would lose a bit of weight 2^32 from the running product.
+; Clobbers A.
+add16_to_P_at2:
+    ld a,(P2)
+    add a,l
+    ld (P2),a
+    ld a,(P3)
+    adc a,h
+    ld (P3),a
+    ld a,(P4)
+    adc a,0
+    ld (P4),a
+    ld a,(P5)
+    adc a,0
+    ld (P5),a
+    ret
+; add16_to_P_at3: P += HL at byte offset 3.
+; The carry out of P4 is rippled into P5; dropping it would
+; lose a bit of weight 2^40 from the running product.
+; Clobbers A.
+add16_to_P_at3:
+    ld a,(P3)
+    add a,l
+    ld (P3),a
+    ld a,(P4)
+    adc a,h
+    ld (P4),a
+    ld a,(P5)
+    adc a,0
+    ld (P5),a
+    ret
+; add16_to_P_at4: P += HL at byte offset 4 (P4 and P5, the top
+; two bytes of the 48-bit accumulator). Clobbers A.
+add16_to_P_at4:
+    push de
+    ex de,hl                ; DE = addend, HL free for addressing
+    ld hl,P4
+    ld a,(hl)
+    add a,e
+    ld (hl),a
+    ld hl,P5                ; LD leaves the carry untouched
+    ld a,(hl)
+    adc a,d
+    ld (hl),a
+    ex de,hl                ; HL = addend again
+    pop de
+    ret
+
+
+; ============================================================
+; Normalize product P into A mantissa
+; P is 48-bit, P0 LSB .. P5 MSB
+; If bit 47 of P is set the product is shifted right by 24 and
+; A_exp is incremented; otherwise it is shifted right by 23.
+; The low three bytes of the shifted P become A_m2..A_m0.
+; ============================================================
+norm_product_to_A:
+    ld a,(P5)
+    add a,a                 ; CF = bit 47 of the product
+    ld a,23                 ; LD keeps CF
+    jr nc,.shift
+    ld a,(A_exp)
+    inc a
+    ld (A_exp),a
+    ld a,24
+.shift:
+    call shr48_P_by_A
+    ld a,(P0)
+    ld (A_m0),a
+    ld a,(P1)
+    ld (A_m1),a
+    ld a,(P2)
+    ld (A_m2),a
+    ret
+
+
+; shr48_P_by_A: logical right shift of P5..P0 by A bits.
+;   in:  A = shift count; 0 returns immediately
+; SHCNT is used as the loop counter.
+shr48_P_by_A:
+    ld (SHCNT),a
+    or a
+    ret z
+.step:
+    push hl
+    ld hl,P5
+    srl (hl)
+    ld hl,P4
+    rr (hl)
+    ld hl,P3
+    rr (hl)
+    ld hl,P2
+    rr (hl)
+    ld hl,P1
+    rr (hl)
+    ld hl,P0
+    rr (hl)
+    ld hl,SHCNT
+    dec (hl)                ; sets Z on the last step
+    ld a,(hl)
+    pop hl
+    jr nz,.step
+    ret
+
+
+; ============================================================
+; Mantissa division (restoring-style)
+; A_m = (A_m << 23) / B_m
+;
+; P5..P3 hold the running remainder, P2..P0 the dividend bits
+; still to be brought down. Each of the 24 steps shifts the
+; quotient and P left by one and tries to subtract B_m from the
+; remainder.
+;  - The quotient shift is done inline (there is no shl24_A_1
+;    routine in this file).
+;  - When the shift of P carries a 1 out of P5, the remainder
+;    has 25 bits and is certainly larger than the 24-bit
+;    divisor. The subtraction is then taken unconditionally:
+;    its wrapped 24-bit result is the true new remainder.
+; BC is saved around each step so the loop counter does not
+; depend on what the helpers clobber. Clobbers A.
+; ============================================================
+div_mantissas_to_A:
+    ; P = A_m as 48-bit, then shift left 23
+    xor a
+    ld (P3),a
+    ld (P4),a
+    ld (P5),a
+    ld a,(A_m0)
+    ld (P0),a
+    ld a,(A_m1)
+    ld (P1),a
+    ld a,(A_m2)
+    ld (P2),a
+
+    ld a,23
+    call shl48_P_by_A
+
+    ; clear quotient
+    xor a
+    ld (A_m2),a
+    ld (A_m1),a
+    ld (A_m0),a
+
+    ld b,24
+.div_loop:
+    push bc
+
+    ; quotient <<= 1
+    ld a,(A_m0)
+    add a,a
+    ld (A_m0),a
+    ld a,(A_m1)
+    adc a,a
+    ld (A_m1),a
+    ld a,(A_m2)
+    adc a,a
+    ld (A_m2),a
+
+    call shl48_P_1          ; CF = bit shifted out of P5
+    jr c,.sub_exact
+
+    ; try remainder - divisor on the high 24 bits of P (P5..P3)
+    call sub24_Phigh_minus_B
+    jr nc,.set_bit
+    call add24_Phigh_plus_B ; borrow: undo the subtraction
+    jr .next
+.sub_exact:
+    call sub24_Phigh_minus_B
+.set_bit:
+    ld a,(A_m0)
+    or 001h
+    ld (A_m0),a
+.next:
+    pop bc
+    djnz .div_loop
+    ret
+
+
+; shl48_P_by_A: left shift of P5..P0 by A bits (one call to
+; shl48_P_1 per bit). A count of 0 returns immediately.
+; SHCNT is used as the loop counter.
+shl48_P_by_A:
+    ld (SHCNT),a
+    or a
+    ret z
+.again:
+    call shl48_P_1
+    push hl
+    ld hl,SHCNT
+    dec (hl)                ; sets Z on the last step
+    ld a,(hl)
+    pop hl
+    jr nz,.again
+    ret
+
+
+; shl48_P_1: left shift of P5..P0 by one bit.
+; A zero enters bit 0 of P0; CF receives the bit shifted out of
+; P5. A returns the new P5.
+shl48_P_1:
+    push hl
+    ld hl,P0
+    sla (hl)
+    ld hl,P1
+    rl (hl)
+    ld hl,P2
+    rl (hl)
+    ld hl,P3
+    rl (hl)
+    ld hl,P4
+    rl (hl)
+    ld hl,P5
+    rl (hl)
+    ld a,(hl)               ; LD leaves CF untouched
+    pop hl
+    ret
+
+
+; sub24_Phigh_minus_B: P5:P4:P3 -= B_m2:B_m1:B_m0.
+;   out: CF set indicates borrow
+; The Z80 has no "sub (nn)" form, so each B byte is reached
+; through HL. LD and POP leave the flags alone. Clobbers A only.
+sub24_Phigh_minus_B:
+    push hl
+    ld hl,B_m0
+    ld a,(P3)
+    sub (hl)
+    ld (P3),a
+    ld hl,B_m1
+    ld a,(P4)
+    sbc a,(hl)
+    ld (P4),a
+    ld hl,B_m2
+    ld a,(P5)
+    sbc a,(hl)
+    ld (P5),a
+    pop hl
+    ret    ; carry set indicates borrow
+
+
+; add24_Phigh_plus_B: P5:P4:P3 += B_m2:B_m1:B_m0 (used to undo a
+; trial subtraction).
+; The Z80 has no "add a,(nn)" form, so each B byte is reached
+; through HL. LD and POP leave the flags alone. Clobbers A only.
+add24_Phigh_plus_B:
+    push hl
+    ld hl,B_m0
+    ld a,(P3)
+    add a,(hl)
+    ld (P3),a
+    ld hl,B_m1
+    ld a,(P4)
+    adc a,(hl)
+    ld (P4),a
+    ld hl,B_m2
+    ld a,(P5)
+    adc a,(hl)
+    ld (P5),a
+    pop hl
+    ret
+
+
+; ============================================================
+; fp_print: fixed format printing
+; Prints: [-]I.FFFFFF (FRAC_DIGITS digits, truncated)
+; Uses printChar (A=char)
+;   in:  HL -> float to print
+;   clobbers: AF, BC, DE, HL
+;
+; Method: the 24-bit mantissa M is loaded into a 32.32
+; fixed-point register (PR_INT3..0 = integer part, PR_R3..0 =
+; fraction) and scaled by 2^(EXP - (FP_BIAS+23)):
+;   - non-negative scale: shift the integer part left. More than
+;     8 positions would not fit in 32 bits, so the integer part
+;     is saturated to 0FFFFFFFFh in that case.
+;   - negative scale: shift the whole 64-bit register right, so
+;     the bits leaving the integer part become the fraction.
+;     Shifts of 56 or more leave nothing and are capped there.
+; Fraction digits are produced by multiplying PR_R by 10 and
+; taking the byte that overflows past PR_R3 as the next digit.
+; The shift direction is chosen from the biased exponent with
+; an unsigned compare, because EXP - FP_BIAS - 23 does not fit a
+; signed byte for small exponents.
+; The counter in B is saved around printChar because that
+; routine is external and its register usage is not known here.
+; ============================================================
+fp_print:
+    ; zero?
+    ld a,(hl)
+    or a
+    jr nz,.nonzero
+    ld a,'0'
+    call printChar
+    ld a,'.'
+    call printChar
+    ld b,FRAC_DIGITS
+.zf:
+    push bc
+    ld a,'0'
+    call printChar
+    pop bc
+    djnz .zf
+    ret
+
+.nonzero:
+    ld (PR_E),a             ; PR_E = biased exponent
+    inc hl
+    ld a,(hl)
+    ld c,a                  ; C = S|F22..F16, kept for the sign test
+    or 080h                 ; hidden 1 replaces the sign bit
+    ld (PR_INT2),a
+    inc hl
+    ld a,(hl)
+    ld (PR_INT1),a
+    inc hl
+    ld a,(hl)
+    ld (PR_INT0),a
+    xor a
+    ld (PR_INT3),a
+    ld (PR_R3),a
+    ld (PR_R2),a
+    ld (PR_R1),a
+    ld (PR_R0),a
+
+    bit 7,c
+    jr z,.scale
+    ld a,'-'
+    call printChar
+
+.scale:
+    ld a,(PR_E)
+    sub FP_BIAS+23          ; A = EXP - 150, CF set when negative
+    jr c,.shift_right
+    jr z,.emit              ; scale of 2^0: M is the integer part
+    cp 9
+    jr nc,.saturate         ; M << 9 or more overflows 32 bits
+    ld b,a
+    call shl32_INT_by_B
+    jr .emit
+
+.saturate:
+    ld a,0FFh
+    ld (PR_INT0),a
+    ld (PR_INT1),a
+    ld (PR_INT2),a
+    ld (PR_INT3),a
+    jr .emit
+
+.shift_right:
+    neg                     ; A = 150 - EXP = number of right shifts
+    cp 56
+    jr c,.count_ok
+    ld a,56                 ; 24 mantissa + 32 fraction bits: all gone
+.count_ok:
+    ld b,a
+.shr:
+    ; PR_INT3 is zero here, so the shift starts at PR_INT2
+    ld hl,PR_INT2
+    srl (hl)
+    ld hl,PR_INT1
+    rr (hl)
+    ld hl,PR_INT0
+    rr (hl)
+    ld hl,PR_R3
+    rr (hl)
+    ld hl,PR_R2
+    rr (hl)
+    ld hl,PR_R1
+    rr (hl)
+    ld hl,PR_R0
+    rr (hl)
+    djnz .shr
+
+.emit:
+    call print_u32_dec
+    ld a,'.'
+    call printChar
+    ld b,FRAC_DIGITS
+.digit:
+    push bc
+    ; PR_R *= 10, least significant byte first; C carries between bytes
+    ld c,0
+    ld a,(PR_R0)
+    call fp_print_mul10_byte
+    ld (PR_R0),a
+    ld a,(PR_R1)
+    call fp_print_mul10_byte
+    ld (PR_R1),a
+    ld a,(PR_R2)
+    call fp_print_mul10_byte
+    ld (PR_R2),a
+    ld a,(PR_R3)
+    call fp_print_mul10_byte
+    ld (PR_R3),a
+    ld a,c                  ; overflow past PR_R3 = next decimal digit (0..9)
+    add a,'0'
+    call printChar
+    pop bc
+    djnz .digit
+    ret
+
+
+; Helper for fp_print: one byte of a multi-byte multiply by 10.
+;   in:  A = byte, C = carry-in (0..9)
+;   out: A = low byte of byte*10 + carry-in, C = carry-out (0..9)
+;   clobbers: DE, HL
+fp_print_mul10_byte:
+    ld l,a
+    ld h,0
+    add hl,hl               ; x2
+    ld d,h
+    ld e,l
+    add hl,hl               ; x4
+    add hl,hl               ; x8
+    add hl,de               ; x10
+    ld e,c
+    ld d,0
+    add hl,de               ; + carry-in
+    ld c,h
+    ld a,l
+    ret
+
+
+; Shift-right PR_INT by B, collect shifted-out bits into PR_R3
+;   in:  B = shift count; 0 returns with PR_R3 cleared
+;   out: PR_INT3..0 shifted right by B, PR_R3 = the most recent
+;        shifted-out bits, newest in bit 7 (i.e. the top 8 bits
+;        of the binary fraction that left PR_INT)
+; Each bit leaving PR_INT0 is in CF after the last RR and is
+; rotated straight into the top of PR_R3. Loading PR_R3 first
+; and doubling it with ADD would overwrite CF and lose the bit.
+; Clobbers A and B.
+shr32_INT_to_INT_with_remainder:
+    xor a
+    ld (PR_R3),a
+    ld a,b
+    or a
+    ret z
+.loop:
+    ld a,(PR_INT3)
+    srl a
+    ld (PR_INT3),a
+    ld a,(PR_INT2)
+    rr  a
+    ld (PR_INT2),a
+    ld a,(PR_INT1)
+    rr  a
+    ld (PR_INT1),a
+    ld a,(PR_INT0)
+    rr  a
+    ld (PR_INT0),a
+    ; CF = bit shifted out of PR_INT0 (LD does not change flags)
+    ld a,(PR_R3)
+    rr  a
+    ld (PR_R3),a
+    djnz .loop
+    ret
+
+
+; shl32_INT_by_B: left shift of PR_INT3..PR_INT0 by B bits.
+;   in:  B = shift count; 0 returns immediately
+; Clobbers A and B (B is 0 on exit).
+shl32_INT_by_B:
+    ld a,b
+    or a
+    ret z
+    push hl
+.step:
+    ld hl,PR_INT0
+    sla (hl)
+    ld hl,PR_INT1
+    rl (hl)
+    ld hl,PR_INT2
+    rl (hl)
+    ld hl,PR_INT3
+    rl (hl)
+    djnz .step
+    ld a,(hl)
+    pop hl
+    ret
+
+
+; mul_remainder_by_10: PR_R3 = PR_R3 * 10, kept to 8 bits.
+; Clobbers A and B (B returns the original PR_R3).
+mul_remainder_by_10:
+    ld a,(PR_R3)
+    ld b,a
+    add a,a      ; *2
+    add a,a      ; *4
+    add a,b      ; *5
+    add a,a      ; *10
+    ld (PR_R3),a
+    ret
+
+
+; Print PR_INT (u32) as decimal
+;   in:  PR_INT3..PR_INT0 = value (destroyed: ends up 0)
+;   clobbers: AF, BC, DE, HL; DIGLEN holds the digit count
+; Digits come out of u32_div10_inplace least significant first,
+; so each one is pushed on the stack and they are popped back in
+; printing order (at most 10 pushes for a 32-bit value). DIGBUF
+; is not used. Dividing once before testing for zero makes the
+; value 0 print as a single "0".
+; "or (nn)" and "ld c,(nn)" are not Z80 instructions, so the
+; zero test goes through register B.
+; BC is saved around printChar because that routine is external
+; and its register usage is not known here.
+print_u32_dec:
+    xor a
+    ld (DIGLEN),a
+.dloop:
+    call u32_div10_inplace   ; remainder in A, quotient back in PR_INT
+    add a,'0'
+    push af                  ; park the digit
+    ld hl,DIGLEN
+    inc (hl)
+    ; quotient still non-zero?
+    ld a,(PR_INT0)
+    ld b,a
+    ld a,(PR_INT1)
+    or b
+    ld b,a
+    ld a,(PR_INT2)
+    or b
+    ld b,a
+    ld a,(PR_INT3)
+    or b
+    jr nz,.dloop
+
+    ; unstack: most significant digit comes off first
+    ld a,(DIGLEN)
+    ld b,a
+.pr:
+    pop af
+    push bc
+    call printChar
+    pop bc
+    djnz .pr
+    ret
+
+
+; Divide PR_INT (u32) by 10, return remainder in A (0..9)
+;   PR_INT3..PR_INT0 are replaced by the quotient.
+;   clobbers: BC, DE, HL
+; Long division one byte at a time, most significant first.
+; Every byte is addressed by its own symbol: stepping a pointer
+; with INC HL from PR_INT3 would move away from PR_INT2..0 and
+; would also depend on how the workspace bytes are laid out.
+u32_div10_inplace:
+    ld b,0          ; remainder
+    ld hl,PR_INT3
+    call .step
+    ld hl,PR_INT2
+    call .step
+    ld hl,PR_INT1
+    call .step
+    ld hl,PR_INT0
+    call .step
+    ld a,b
+    ret
+.step:
+    ; DE = remainder*256 + byte  (at most 9*256+255)
+    ld d,b
+    ld e,(hl)
+    ld c,0          ; quotient byte
+.div:
+    ; subtract 10 until DE < 10
+    ld a,d
+    or a
+    jr nz,.sub
+    ld a,e
+    cp 10
+    jr c,.done
+.sub:
+    ld a,e
+    sub 10
+    ld e,a
+    ld a,d
+    sbc a,0
+    ld d,a
+    inc c
+    jr .div
+.done:
+    ld (hl),c
+    ld b,e          ; remainder carried into the next byte
+    ret
+
+
+; ============================================================
+; fp_parse: parse decimal string -> float
+; DE -> "[+|-]ddd[.ddd]\0"
+; HL -> output float
+;   out: HL -> output float, DE -> first character not consumed
+;   clobbers: AF, BC
+;
+; All digits are accumulated into the 32-bit integer P_S
+; (ignoring the '.'), the number of fractional digits is counted
+; in P_FRACN, and the integer is converted to a float and divided
+; by 10^P_FRACN from pow10_table.
+;  - At most MAX_FRAC fractional digits are taken; the limit is
+;    checked against P_FRACN rather than a register counter,
+;    because the multiply helper clobbers B. This also keeps the
+;    pow10_table index in range.
+;  - The destination pointer lives on the stack and is reloaded
+;    before each routine that takes it, since those routines
+;    advance HL.
+;  - A zero result is left as canonical zero even for "-0".
+; P_S is not checked for overflow.
+; ============================================================
+fp_parse:
+    push hl                 ; [stack] destination
+    xor a
+    ld (P_SIGN),a
+    ld (P_FRACN),a
+    ld (P_S0),a
+    ld (P_S1),a
+    ld (P_S2),a
+    ld (P_S3),a
+
+    ; optional sign
+    ld a,(de)
+    cp '-'
+    jr nz,.chkplus
+    ld a,1
+    ld (P_SIGN),a
+    jr .skip_sign
+.chkplus:
+    cp '+'
+    jr nz,.intpart
+.skip_sign:
+    inc de
+
+.intpart:
+    ld a,(de)
+    call is_digit
+    jr nc,.maybe_dot
+    call fp_parse_push_digit
+    inc de
+    jr .intpart
+
+.maybe_dot:
+    ld a,(de)
+    cp '.'
+    jr nz,.convert
+    inc de
+
+.fracpart:
+    ld a,(P_FRACN)
+    cp MAX_FRAC
+    jr nc,.convert          ; limit reached: leave further digits unread
+    ld a,(de)
+    call is_digit
+    jr nc,.convert
+    call fp_parse_push_digit
+    ld hl,P_FRACN
+    inc (hl)
+    inc de
+    jr .fracpart
+
+.convert:
+    pop hl                  ; HL = destination
+    push de                 ; [stack] parse position
+    push hl                 ; [stack] destination
+    ; convert scaled u32 to float into (HL)
+    call fp_from_u32_scaled_to_A
+
+    ; divide by 10^k if needed
+    ld a,(P_FRACN)
+    or a
+    jr z,.apply_sign
+
+    ; DE = &pow10_table[k]  (4 bytes per entry)
+    ld e,a
+    ld d,0
+    ld hl,pow10_table
+    add hl,de
+    add hl,de
+    add hl,de
+    add hl,de
+    ex de,hl
+    pop hl                  ; reload destination for fp_div
+    push hl
+    call fp_div
+
+.apply_sign:
+    pop hl                  ; HL = destination
+    pop de                  ; DE = parse position
+    ld a,(P_SIGN)
+    or a
+    ret z
+    ld a,(hl)
+    or a
+    ret z                   ; zero stays +0
+    inc hl
+    ld a,(hl)
+    xor 080h
+    ld (hl),a
+    dec hl
+    ret
+
+
+; Helper for fp_parse: P_S = P_S*10 + digit.
+;   in:  A = ASCII digit '0'..'9'
+;   preserves DE; clobbers AF, BC
+; BC is saved across the multiply so the digit in C reaches
+; u32_add8_scaled whatever the multiply does with BC.
+fp_parse_push_digit:
+    sub '0'
+    ld c,a
+    push de
+    push bc
+    call u32_mul10_scaled
+    pop bc
+    call u32_add8_scaled
+    pop de
+    ret
+
+
+; is_digit: CF set when A is an ASCII digit '0'..'9', clear
+; otherwise. A is preserved.
+is_digit:
+    cp '0'
+    ccf                     ; CF clear when A < '0'
+    ret nc
+    cp '9'+1                ; CF set exactly when A <= '9'
+    ret
+
+
+; P_S = P_S*10  (uses PR_INT and PR_R0..3 as scratch)
+;   Computed as (P_S << 1) + (P_S << 3); overflow past 32 bits
+;   is discarded.
+;   clobbers: AF, B
+; The Z80 cannot add a byte at an absolute address directly
+; ("add a,(nn)" does not exist), so each PR_R byte is staged in
+; B. LD does not change flags, so the carry chain is unbroken.
+u32_mul10_scaled:
+    ; PR_INT = P
+    ld a,(P_S0)
+    ld (PR_INT0),a
+    ld a,(P_S1)
+    ld (PR_INT1),a
+    ld a,(P_S2)
+    ld (PR_INT2),a
+    ld a,(P_S3)
+    ld (PR_INT3),a
+    ; PR_INT *=2
+    ld b,1
+    call shl32_INT_by_B
+
+    ; PR_R = P
+    ld a,(P_S0)
+    ld (PR_R0),a
+    ld a,(P_S1)
+    ld (PR_R1),a
+    ld a,(P_S2)
+    ld (PR_R2),a
+    ld a,(P_S3)
+    ld (PR_R3),a
+    ; PR_R *=8 (shift left 3)
+    ld b,3
+    call shl32_R_by_B
+
+    ; P = PR_INT + PR_R
+    ld a,(PR_R0)
+    ld b,a
+    ld a,(PR_INT0)
+    add a,b
+    ld (P_S0),a
+    ld a,(PR_R1)
+    ld b,a
+    ld a,(PR_INT1)
+    adc a,b
+    ld (P_S1),a
+    ld a,(PR_R2)
+    ld b,a
+    ld a,(PR_INT2)
+    adc a,b
+    ld (P_S2),a
+    ld a,(PR_R3)
+    ld b,a
+    ld a,(PR_INT3)
+    adc a,b
+    ld (P_S3),a
+    ret
+
+
+; shl32_R_by_B: left shift of PR_R3..PR_R0 by B bits.
+;   in:  B = shift count; 0 returns immediately
+; Clobbers A and B (B is 0 on exit).
+shl32_R_by_B:
+    ld a,b
+    or a
+    ret z
+    push hl
+.step:
+    ld hl,PR_R0
+    sla (hl)
+    ld hl,PR_R1
+    rl (hl)
+    ld hl,PR_R2
+    rl (hl)
+    ld hl,PR_R3
+    rl (hl)
+    djnz .step
+    ld a,(hl)
+    pop hl
+    ret
+
+
+; P_S += C (0..9)
+;   The carry is rippled through all four bytes of P_S.
+;   Clobbers A.
+u32_add8_scaled:
+    push hl
+    ld hl,P_S0
+    ld a,(hl)
+    add a,c
+    ld (hl),a
+    ld hl,P_S1              ; LD leaves the carry untouched
+    ld a,(hl)
+    adc a,0
+    ld (hl),a
+    ld hl,P_S2
+    ld a,(hl)
+    adc a,0
+    ld (hl),a
+    ld hl,P_S3
+    ld a,(hl)
+    adc a,0
+    ld (hl),a
+    pop hl
+    ret
+
+
+; Convert P_S (u32) to float at (HL). Positive only; sign handled by caller.
+;   in:  HL -> output float, P_S3..P_S0 = value
+;   out: HL -> byte3 of the output
+;   clobbers: AF, BC; PR_INT3..0 used as scratch
+; The value is copied to PR_INT and shifted left until bit 31 is
+; set, with the exponent starting at FP_BIAS+31 and dropping by
+; one per shift. The top 24 bits (PR_INT3..PR_INT1) are then the
+; normalized mantissa; the low byte is truncated. Normalizing to
+; bit 31 is what makes PR_INT3..1 the right bytes to store, and
+; it works for every non-zero 32-bit value, including those of
+; 2^24 and above.
+; "or (nn)" and "add c,c" are not Z80 instructions; the zero
+; test is folded into the copy through register B.
+fp_from_u32_scaled_to_A:
+    ; PR_INT = P_S, ORing the bytes together on the way
+    ld a,(P_S0)
+    ld (PR_INT0),a
+    ld b,a
+    ld a,(P_S1)
+    ld (PR_INT1),a
+    or b
+    ld b,a
+    ld a,(P_S2)
+    ld (PR_INT2),a
+    or b
+    ld b,a
+    ld a,(P_S3)
+    ld (PR_INT3),a
+    or b
+    jr nz,.nz
+    ld (hl),0
+    inc hl
+    ld (hl),0
+    inc hl
+    ld (hl),0
+    inc hl
+    ld (hl),0
+    ret
+
+.nz:
+    ld c,FP_BIAS+31         ; exponent if bit 31 is already set
+.norm:
+    ld a,(PR_INT3)
+    bit 7,a
+    jr nz,.found
+    ld b,1
+    call shl32_INT_by_B     ; uses A and B only
+    dec c
+    jr .norm
+.found:
+    ld (hl),c               ; EXP
+    inc hl
+
+    ; store sign=0, fraction = top 23 bits of mantissa (hidden 1 removed)
+    ld a,(PR_INT3)
+    and 07Fh
+    ld (hl),a
+    inc hl
+    ld a,(PR_INT2)
+    ld (hl),a
+    inc hl
+    ld a,(PR_INT1)
+    ld (hl),a
+    ret
+
+.data
+; ============================================================
+; pow10_table: 10^k constants (k=0..6) in THIS float encoding
+; Four bytes per entry: EXP, S|F22..F16, F15..F8, F7..F0.
+; fp_parse indexes it as pow10_table + 4*k with k = P_FRACN and
+; passes the entry to fp_div as the divisor.
+; Verified (EXP in decimal, fraction bytes in hex):
+;   1.0      = 127 00 00 00
+;   10.0     = 130 20 00 00
+;   100.0    = 133 48 00 00
+;   1000.0   = 136 7A 00 00
+;   10000.0  = 140 1C 40 00
+;   100000.0 = 143 43 50 00
+;   1000000.0= 146 74 24 00
+; ============================================================
+pow10_table:
+    .byte 127, 0x00, 0x00, 0x00   ; 10^0 = 1
+    .byte 130, 0x20, 0x00, 0x00   ; 10^1 = 10
+    .byte 133, 0x48, 0x00, 0x00   ; 10^2 = 100
+    .byte 136, 0x7A, 0x00, 0x00   ; 10^3 = 1000
+    .byte 140, 0x1C, 0x40, 0x00   ; 10^4 = 10000
+    .byte 143, 0x43, 0x50, 0x00   ; 10^5 = 100000
+    .byte 146, 0x74, 0x24, 0x00   ; 10^6 = 1000000
+
+
+; ============================================================
+; BSS / WORKSPACE
+; ============================================================
+; All routines share this static scratch area.
+; NOTE(review): each byte is its own .comm symbol; confirm with
+; the assembler/linker whether neighbouring symbols are placed
+; adjacently before relying on pointer stepping between them.
+.bss
+
+; Unpacked A
+.comm A_exp,1
+.comm A_sign,1
+.comm A_m2,1
+.comm A_m1,1
+.comm A_m0,1
+
+; Unpacked B
+.comm B_exp,1
+.comm B_sign,1
+.comm B_m2,1
+.comm B_m1,1
+.comm B_m0,1
+
+; 48-bit workspace (P0 LSB .. P5 MSB)
+.comm P0,1
+.comm P1,1
+.comm P2,1
+.comm P3,1
+.comm P4,1
+.comm P5,1
+
+.comm SHCNT,1
+
+; Print temps
+; (the code refers to this byte as PR_SIGN)
+.comm PR_SIGN,1
+.comm PR_E,1
+.comm PR_M2,1
+.comm PR_M1,1
+.comm PR_M0,1
+.comm PR_INT0,1
+.comm PR_INT1,1
+.comm PR_INT2,1
+.comm PR_INT3,1
+.comm PR_R0,1
+.comm PR_R1,1
+.comm PR_R2,1
+.comm PR_R3,1
+
+; Parse temps
+.comm P_SIGN,1
+.comm P_FRACN,1
+.comm P_S0,1
+.comm P_S1,1
+.comm P_S2,1
+.comm P_S3,1
+
+; Digit buffer: a 32-bit value has up to 10 decimal digits
+.comm DIGBUF,10
+.comm DIGLEN,1