; SPDX-License-Identifier: MPL-2.0
	; SPDX-FileCopyrightText: (c) 2025 A.M. Rowsell
	; ============================================================
	; Z80 Soft Float Library (4-byte) + Print + Parse (vasm syntax)
	; ============================================================
	; Float format in memory (big-endian, 4 bytes):
	; byte0: EXP   (8-bit biased exponent, 0 = zero)
	; byte1: S|F22..F16   (bit7 = sign, bits6..0 = top 7 fraction bits)
	; byte2: F15..F8
	; byte3: F7..F0

	; For EXP != 0:
	; value = (-1)^S * (1.F) * 2^(EXP - FP_BIAS)
	; FP_BIAS = 127

	; Calling convention (in-place ops):
	; HL -> A (4 bytes)
	; DE -> B (4 bytes)
	; fp_add: A = A + B  (stored back at HL)
	; fp_sub: A = A - B
	; fp_mul: A = A * B
	; fp_div: A = A / B

	; Extra:
	; fp_print: print float at (HL) using external os_print_vec (A=ASCII)
	; fp_parse: parse null-terminated string at (DE) into float at (HL)

	; Limitations:
	; - No NaN/Inf/denormals
	; - Truncation (no rounding)
	; - fp_print prints fixed decimals with a lightweight fraction path
	; - fp_parse supports optional +/- and '.' up to MAX_FRAC digits, no exponent notation
	; ============================================================

	; Exponent bias: a stored EXP byte is (true exponent + FP_BIAS); EXP = 0 encodes zero.
	.equ FP_BIAS, 127
	; Number of digits fp_print emits after the decimal point.
	.equ FRAC_DIGITS, 6
	; Fraction-digit bound used by fp_parse; pow10_table provides 10^0..10^6.
	.equ MAX_FRAC, 6

	; Character output routine supplied by the host (prints the ASCII code in A).
	.extern  os_print_vec
	;        ============================================================
	;        CODE
	;        ============================================================
	.section "zone", "acrx"
	;        ------------------------------------------------------------
	;        External routine you provide:
	;        os_print_vec: prints ASCII character in A
	;        ------------------------------------------------------------
	;        os_print_vec is external, not defined here.

	; ============================================================
	; Public API: fp_add / fp_sub / fp_mul / fp_div
	; ============================================================

	; ------------------------------------------------------------
	; fp_add: A = A + B
	; ------------------------------------------------------------

fp_add:
	; Unpack both operands into the A_*/B_* workspace. The unpackers advance
	; their pointer register, so HL and DE are parked on the stack meanwhile.
	push hl
	push de
	call fp_unpackA
	pop  de
	call fp_unpackB
	pop  hl

	; 0 + B = B: pack B into the destination and return from there
	ld   a, (A_exp)
	or   a
	jp   z, fp_pack_from_B_into_A

fp_add_checkB:
	; A + 0 = A: the destination already holds the answer
	ld   a, (B_exp)
	or   a
	ret  z

	; Equal signs add magnitudes, differing signs subtract them
	ld   a, (A_sign)
	ld   b, a
	ld   a, (B_sign)
	xor  b
	jp   nz, fp_add_diff_sign
	jp   fp_add_same_sign

	; ------------------------------------------------------------
	; fp_sub: A = A - B
	; In:  HL -> A (result stored here), DE -> B (read only)
	; Implemented as A + (-B): B is unpacked into the workspace and its
	; sign is inverted there, so the caller's B operand is never written.
	; (The earlier approach patched B in memory around a call to fp_add,
	; but fp_add advances DE, so the sign was "restored" at the wrong
	; address.)
	; ------------------------------------------------------------

fp_sub:
	push hl
	push de
	call fp_unpackA
	pop  de
	call fp_unpackB
	pop  hl

	; A - 0 = A: nothing to store
	ld   a, (B_exp)
	or   a
	ret  z

	; negate B inside the workspace (B_sign is 0 or 1)
	ld   a, (B_sign)
	xor  1
	ld   (B_sign), a
	ld   b, a

	; 0 - B = -B
	ld   a, (A_exp)
	or   a
	jp   z, fp_pack_from_B_into_A

	; same dispatch as fp_add, using the negated B sign held in register B
	ld   a, (A_sign)
	xor  b
	jp   z, fp_add_same_sign
	jp   fp_add_diff_sign

	; ------------------------------------------------------------
	; fp_mul: A = A * B
	; ------------------------------------------------------------

fp_mul:
	; In:  HL -> A (result stored here), DE -> B
	; Exponent arithmetic is 8-bit with no overflow/underflow detection.
	push hl
	push de
	call fp_unpackA
	pop  de
	call fp_unpackB
	pop  hl

	;  if A==0 or B==0 => 0
	ld a, (A_exp)
	or a
	jp z, fp_store_zero_A
	ld a, (B_exp)
	or a
	jp z, fp_store_zero_A

	;   sign = A_sign XOR B_sign
	ld  a, (A_sign)
	ld  b, a
	ld  a, (B_sign)
	xor b
	ld  (A_sign), a

	;   exponent = A_exp + B_exp - BIAS
	ld  a, (A_exp)
	ld  b, a
	ld  a, (B_exp)
	add a, b
	sub FP_BIAS
	ld  (A_exp), a

	;    product = A_mant * B_mant (24x24 => 48)
	;    mul24x24_schoolbook calls mul8u, which returns each partial product
	;    in HL, so the destination pointer has to be saved across the call.
	push hl
	call mul24x24_schoolbook
	pop  hl

	;    normalize product into A mantissa
	call norm_product_to_A

	;    pack back into (HL)
	jp   fp_packA

	; ------------------------------------------------------------
	; fp_div: A = A / B
	; ------------------------------------------------------------

fp_div:
	; In:  HL -> A (dividend, result stored here), DE -> B (divisor)
	push hl
	push de
	call fp_unpackA
	pop  de
	call fp_unpackB
	pop  hl

	;    0 / B = 0
	ld   a, (A_exp)
	or   a
	jp   z, fp_store_zero_A

	;    A / 0 is reported as 0 (there is no Inf/NaN encoding)
	ld   a, (B_exp)
	or   a
	jp   z, fp_store_zero_A

	;    result sign = A_sign XOR B_sign
	ld   a, (A_sign)
	ld   b, a
	ld   a, (B_sign)
	xor  b
	ld   (A_sign), a

	;    result exponent = A_exp - B_exp + BIAS (8-bit, no range check)
	ld   a, (B_exp)
	ld   c, a
	ld   a, (A_exp)
	ld   b, a
	sub  c
	add  a, FP_BIAS
	ld   (A_exp), a

	;    mantissa quotient, then renormalize and store
	call div_mantissas_to_A
	call normalize_A_mant
	jp   fp_packA

	; ============================================================
	; Add/Sub core (unpacked)
	; ============================================================

fp_add_same_sign:
	; Magnitude addition of the unpacked operands; result packed to (HL).
	call align_exponents_A_B
	call add24_A_plus_B

	;    No carry: the sum still fits in 24 bits
	jr   nc, fp_add_same_sign_noCarry

	;    Carry: the true sum is 25 bits wide. Shift right one place and
	;    re-insert the carried-out bit as the new bit 23 (shr24_A_1 shifts
	;    a zero into the top), then bump the exponent.
	call shr24_A_1
	ld   a, (A_m2)
	or   0x80
	ld   (A_m2), a
	ld   a, (A_exp)
	inc  a
	ld   (A_exp), a

fp_add_same_sign_noCarry:
	call normalize_A_mant
	jp   fp_packA

fp_add_diff_sign:
	;    Opposite signs: compute larger magnitude minus smaller magnitude.
	;    compare_mag_A_B returns carry when A is already the larger operand;
	;    otherwise the operands trade places so that A (and A_sign) is the larger.
	call compare_mag_A_B
	call nc, swap_A_B_unpacked

fp_add_diff_sign_A_ge_B:
	call align_exponents_A_B
	call sub24_A_minus_B
	;    exact cancellation yields a canonical zero
	call is_A_mant_zero
	jp   z, fp_store_zero_A
	call normalize_A_mant
	jp   fp_packA

	; ============================================================
	; Unpack / Pack helpers
	; ============================================================

	; Unpack A from (HL)

fp_unpackA:
	; In:  HL -> packed float. Out: A_exp, A_sign (0/1), A_m2..A_m0.
	; HL is advanced to the last byte for non-zero inputs; B is used as scratch.
	ld  a, (hl)
	ld  (A_exp), a
	or  a
	jr  z, fp_unpackA_zeroA

	;   byte1 = sign bit + top seven fraction bits
	inc hl
	ld  a, (hl)
	ld  b, a
	rlca                ; sign bit moves to bit 0
	and 1
	ld  (A_sign), a

	;   forcing bit 7 on replaces the sign bit with the hidden 1
	ld  a, b
	or  0x80
	ld  (A_m2), a
	inc hl
	ld  a, (hl)
	ld  (A_m1), a
	inc hl
	ld  a, (hl)
	ld  (A_m0), a
	ret

fp_unpackA_zeroA:
	;   EXP == 0 means zero: clear sign and mantissa
	xor a
	ld  (A_m0), a
	ld  (A_m1), a
	ld  (A_m2), a
	ld  (A_sign), a
	ret

	; Unpack B from (DE)

fp_unpackB:
	; In:  DE -> packed float. Out: B_exp, B_sign (0/1), B_m2..B_m0.
	; DE is advanced to the last byte for non-zero inputs; B is used as scratch.
	ld  a, (de)
	ld  (B_exp), a
	or  a
	jr  z, fp_unpackB_zeroB

	;   byte1 = sign bit + top seven fraction bits
	inc de
	ld  a, (de)
	ld  b, a
	rlca                ; sign bit moves to bit 0
	and 1
	ld  (B_sign), a

	;   forcing bit 7 on replaces the sign bit with the hidden 1
	ld  a, b
	or  0x80
	ld  (B_m2), a
	inc de
	ld  a, (de)
	ld  (B_m1), a
	inc de
	ld  a, (de)
	ld  (B_m0), a
	ret

fp_unpackB_zeroB:
	;   EXP == 0 means zero: clear sign and mantissa
	xor a
	ld  (B_m0), a
	ld  (B_m1), a
	ld  (B_m2), a
	ld  (B_sign), a
	ret

	; Pack unpacked A back into memory at (HL)

fp_packA:
	; In:  HL -> destination (4 bytes). Writes the unpacked A workspace there.
	; HL is left pointing at the last byte written; B is used as scratch.
	ld  a, (A_exp)
	or  a
	jr  nz, fp_packA_packNZ

	;   exponent 0 => canonical zero (A is 0 here)
	ld  (hl), a
	inc hl
	ld  (hl), a
	inc hl
	ld  (hl), a
	inc hl
	ld  (hl), a
	ret

fp_packA_packNZ:
	;   A still holds A_exp
	ld  (hl), a
	inc hl

	;   drop the hidden 1, leaving bit 7 free for the sign
	ld  a, (A_m2)
	and 0x7F
	ld  b, a

	;   test the sign, then reload the fraction (ld leaves the flags alone)
	ld  a, (A_sign)
	or  a
	ld  a, b
	jr  z, fp_packA_storeB1
	or  0x80

fp_packA_storeB1:
	ld  (hl), a
	inc hl
	ld  a, (A_m1)
	ld  (hl), a
	inc hl
	ld  a, (A_m0)
	ld  (hl), a
	ret

	; Pack from unpacked B into memory A (HL points to A destination)

fp_pack_from_B_into_A:
	; In:  HL -> destination (4 bytes). Writes the unpacked B workspace there.
	; No zero special-case: an all-zero B workspace packs to four zero bytes.
	ld  a, (B_exp)
	ld  (hl), a
	inc hl

	;   fraction top bits without the hidden 1
	ld  a, (B_m2)
	and 0x7F
	ld  b, a

	;   test the sign, then reload the fraction (ld leaves the flags alone)
	ld  a, (B_sign)
	or  a
	ld  a, b
	jr  z, fp_pack_from_B_bs1
	or  0x80

fp_pack_from_B_bs1:
	ld  (hl), a
	inc hl
	ld  a, (B_m1)
	ld  (hl), a
	inc hl
	ld  a, (B_m0)
	ld  (hl), a
	ret

fp_store_zero_A:
	; Clear the whole unpacked A workspace, then let fp_packA write the
	; canonical zero to (HL).
	xor a
	ld  (A_m0), a
	ld  (A_m1), a
	ld  (A_m2), a
	ld  (A_sign), a
	ld  (A_exp), a
	jp  fp_packA

	; ============================================================
	; Exponent alignment / compare / swap
	; ============================================================

	; Ensure A_exp >= B_exp; shift smaller mantissa right by diff

align_exponents_A_B:
	; Make both operands share the larger exponent. The operand with the
	; smaller exponent ends up in B with its mantissa shifted right.
	ld   a, (A_exp)
	ld   b, a
	ld   a, (B_exp)
	cp   b
	ret  z                        ; already aligned
	call nc, swap_A_B_unpacked    ; B_exp > A_exp: make A the larger one

align_exponents_A_B_bigger_exp:
	;    diff = A_exp - B_exp
	ld   a, (B_exp)
	ld   c, a
	ld   a, (A_exp)
	ld   b, a
	sub  c
	call shr24_B_by_A
	ld   a, (A_exp)
	ld   (B_exp), a
	ret

	; Carry set if |A| >= |B|, else carry clear

compare_mag_A_B:
	; Compare the unpacked magnitudes field by field, most significant first
	; (exponent, then m2, m1, m0).
	; Out: carry set if |A| >= |B|, carry clear if |A| < |B|.
	; Clobbers A and B.
	ld   a, (B_exp)
	ld   b, a
	ld   a, (A_exp)
	cp   b
	jr   nz, compare_mag_A_B_decided

compare_mag_A_B_cmpMant:
	ld   a, (B_m2)
	ld   b, a
	ld   a, (A_m2)
	cp   b
	jr   nz, compare_mag_A_B_decided

	ld   a, (B_m1)
	ld   b, a
	ld   a, (A_m1)
	cp   b
	jr   nz, compare_mag_A_B_decided

	ld   a, (B_m0)
	ld   b, a
	ld   a, (A_m0)
	cp   b
	jr   nz, compare_mag_A_B_decided

	;    every field equal => |A| == |B|
	scf
	ret

compare_mag_A_B_decided:
	;    cp left carry = (A field < B field); the contract is the opposite sense
	ccf
	ret

swap_A_B_unpacked:
	; Exchange the unpacked A and B workspaces field by field.
	; Only registers A and B are used.

	;  sign
	ld a, (B_sign)
	ld b, a
	ld a, (A_sign)
	ld (B_sign), a
	ld a, b
	ld (A_sign), a

	;  exponent
	ld a, (B_exp)
	ld b, a
	ld a, (A_exp)
	ld (B_exp), a
	ld a, b
	ld (A_exp), a

	;  mantissa middle byte
	ld a, (B_m1)
	ld b, a
	ld a, (A_m1)
	ld (B_m1), a
	ld a, b
	ld (A_m1), a

	;  mantissa high byte
	ld a, (B_m2)
	ld b, a
	ld a, (A_m2)
	ld (B_m2), a
	ld a, b
	ld (A_m2), a

	;  mantissa low byte (done last; A and B end up holding the old A_m0)
	ld a, (A_m0)
	ld b, a
	ld a, (B_m0)
	ld (A_m0), a
	ld a, b
	ld (B_m0), a
	ret

	; ============================================================
	; 24-bit mantissa ops
	; ============================================================

add24_A_plus_B:
	; A_m = A_m + B_m (24-bit). Carry out of bit 23 is returned in the carry flag.
	; HL is used as an operand pointer and restored; B ends up holding B_m2.
	push hl
	ld   hl, B_m0
	ld   a, (A_m0)
	add  a, (hl)
	ld   (A_m0), a
	ld   hl, B_m1          ; ld does not disturb the carry chain
	ld   a, (A_m1)
	adc  a, (hl)
	ld   (A_m1), a
	ld   hl, B_m2
	ld   b, (hl)
	ld   a, (A_m2)
	adc  a, b
	ld   (A_m2), a
	pop  hl
	ret                    ; carry meaningful

sub24_A_minus_B:
	; A_m = A_m - B_m (24-bit). Borrow out of bit 23 is returned in the carry flag.
	; HL is used as an operand pointer and restored; B ends up holding B_m2.
	push hl
	ld   hl, B_m0
	ld   a, (A_m0)
	sub  (hl)
	ld   (A_m0), a
	ld   hl, B_m1          ; ld does not disturb the borrow chain
	ld   a, (A_m1)
	sbc  a, (hl)
	ld   (A_m1), a
	ld   hl, B_m2
	ld   b, (hl)
	ld   a, (A_m2)
	sbc  a, b
	ld   (A_m2), a
	pop  hl
	ret

is_A_mant_zero:
	; Out: Z set when A_m2 | A_m1 | A_m0 == 0. A holds that OR, B holds A_m2 | A_m1.
	push hl
	ld   hl, A_m1
	ld   a, (A_m2)
	or   (hl)
	ld   b, a
	ld   hl, A_m0
	or   (hl)
	pop  hl
	ret

shr24_A_1:
	; Logical shift of the 24-bit A mantissa one place right.
	; A zero enters bit 23; the old bit 0 leaves in the carry flag.
	; A returns the new A_m0; HL is preserved.
	push hl
	ld   hl, A_m2
	srl  (hl)
	ld   hl, A_m1
	rr   (hl)
	ld   hl, A_m0
	rr   (hl)
	ld   a, (hl)
	pop  hl
	ret

shl24_A_1:
	; Shift the 24-bit A mantissa one place left (A_m0 is the low byte).
	; Each byte is doubled with add/adc so the bit leaving one byte enters
	; the next; the old bit 23 is left in the carry flag. Only A is used.
	ld  a, (A_m0)
	add a, a
	ld  (A_m0), a
	ld  a, (A_m1)
	adc a, a
	ld  (A_m1), a
	ld  a, (A_m2)
	adc a, a
	ld  (A_m2), a
	ret

	; Shift B mantissa right by A bits (A=0..255)

shr24_B_by_A:
	; In:  A = shift count. Shifts the 24-bit B mantissa right by that many bits.
	; Counts of 24 or more clear the mantissa outright. SHCNT is the loop counter.
	ld   (SHCNT), a
	cp   24
	jr   nc, shr24_B_by_A_flush

shr24_B_by_A_ok:
	or   a
	ret  z                 ; nothing to shift

	push hl

shr24_B_by_A_loop:
	ld   hl, B_m2
	srl  (hl)
	ld   hl, B_m1
	rr   (hl)
	ld   hl, B_m0
	rr   (hl)
	ld   hl, SHCNT
	dec  (hl)
	jr   nz, shr24_B_by_A_loop
	ld   a, (hl)           ; A = 0, the exhausted count
	pop  hl
	ret

shr24_B_by_A_flush:
	xor  a
	ld   (B_m2), a
	ld   (B_m1), a
	ld   (B_m0), a
	ret

normalize_A_mant:
	; Shift the A mantissa left until bit 23 is set, decrementing A_exp once
	; per shift. A zero mantissa turns the value into zero (A_exp = 0).
	; The exponent is not checked for underflow.
	call is_A_mant_zero
	jr   z, normalize_A_mant_flush

normalize_A_mant_nz:
normalize_A_mant_left_loop:
	ld   a, (A_m2)
	bit  7, a
	ret  nz                ; hidden bit in place: normalized
	call shl24_A_1
	ld   a, (A_exp)
	dec  a
	ld   (A_exp), a
	jr   normalize_A_mant_left_loop

normalize_A_mant_flush:
	xor  a
	ld   (A_exp), a
	ret

	; ============================================================
	; 8x8 -> 16 multiply (unsigned), shift-add
	; in:  A = multiplicand, C = multiplier
	; out: HL = 16-bit product
	; ============================================================

mul8u:
	; Unsigned 8x8 -> 16 shift-and-add multiply.
	; In:  A = multiplicand, C = multiplier
	; Out: HL = product
	; Clobbers: B, C, DE, flags (A is left unchanged)
	;
	; The multiplicand is widened into DE and shifted as a 16-bit value, so
	; bits moving past bit 7 still contribute to the product.
	ld   hl, 0
	ld   e, a
	ld   d, h              ; DE = multiplicand, zero-extended
	ld   b, 8

mul8u_m8:
	srl  c                 ; next multiplier bit -> carry
	jr   nc, mul8u_noadd
	add  hl, de

mul8u_noadd:
	sla  e                 ; DE <<= 1
	rl   d
	djnz mul8u_m8
	ret

	; ============================================================
	; 24x24 schoolbook multiply into P0..P5 (P0 LSB)
	; ============================================================

mul24x24_schoolbook:
	; 48-bit product of the unpacked mantissas: P5..P0 = A_m * B_m.
	; Nine 8x8 partial products (A byte i times B byte j) are formed with
	; mul8u and accumulated at byte offset i+j by the add16_to_P_atN helpers.
	; mul8u returns its result in HL and also writes B, C, D and E, so none
	; of those registers survive this routine.

	;   clear the accumulator
	xor a
	ld  (P0), a
	ld  (P1), a
	ld  (P2), a
	ld  (P3), a
	ld  (P4), a
	ld  (P5), a

	;    A_m0 * B_m0 -> offset 0
	ld   a, (B_m0)
	ld   c, a
	ld   a, (A_m0)
	call mul8u
	call add16_to_P_at0

	;    A_m0 * B_m1 -> offset 1
	ld   a, (B_m1)
	ld   c, a
	ld   a, (A_m0)
	call mul8u
	call add16_to_P_at1

	;    A_m0 * B_m2 -> offset 2
	ld   a, (B_m2)
	ld   c, a
	ld   a, (A_m0)
	call mul8u
	call add16_to_P_at2

	;    A_m1 * B_m0 -> offset 1
	ld   a, (B_m0)
	ld   c, a
	ld   a, (A_m1)
	call mul8u
	call add16_to_P_at1

	;    A_m1 * B_m1 -> offset 2
	ld   a, (B_m1)
	ld   c, a
	ld   a, (A_m1)
	call mul8u
	call add16_to_P_at2

	;    A_m1 * B_m2 -> offset 3
	ld   a, (B_m2)
	ld   c, a
	ld   a, (A_m1)
	call mul8u
	call add16_to_P_at3

	;    A_m2 * B_m0 -> offset 2
	ld   a, (B_m0)
	ld   c, a
	ld   a, (A_m2)
	call mul8u
	call add16_to_P_at2

	;    A_m2 * B_m1 -> offset 3
	ld   a, (B_m1)
	ld   c, a
	ld   a, (A_m2)
	call mul8u
	call add16_to_P_at3

	;    A_m2 * B_m2 -> offset 4
	ld   a, (B_m2)
	ld   c, a
	ld   a, (A_m2)
	call mul8u
	call add16_to_P_at4

	ret

add16_to_P_at0:
	ld  a, (P0)
	add a, l
	ld  (P0), a
	ld  a, (P1)
	adc a, h
	ld  (P1), a
	ret

add16_to_P_at1:
	ld  a, (P1)
	add a, l
	ld  (P1), a
	ld  a, (P2)
	adc a, h
	ld  (P2), a
	ret

add16_to_P_at2:
	ld  a, (P2)
	add a, l
	ld  (P2), a
	ld  a, (P3)
	adc a, h
	ld  (P3), a
	ret

add16_to_P_at3:
	ld  a, (P3)
	add a, l
	ld  (P3), a
	ld  a, (P4)
	adc a, h
	ld  (P4), a
	ret

add16_to_P_at4:
	ld  a, (P4)
	add a, l
	ld  (P4), a
	ld  a, (P5)
	adc a, h
	ld  (P5), a
	ret

	; ============================================================
	; Normalize product P into A mantissa
	; P is 48-bit, P0 LSB .. P5 MSB
	; ============================================================

norm_product_to_A:
	; Reduce the 48-bit product P to a 24-bit mantissa in A_m2..A_m0.
	; If bit 47 is set the product is shifted right 24 places and A_exp is
	; incremented; otherwise it is shifted right 23 places.
	ld   a, (P5)
	bit  7, a
	jr   nz, norm_product_shift24

norm_product_shift23:
	ld   a, 23
	call shr48_P_by_A
	jr   norm_product_take

norm_product_shift24:
	ld   a, 24
	call shr48_P_by_A
	ld   a, (A_exp)
	inc  a
	ld   (A_exp), a

norm_product_take:
	;    the low three bytes of P now hold the mantissa
	ld   a, (P1)
	ld   (A_m1), a
	ld   a, (P2)
	ld   (A_m2), a
	ld   a, (P0)
	ld   (A_m0), a
	ret

shr48_P_by_A:
	; In:  A = shift count. Logical right shift of the 48-bit value P5..P0,
	; one bit per pass, with SHCNT as the pass counter. HL is preserved.
	ld   (SHCNT), a
	or   a
	ret  z                 ; count 0: nothing to do

	push hl

shr48_P_by_A_loop:
	ld   hl, P5
	srl  (hl)
	ld   hl, P4
	rr   (hl)
	ld   hl, P3
	rr   (hl)
	ld   hl, P2
	rr   (hl)
	ld   hl, P1
	rr   (hl)
	ld   hl, P0
	rr   (hl)
	ld   hl, SHCNT
	dec  (hl)
	jr   nz, shr48_P_by_A_loop
	ld   a, (hl)           ; A = 0, the exhausted count
	pop  hl
	ret

	; ============================================================
	; Mantissa division (restoring-style)
	; A_m = (A_m << 23) / B_m
	; ============================================================

div_mantissas_to_A:
	; Restoring division of the unpacked mantissas:
	;   A_m = floor((A_m << 23) / B_m)
	; The running remainder lives in P5..P3; the 24 quotient bits are shifted
	; into A_m2..A_m0. When A_m < B_m the quotient's top bit is 0 and the
	; caller is expected to renormalize.
	; Clobbers A, B, flags, P0..P5 and SHCNT.

	;   P = A_m as 48-bit, then shift left 23
	xor a
	ld  (P3), a
	ld  (P4), a
	ld  (P5), a
	ld  a, (A_m0)
	ld  (P0), a
	ld  a, (A_m1)
	ld  (P1), a
	ld  a, (A_m2)
	ld  (P2), a

	ld   a, 23
	call shl48_P_by_A

	;   clear quotient
	xor a
	ld  (A_m2), a
	ld  (A_m1), a
	ld  (A_m0), a

	ld b, 24

div_mantissas_loop:
	;    the 24-bit add/sub helpers use register B, so the loop counter is
	;    kept on the stack for the duration of each pass
	push bc
	call shl24_A_1
	call shl48_P_1

	;    A bit shifted out of P5 means the remainder is 2^24 or more, which
	;    exceeds any divisor: subtract unconditionally (the borrow cancels
	;    against the shifted-out bit) and record a 1.
	jr   c, div_mantissas_force

	;    subtract divisor from high 24 bits of P (P5..P3)
	call sub24_Phigh_minus_B
	jr   nc, div_mantissas_setbit

div_mantissas_restore:
	;    borrow: divisor did not fit, undo the subtraction (quotient bit 0)
	call add24_Phigh_plus_B
	jr   div_mantissas_next

div_mantissas_force:
	call sub24_Phigh_minus_B

div_mantissas_setbit:
	ld   a, (A_m0)
	or   0x1
	ld   (A_m0), a

div_mantissas_next:
	pop  bc
	djnz div_mantissas_loop
	ret

shl48_P_by_A:
	; In:  A = shift count. Shifts the 48-bit value P5..P0 left by that many
	; bits by calling shl48_P_1 repeatedly; SHCNT is the pass counter.
	; HL is preserved.
	ld   (SHCNT), a
	or   a
	ret  z                 ; count 0: nothing to do

	push hl
	ld   hl, SHCNT

shl48_P_by_A_loop:
	call shl48_P_1         ; uses only A and flags
	dec  (hl)
	jr   nz, shl48_P_by_A_loop
	ld   a, (hl)           ; A = 0, the exhausted count
	pop  hl
	ret

shl48_P_1:
	; Shift the 48-bit value P5..P0 one place left (P0 is the low byte).
	; Each byte is doubled with add/adc so the bit leaving one byte enters
	; the next; the old bit 47 is left in the carry flag. Only A is used.
	ld  a, (P0)
	add a, a
	ld  (P0), a
	ld  a, (P1)
	adc a, a
	ld  (P1), a
	ld  a, (P2)
	adc a, a
	ld  (P2), a
	ld  a, (P3)
	adc a, a
	ld  (P3), a
	ld  a, (P4)
	adc a, a
	ld  (P4), a
	ld  a, (P5)
	adc a, a
	ld  (P5), a
	ret

sub24_Phigh_minus_B:
	; P5..P3 = P5..P3 - B_m (24-bit). Carry set on return indicates a borrow.
	; HL is used as an operand pointer and restored; B ends up holding B_m2.
	push hl
	ld   hl, B_m0
	ld   a, (P3)
	sub  (hl)
	ld   (P3), a
	ld   hl, B_m1          ; ld does not disturb the borrow chain
	ld   a, (P4)
	sbc  a, (hl)
	ld   (P4), a
	ld   hl, B_m2
	ld   b, (hl)
	ld   a, (P5)
	sbc  a, b
	ld   (P5), a
	pop  hl
	ret                    ; carry set indicates borrow

add24_Phigh_plus_B:
	; P5..P3 = P5..P3 + B_m (24-bit); used to undo a failed trial subtraction.
	; HL is used as an operand pointer and restored; B ends up holding B_m2.
	push hl
	ld   hl, B_m0
	ld   a, (P3)
	add  a, (hl)
	ld   (P3), a
	ld   hl, B_m1          ; ld does not disturb the carry chain
	ld   a, (P4)
	adc  a, (hl)
	ld   (P4), a
	ld   hl, B_m2
	ld   b, (hl)
	ld   a, (P5)
	adc  a, b
	ld   (P5), a
	pop  hl
	ret

	; ============================================================
	; fp_print: fixed format printing
	; Prints: [-]I.FFFFFF (FRAC_DIGITS digits, truncated, not rounded)
	; In:  HL -> float (4 bytes)
	; Uses os_print_vec (A=char). Its register usage is not visible in this
	; file, so BC is saved around it inside counted loops.
	;
	; Method: the 24-bit mantissa M is placed in PR_INT (integer part) with
	; PR_R (32-bit binary fraction) below it, and the pair is shifted so the
	; binary point sits between them. Decimal digits of the fraction are then
	; produced by repeatedly multiplying PR_R by 10 and taking the overflow.
	; Magnitudes of 2^32 or more do not fit PR_INT and print as 4294967295.
	; Clobbers: AF, BC, DE, HL and the PR_* workspace.
	; ============================================================

fp_print:
	;    zero?
	ld   a, (hl)
	or   a
	jr   nz, fp_print_nz
	ld   a, '0'
	call os_print_vec
	ld   a, '.'
	call os_print_vec
	ld   b, FRAC_DIGITS

fp_print_zf:
	push bc
	ld   a, '0'
	call os_print_vec
	pop  bc
	djnz fp_print_zf
	ret

fp_print_nz:
	;    A still holds EXP; PR_E keeps it unbiased
	sub  FP_BIAS
	ld   (PR_E), a
	inc  hl

	;    sign + top fraction
	ld   a, (hl)
	ld   b, a
	rlca                   ; sign bit moves to bit 0
	and  1
	ld   (PR_SIGN), a

	;    mantissa with hidden 1 inserted
	ld   a, b
	or   0x80
	ld   (PR_M2), a
	inc  hl
	ld   a, (hl)
	ld   (PR_M1), a
	inc  hl
	ld   a, (hl)
	ld   (PR_M0), a

	;    print '-'
	ld   a, (PR_SIGN)
	or   a
	jr   z, fp_print_mag
	ld   a, '-'
	call os_print_vec

fp_print_mag:
	;    PR_INT = M (24-bit), PR_R = 0
	xor  a
	ld   (PR_INT3), a
	ld   (PR_R0), a
	ld   (PR_R1), a
	ld   (PR_R2), a
	ld   (PR_R3), a
	ld   a, (PR_M0)
	ld   (PR_INT0), a
	ld   a, (PR_M1)
	ld   (PR_INT1), a
	ld   a, (PR_M2)
	ld   (PR_INT2), a

	;    value = M * 2^S with S = EXP - (FP_BIAS + 23). The test is done on
	;    the biased exponent so it is a plain unsigned compare.
	ld   a, (PR_E)
	add  a, FP_BIAS
	cp   FP_BIAS+23
	jr   c, fp_print_S_neg

fp_print_S_nonneg:
	;    S >= 0: pure integer, INT = M << S. M has 24 bits, so S up to 8 fits.
	sub  FP_BIAS+23
	cp   9
	jr   nc, fp_print_saturate
	ld   b, a
	call shl32_INT_by_B
	jr   fp_print_print_int_and_frac

fp_print_saturate:
	ld   a, 0xFF
	ld   (PR_INT0), a
	ld   (PR_INT1), a
	ld   (PR_INT2), a
	ld   (PR_INT3), a
	jr   fp_print_print_int_and_frac

fp_print_S_neg:
	;    S < 0: shift INT:R right by n = (FP_BIAS + 23) - EXP.
	;    After 56 shifts every mantissa bit has left the 64-bit pair.
	ld   b, a
	ld   a, FP_BIAS+23
	sub  b
	cp   56
	jr   c, fp_print_S_neg_go
	ld   a, 56

fp_print_S_neg_go:
	ld   b, a
	call shr32_INT_to_INT_with_remainder

fp_print_print_int_and_frac:
	;    The fraction is parked on the stack while the integer part is
	;    printed, so the integer printer's scratch usage cannot disturb it.
	ld   a, (PR_R0)
	push af
	ld   a, (PR_R1)
	push af
	ld   a, (PR_R2)
	push af
	ld   a, (PR_R3)
	push af
	call print_u32_dec
	pop  af
	ld   (PR_R3), a
	pop  af
	ld   (PR_R2), a
	pop  af
	ld   (PR_R1), a
	pop  af
	ld   (PR_R0), a

	ld   a, '.'
	call os_print_vec
	ld   b, FRAC_DIGITS

fp_print_fr:
	push bc
	call mul_remainder_by_10   ; A = next decimal digit (0..9)
	add  a, '0'
	call os_print_vec
	pop  bc
	djnz fp_print_fr
	ret

	; Shift the 64-bit pair PR_INT3..PR_INT0 : PR_R3..PR_R0 right by B bits.
	; PR_R is cleared first, so on return PR_INT holds the integer quotient
	; and PR_R holds the shifted-out bits as a binary fraction (PR_R3 bit 7
	; is worth 1/2). Clobbers A, B and flags.

shr32_INT_to_INT_with_remainder:
	xor  a
	ld   (PR_R0), a
	ld   (PR_R1), a
	ld   (PR_R2), a
	ld   (PR_R3), a
	ld   a, b
	or   a
	ret  z

shr32_INT_to_INT_with_remainder_loop:
	ld   a, (PR_INT3)
	srl  a
	ld   (PR_INT3), a
	ld   a, (PR_INT2)
	rr   a
	ld   (PR_INT2), a
	ld   a, (PR_INT1)
	rr   a
	ld   (PR_INT1), a
	ld   a, (PR_INT0)
	rr   a
	ld   (PR_INT0), a
	;    the bit leaving the integer part enters the top of the fraction
	ld   a, (PR_R3)
	rr   a
	ld   (PR_R3), a
	ld   a, (PR_R2)
	rr   a
	ld   (PR_R2), a
	ld   a, (PR_R1)
	rr   a
	ld   (PR_R1), a
	ld   a, (PR_R0)
	rr   a
	ld   (PR_R0), a
	djnz shr32_INT_to_INT_with_remainder_loop
	ret

	; Shift PR_INT3..PR_INT0 left by B bits (B = 0 returns immediately).
	; Clobbers A, B and flags.

shl32_INT_by_B:
	ld  a, b
	or  a
	ret z

shl32_INT_by_B_loop:
	ld   a, (PR_INT0)
	add  a, a
	ld   (PR_INT0), a
	ld   a, (PR_INT1)
	adc  a, a
	ld   (PR_INT1), a
	ld   a, (PR_INT2)
	adc  a, a
	ld   (PR_INT2), a
	ld   a, (PR_INT3)
	adc  a, a
	ld   (PR_INT3), a
	djnz shl32_INT_by_B_loop
	ret

	; PR_R (32-bit binary fraction) = PR_R * 10, keeping the low 32 bits.
	; Out: A = the integer that overflowed past PR_R3, i.e. the next decimal
	; digit (0..9). Clobbers C, DE, HL and flags.

mul_remainder_by_10:
	ld   c, 0
	ld   a, (PR_R0)
	call mul_remainder_by_10_byte
	ld   (PR_R0), a
	ld   a, (PR_R1)
	call mul_remainder_by_10_byte
	ld   (PR_R1), a
	ld   a, (PR_R2)
	call mul_remainder_by_10_byte
	ld   (PR_R2), a
	ld   a, (PR_R3)
	call mul_remainder_by_10_byte
	ld   (PR_R3), a
	ld   a, c
	ret

	; One byte of the multiply above: HL = A*10 + C.
	; Out: A = low byte of the result, C = high byte (carry into the next byte).

mul_remainder_by_10_byte:
	ld   l, a
	ld   h, 0
	ld   d, h
	ld   e, l
	add  hl, hl            ; *2
	add  hl, hl            ; *4
	add  hl, de            ; *5
	add  hl, hl            ; *10
	ld   e, c
	ld   d, 0
	add  hl, de            ; + carry in
	ld   c, h
	ld   a, l
	ret

	; Print PR_INT (u32) as decimal via os_print_vec.
	; PR_INT is divided down to zero in the process.
	; Digits come out least-significant first, so they are pushed on the
	; stack (at most 10 entries for a 32-bit value) and popped for printing.
	; The digit count is kept in DIGLEN rather than in a register because
	; os_print_vec's register usage is not visible in this file.
	; Clobbers AF, BC, DE, HL.

print_u32_dec:
print_u32_dec_nz:
	xor  a
	ld   (DIGLEN), a

print_u32_dec_dloop:
	call u32_div10_inplace; remainder in A, quotient back in PR_INT
	add  a, '0'
	push af
	ld   hl, DIGLEN
	inc  (hl)
	;    more digits while the quotient is non-zero
	ld   a, (PR_INT0)
	ld   b, a
	ld   a, (PR_INT1)
	or   b
	ld   b, a
	ld   a, (PR_INT2)
	or   b
	ld   b, a
	ld   a, (PR_INT3)
	or   b
	jr   nz, print_u32_dec_dloop

print_u32_dec_pr:
	;    most significant digit is on top of the stack
	pop  af
	call os_print_vec
	ld   hl, DIGLEN
	dec  (hl)
	jr   nz, print_u32_dec_pr
	ret

	; Divide PR_INT (u32) by 10 in place, return remainder in A (0..9).
	; Long division one byte at a time, most significant byte first:
	; PR_INT3, PR_INT2, PR_INT1, PR_INT0. Each byte is addressed by name so
	; the walk does not depend on how the workspace is laid out.
	; Clobbers B, C, DE, HL.

u32_div10_inplace:
	ld   b, 0; remainder
	ld   hl, PR_INT3
	call u32_div10_step
	ld   hl, PR_INT2
	call u32_div10_step
	ld   hl, PR_INT1
	call u32_div10_step
	ld   hl, PR_INT0
	call u32_div10_step
	ld   a, b
	ret

u32_div10_step:
	; One long-division step by repeated subtraction.
	; In:  B = remainder carried in, HL -> current byte
	; Out: (HL) = quotient byte, B = new remainder
	;  DE = remainder*256 + byte, C counts how many tens fit
	ld d, b
	ld e, (hl)
	ld c, 0

u32_div10_div:
	;  keep going while DE >= 10
	ld a, d
	or a
	jr nz, u32_div10_sub
	ld a, e
	cp 10
	jr nc, u32_div10_sub

u32_div10_done:
	ld (hl), c
	ld b, e
	ret

u32_div10_sub:
	;  DE -= 10
	ld  a, e
	sub 10
	ld  e, a
	jr  nc, u32_div10_counted
	dec d

u32_div10_counted:
	inc c
	jr  u32_div10_div

	; ============================================================
	; fp_parse: parse decimal string -> float
	; DE -> "[+|-]ddd[.ddd]\0"
	; HL -> output float (HL is returned unchanged)
	; All digits are accumulated into one 32-bit integer (P_S); the count of
	; fraction digits k is kept in P_FRACN and the result is divided by 10^k.
	; At most MAX_FRAC fraction digits are consumed; parsing stops there.
	; The 32-bit accumulator is not checked for overflow.
	; ============================================================

fp_parse:
	xor a
	ld  (P_SIGN), a
	ld  (P_FRACN), a
	ld  (P_S0), a
	ld  (P_S1), a
	ld  (P_S2), a
	ld  (P_S3), a

	;   optional sign
	ld  a, (de)
	cp  '-'
	jr  nz, fp_parse_chkplus
	ld  a, 1
	ld  (P_SIGN), a
	inc de
	jr  fp_parse_intpart

fp_parse_chkplus:
	cp  '+'
	jr  nz, fp_parse_intpart
	inc de

fp_parse_intpart:
	ld   a, (de)
	call is_digit
	jr   nc, fp_parse_maybe_dot

fp_parse_il:
	ld   a, (de)
	sub  '0'
	ld   c, a
	call u32_mul10_scaled
	call u32_add8_scaled
	inc  de
	ld   a, (de)
	call is_digit
	jr   c, fp_parse_il

fp_parse_maybe_dot:
	ld  a, (de)
	cp  '.'
	jr  nz, fp_parse_finish_scaled
	inc de

fp_parse_fl:
	;    The digit limit is checked against P_FRACN in memory: the multiply
	;    helper uses register B, so B cannot serve as a loop counter here.
	ld   a, (P_FRACN)
	cp   MAX_FRAC
	jr   nc, fp_parse_finish_scaled
	ld   a, (de)
	call is_digit
	jr   nc, fp_parse_finish_scaled
	ld   a, (de)
	sub  '0'
	ld   c, a
	call u32_mul10_scaled
	call u32_add8_scaled
	ld   a, (P_FRACN)
	inc  a
	ld   (P_FRACN), a
	inc  de
	jr   fp_parse_fl

fp_parse_finish_scaled:
	;    convert scaled u32 to float into (HL)
	;    (the converter advances HL, so the output pointer is saved)
	push hl
	call fp_from_u32_scaled_to_A
	pop  hl

	;  divide by 10^k if needed
	ld a, (P_FRACN)
	or a
	jr z, fp_parse_apply_sign

	;    DE = &pow10_table[k] (4 bytes per entry)
	push hl
	ld   e, a
	ld   d, 0
	ld   hl, pow10_table
	add  hl, de
	add  hl, de
	add  hl, de
	add  hl, de
	ex   de, hl
	pop  hl

	;    fp_div also leaves HL advanced; keep the output pointer
	push hl
	call fp_div
	pop  hl

fp_parse_apply_sign:
	ld  a, (P_SIGN)
	or  a
	ret z
	;   leave a zero result as canonical all-zero bytes
	ld  a, (hl)
	or  a
	ret z
	inc hl
	ld  a, (hl)
	xor 0x80
	ld  (hl), a
	dec hl
	ret

is_digit:
	; In:  A = character. Out: carry set if A is '0'..'9', clear otherwise.
	; A is preserved.
	cp '0'
	jr c, is_digit_no      ; below '0'
	cp '9'+1               ; carry set when A <= '9'
	jr c, is_digit_yes

is_digit_no:
	or a                   ; clears carry
	ret

is_digit_yes:
	scf
	ret

	; P_S = P_S*10  (uses PR_INT and PR_R0..3 as scratch)

u32_mul10_scaled:
	; P_S = P_S * 10, computed as 2*P_S + 8*P_S.
	; PR_INT ends up holding 2*P_S and PR_R holds 8*P_S (both are scratch here).
	; Clobbers A and B.

	;    seed both scratch registers with P in one pass
	ld   a, (P_S0)
	ld   (PR_INT0), a
	ld   (PR_R0), a
	ld   a, (P_S1)
	ld   (PR_INT1), a
	ld   (PR_R1), a
	ld   a, (P_S2)
	ld   (PR_INT2), a
	ld   (PR_R2), a
	ld   a, (P_S3)
	ld   (PR_INT3), a
	ld   (PR_R3), a

	;    PR_INT = P * 2
	ld   b, 1
	call shl32_INT_by_B

	;    PR_R = P * 8
	ld   b, 3
	call shl32_R_by_B

	;    P = PR_INT + PR_R, low byte first
	ld   a, (PR_R0)
	ld   b, a
	ld   a, (PR_INT0)
	add  a, b
	ld   (P_S0), a
	ld   a, (PR_R1)
	ld   b, a
	ld   a, (PR_INT1)
	adc  a, b
	ld   (P_S1), a
	ld   a, (PR_R2)
	ld   b, a
	ld   a, (PR_INT2)
	adc  a, b
	ld   (P_S2), a
	ld   a, (PR_R3)
	ld   b, a
	ld   a, (PR_INT3)
	adc  a, b
	ld   (P_S3), a
	ret

shl32_R_by_B:
	; Shift the 32-bit value PR_R3..PR_R0 left by B bits (PR_R0 is the low byte).
	; B = 0 returns immediately. Clobbers A, B and flags.
	ld  a, b
	or  a
	ret z

shl32_R_by_B_loop:
	;    one bit per pass: add/adc doubles each byte and carries into the next
	ld   a, (PR_R0)
	add  a, a
	ld   (PR_R0), a
	ld   a, (PR_R1)
	adc  a, a
	ld   (PR_R1), a
	ld   a, (PR_R2)
	adc  a, a
	ld   (PR_R2), a
	ld   a, (PR_R3)
	adc  a, a
	ld   (PR_R3), a
	djnz shl32_R_by_B_loop
	ret

	; P_S += C (0..9)

u32_add8_scaled:
	; P_S = P_S + C, with the carry rippled through all four bytes.
	; HL is used as a pointer and restored; C is not modified.
	push hl
	ld   hl, P_S0
	ld   a, (hl)
	add  a, c
	ld   (hl), a
	ld   hl, P_S1          ; ld does not disturb the carry chain
	ld   a, (hl)
	adc  a, 0
	ld   (hl), a
	ld   hl, P_S2
	ld   a, (hl)
	adc  a, 0
	ld   (hl), a
	ld   hl, P_S3
	ld   a, (hl)
	adc  a, 0
	ld   (hl), a
	pop  hl
	ret

	; Convert P_S (u32) to float at (HL). Positive only; sign handled by caller.
	; HL is left pointing at the last byte written (HL + 3).
	; The value is shifted left until its top set bit reaches bit 31; the
	; upper 24 bits are then the mantissa and the low 8 bits are truncated.
	; Uses PR_INT as scratch. Clobbers A, B, C.

fp_from_u32_scaled_to_A:
	ld  a, (P_S0)
	ld  b, a
	ld  a, (P_S1)
	or  b
	ld  b, a
	ld  a, (P_S2)
	or  b
	ld  b, a
	ld  a, (P_S3)
	or  b
	jr  nz, fp_from_u32_scaled_to_A_nz
	ld  (hl), 0
	inc hl
	ld  (hl), 0
	inc hl
	ld  (hl), 0
	inc hl
	ld  (hl), 0
	ret

fp_from_u32_scaled_to_A_nz:
	;  find MSB index in B (0..31): start at the top bit of the highest
	;  non-zero byte, then walk down inside that byte
	ld b, 31
	ld a, (P_S3)
	ld c, a
	or a
	jr nz, fp_from_u32_scaled_to_A_scan
	ld b, 23
	ld a, (P_S2)
	ld c, a
	or a
	jr nz, fp_from_u32_scaled_to_A_scan
	ld b, 15
	ld a, (P_S1)
	ld c, a
	or a
	jr nz, fp_from_u32_scaled_to_A_scan
	ld b, 7
	ld a, (P_S0)
	ld c, a

fp_from_u32_scaled_to_A_scan:
fp_from_u32_scaled_to_A_find:
	bit 7, c
	jr  nz, fp_from_u32_scaled_to_A_found
	ld  a, c
	add a, a
	ld  c, a
	dec b
	jr  fp_from_u32_scaled_to_A_find

fp_from_u32_scaled_to_A_found:
	;   EXP = FP_BIAS + B
	ld  a, b
	add a, FP_BIAS
	ld  (hl), a
	inc hl

	;   shift count = 31 - B puts the MSB at bit 31, so PR_INT3..PR_INT1 is
	;   the 24-bit mantissa. B is 0..31, so the count never goes negative.
	ld  a, 31
	sub b
	ld  b, a

	;    PR_INT = P_S
	ld   a, (P_S0)
	ld   (PR_INT0), a
	ld   a, (P_S1)
	ld   (PR_INT1), a
	ld   a, (P_S2)
	ld   (PR_INT2), a
	ld   a, (P_S3)
	ld   (PR_INT3), a
	call shl32_INT_by_B

	;   store sign=0, fraction = top 23 bits of mantissa (hidden 1 removed)
	ld  a, (PR_INT3)
	and 0x7F
	ld  (hl), a
	inc hl
	ld  a, (PR_INT2)
	ld  (hl), a
	inc hl
	ld  a, (PR_INT1)
	ld  (hl), a
	ret

	;       ============================================================
	;       BSS / WORKSPACE
	;       ============================================================
	.balign 16
	.bss

	; Unpacked A: exponent, sign (0/1), 24-bit mantissa with hidden 1 (m2 = high byte)
	A_exp: .space 1
	A_sign: .space 1
	A_m2: .space 1
	A_m1: .space 1
	A_m0: .space 1

	; Unpacked B: same layout as A
	B_exp: .space 1
	B_sign: .space 1
	B_m2: .space 1
	B_m1: .space 1
	B_m0: .space 1

	; 48-bit workspace (P0 LSB .. P5 MSB)

P0:
	.space 1

P1:
	.space 1

P2:
	.space 1

P3:
	.space 1

P4:
	.space 1

P5:
	.space 1

	; Loop counter for the multi-bit shift routines

SHCNT:
	.space 1

	; Print temps (PR_INT0 and PR_R0 are the low bytes of their 32-bit values)
	PR_SIGN: .space 1
	PR_E: .space 1
	PR_M2: .space 1
	PR_M1: .space 1
	PR_M0: .space 1
	PR_INT0: .space 1
	PR_INT1: .space 1
	PR_INT2: .space 1
	PR_INT3: .space 1
	PR_R0: .space 1
	PR_R1: .space 1
	PR_R2: .space 1
	PR_R3: .space 1

	; Parse temps (P_S0 is the low byte of the 32-bit accumulator)
	P_SIGN: .space 1
	P_FRACN: .space 1
	P_S0: .space 1
	P_S1: .space 1
	P_S2: .space 1
	P_S3: .space 1

	; Digit buffer: one byte per decimal digit of a u32 (4294967295 has 10),
	; so that indexed stores never run into DIGLEN.

DIGBUF:
	.space 10

DIGLEN:
	.space 1

	;        ============================================================
	;        pow10_table: 10^k constants (k=0..6) in THIS float encoding
	;        Verified:
	;        1.0      = 127 00 00 00
	;        10.0     = 130 20 00 00
	;        100.0    = 133 48 00 00
	;        1000.0   = 136 7A 00 00
	;        10000.0  = 140 1C 40 00
	;        100000.0 = 143 43 50 00
	;        1000000.0= 146 74 24 00
	;        ============================================================
	.section "zone", "acrx"

	; Entry k is 10^k as a 4-byte float (EXP, sign|fraction high, mid, low).
	; fp_parse indexes this table with k*4 and passes the entry to fp_div as
	; the divisor, so entries are only ever read.

pow10_table:
	.byte 127, 0x00, 0x00, 0x00; 10^0 = 1
	.byte 130, 0x20, 0x00, 0x00; 10^1 = 10
	.byte 133, 0x48, 0x00, 0x00; 10^2 = 100
	.byte 136, 0x7A, 0x00, 0x00; 10^3 = 1000
	.byte 140, 0x1C, 0x40, 0x00; 10^4 = 10000
	.byte 143, 0x43, 0x50, 0x00; 10^5 = 100000
	.byte 146, 0x74, 0x24, 0x00; 10^6 = 1000000