mirror of
https://github.com/NotAShelf/goblin.git
synced 2024-11-22 21:31:07 +00:00
210 lines
3.5 KiB
ArmAsm
210 lines
3.5 KiB
ArmAsm
//go:build !appengine && gc && !purego
|
|
// +build !appengine
|
|
// +build gc
|
|
// +build !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// Register assignments shared by the macros and functions in this file.

// Input cursor.
#define p      SI  // read pointer, advanced through b
#define n      DX  // len(b)
#define end    BX  // bound that the loops compare p against

// Accumulator lanes updated by blockLoop.
#define v1     R8
#define v2     R9
#define v3     R10
#define v4     R11

// Scratch register for the word currently being mixed.
#define x      R12

// AX does double duty: h is the running hash in Sum64, d holds the first
// argument of writeBlocks. Each function uses only one of the two names.
#define h      AX
#define d      AX

// Multipliers loaded from ·primes at offsets 0, 8 and 24 respectively.
#define prime1 R13
#define prime2 R14
#define prime4 DI
|
|
|
|
// round folds one input word into an accumulator lane:
//
//	state = rol64(state + input*prime2, 31) * prime1
//
// input is overwritten with input*prime2. prime1 and prime2 must already be
// loaded. (No comments inside the body: a // comment would swallow the line
// continuation.)
#define round(state, input) \
	IMULQ prime2, input \
	ADDQ  input, state  \
	ROLQ  $31, state    \
	IMULQ prime1, state
|
|
|
|
// round0 is round with a zero accumulator, computed in place:
//
//	word = rol64(word*prime2, 31) * prime1
//
// prime1 and prime2 must already be loaded.
#define round0(word) \
	IMULQ prime2, word \
	ROLQ  $31, word    \
	IMULQ prime1, word
|
|
|
|
// mergeRound mixes one lane into the hash:
//
//	lane = rol64(lane*prime2, 31) * prime1
//	hash = (hash ^ lane) * prime1 + prime4
//
// lane is clobbered. prime1, prime2, and prime4 must already be loaded.
#define mergeRound(hash, lane) \
	IMULQ prime2, lane \
	ROLQ  $31, lane    \
	IMULQ prime1, lane \
	XORQ  lane, hash   \
	IMULQ prime1, hash \
	ADDQ  prime4, hash
|
|
|
|
// blockLoop consumes 32 bytes per iteration, feeding the four 8-byte words at
// p into v1, v2, v3, and v4 in that order, then advancing p by 32. It keeps
// going while p <= end (signed comparison). The body runs before the first
// comparison, so the caller must guarantee that at least one full block is
// available. x is used as scratch; prime1 and prime2 must be loaded.
#define blockLoop() \
nextBlock: \
	MOVQ (p), x    \
	round(v1, x)   \
	MOVQ 8(p), x   \
	round(v2, x)   \
	MOVQ 16(p), x  \
	round(v3, x)   \
	MOVQ 24(p), x  \
	round(v4, x)   \
	ADDQ $32, p    \
	CMPQ p, end    \
	JLE  nextBlock
|
|
|
|
// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single pass and stores the 64-bit result in ret+24(FP).
// Phases: optional 32-byte block loop, then 8-byte words, then at most one
// 4-byte word, then single bytes, then a final shift/xor/multiply mix.
// All pointer comparisons below use signed branch conditions.
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
	// Slice header. end starts as &b[len(b)] - 32, the bound for blockLoop.
	MOVQ b_len+8(FP), n
	MOVQ b_base+0(FP), p
	LEAQ -32(p)(n*1), end

	// Keep the multipliers in registers for the whole function.
	MOVQ ·primes+24(SB), prime4
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+0(SB), prime1

	// Inputs shorter than one block skip the four-lane phase entirely.
	CMPQ n, $32
	JLT  shortInput

	// Seed the lanes: v1 = prime1+prime2, v2 = prime2, v3 = 0, v4 = -prime1.
	XORQ v3, v3
	MOVQ prime2, v2
	MOVQ prime2, v1
	ADDQ prime1, v1
	MOVQ prime1, v4
	NEGQ v4

	blockLoop()

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ v1, h
	ROLQ $1, h

	MOVQ v2, x
	ROLQ $7, x
	ADDQ x, h

	MOVQ v3, x
	ROLQ $12, x
	ADDQ x, h

	MOVQ v4, x
	ROLQ $18, x
	ADDQ x, h

	// Fold each lane back into h; the order matters.
	mergeRound(h, v1)
	mergeRound(h, v2)
	mergeRound(h, v3)
	mergeRound(h, v4)

	JMP addLength

shortInput:
	// No block phase: h starts from the word at offset 32 of ·primes.
	MOVQ ·primes+32(SB), h

addLength:
	ADDQ n, h

	// end = &b[len(b)] - 8: consume 8-byte words while p <= end.
	ADDQ $24, end
	CMPQ p, end
	JG   tail4

words8:
	MOVQ  (p), x
	round0(x)
	XORQ  x, h
	ROLQ  $27, h
	IMULQ prime1, h
	ADDQ  prime4, h

	ADDQ $8, p
	CMPQ p, end
	JLE  words8

tail4:
	// end = &b[len(b)] - 4: consume one 4-byte word if p <= end.
	ADDQ $4, end
	CMPQ p, end
	JG   tail1

	// MOVL zero-extends the 32-bit load into x.
	MOVL  (p), x
	IMULQ prime1, x
	XORQ  x, h
	ROLQ  $23, h
	IMULQ prime2, h
	ADDQ  ·primes+16(SB), h
	ADDQ  $4, p

tail1:
	// end = &b[len(b)]: consume the remaining bytes one at a time.
	ADDQ $4, end
	CMPQ p, end
	JGE  avalanche

bytes1:
	MOVBQZX (p), x
	IMULQ   ·primes+32(SB), x
	XORQ    x, h
	ROLQ    $11, h
	IMULQ   prime1, h

	ADDQ $1, p
	CMPQ p, end
	JL   bytes1

avalanche:
	// h ^= h >> 33; h *= prime2
	MOVQ  h, x
	SHRQ  $33, x
	XORQ  x, h
	IMULQ prime2, h

	// h ^= h >> 29; h *= (word at offset 16 of ·primes)
	MOVQ  h, x
	SHRQ  $29, x
	XORQ  x, h
	IMULQ ·primes+16(SB), h

	// h ^= h >> 32
	MOVQ h, x
	SHRQ $32, x
	XORQ x, h

	MOVQ h, ret+24(FP)
	RET
|
|
|
|
// func writeBlocks(d *Digest, b []byte) int
//
// writeBlocks runs blockLoop over b, starting from the four 8-byte lanes
// stored at offsets 0, 8, 16 and 24 of the first argument and writing them
// back afterwards. It returns the number of bytes consumed. blockLoop does
// not test its condition before the first iteration, so callers must pass at
// least 32 bytes.
//
// NOTE(review): the first argument is addressed as s+0(FP) although the
// signature above names it d; presumably this is because d is #defined to AX
// in this file and would be macro-expanded. Confirm against the Go
// declaration (go vet's asmdecl check compares these names).
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
	// Slice header. end = &b[len(b)] - 32 is the bound for blockLoop.
	MOVQ b_len+16(FP), n
	MOVQ b_base+8(FP), p
	LEAQ -32(p)(n*1), end

	// Only round() is used here, so prime1 and prime2 are enough.
	MOVQ ·primes+8(SB), prime2
	MOVQ ·primes+0(SB), prime1

	// Bring the lanes into registers.
	MOVQ s+0(FP), d
	MOVQ 24(d), v4
	MOVQ 16(d), v3
	MOVQ 8(d), v2
	MOVQ 0(d), v1

	blockLoop()

	// Persist the updated lanes.
	MOVQ v4, 24(d)
	MOVQ v3, 16(d)
	MOVQ v2, 8(d)
	MOVQ v1, 0(d)

	// Bytes consumed = final p minus the original base pointer.
	SUBQ b_base+8(FP), p
	MOVQ p, ret+32(FP)

	RET
|