// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
// Authors:
// Ilya Albrekht <[email protected]>
// Maxim Locktyukhin <[email protected]>
// Ronen Zohar <[email protected]>
// Chandramouli Narayanan <[email protected]>


#include "textflag.h"

// SHA-1 block routine. See sha1block.go for Go equivalent.
//
// There are 80 rounds of 4 types:
//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
//
// Each round loads or shuffles the data, then computes a per-round
// function of b, c, d, and then mixes the result into and rotates the
// five registers a, b, c, d, e holding the intermediate results.
//
// The register rotation is implemented by rotating the arguments to
// the round macros instead of by explicit move instructions.

#define LOAD(index) \
	MOVL	(index*4)(SI), R10; \
	BSWAPL	R10; \
	MOVL	R10, (index*4)(SP)

#define SHUFFLE(index) \
	MOVL	(((index)&0xf)*4)(SP), R10; \
	XORL	(((index-3)&0xf)*4)(SP), R10; \
	XORL	(((index-8)&0xf)*4)(SP), R10; \
	XORL	(((index-14)&0xf)*4)(SP), R10; \
	ROLL	$1, R10; \
	MOVL	R10, (((index)&0xf)*4)(SP)

#define FUNC1(a, b, c, d, e) \
	MOVL	d, R9; \
	XORL	c, R9; \
	ANDL	b, R9; \
	XORL	d, R9

#define FUNC2(a, b, c, d, e) \
	MOVL	b, R9; \
	XORL	c, R9; \
	XORL	d, R9

#define FUNC3(a, b, c, d, e) \
	MOVL	b, R8; \
	ORL	c, R8; \
	ANDL	d, R8; \
	MOVL	b, R9; \
	ANDL	c, R9; \
	ORL	R8, R9

#define FUNC4 FUNC2

#define MIX(a, b, c, d, e, const) \
	ROLL	$30, b; \
	ADDL	R9, e; \
	MOVL	a, R8; \
	ROLL	$5, R8; \
	LEAL	const(e)(R10*1), e; \
	ADDL	R8, e

#define ROUND1(a, b, c, d, e, index) \
	LOAD(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND1x(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC1(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x5A827999)

#define ROUND2(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC2(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x6ED9EBA1)

#define ROUND3(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC3(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0x8F1BBCDC)

#define ROUND4(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC4(a, b, c, d, e); \
	MIX(a, b, c, d, e, 0xCA62C1D6)

TEXT ·blockAMD64(SB),NOSPLIT,$64-32
	MOVQ	dig+0(FP),	BP
	MOVQ	p_base+8(FP),	SI
	MOVQ	p_len+16(FP),	DX
	SHRQ	$6,		DX
	SHLQ	$6,		DX

	LEAQ	(SI)(DX*1),	DI
	MOVL	(0*4)(BP),	AX
	MOVL	(1*4)(BP),	BX
	MOVL	(2*4)(BP),	CX
	MOVL	(3*4)(BP),	DX
	MOVL	(4*4)(BP),	BP

	CMPQ	SI,		DI
	JEQ	end

loop:
	MOVL	AX,	R11
	MOVL	BX,	R12
	MOVL	CX,	R13
	MOVL	DX,	R14
	MOVL	BP,	R15

	ROUND1(AX, BX, CX, DX, BP, 0)
	ROUND1(BP, AX, BX, CX, DX, 1)
	ROUND1(DX, BP, AX, BX, CX, 2)
	ROUND1(CX, DX, BP, AX, BX, 3)
	ROUND1(BX, CX, DX, BP, AX, 4)
	ROUND1(AX, BX, CX, DX, BP, 5)
	ROUND1(BP, AX, BX, CX, DX, 6)
	ROUND1(DX, BP, AX, BX, CX, 7)
	ROUND1(CX, DX, BP, AX, BX, 8)
	ROUND1(BX, CX, DX, BP, AX, 9)
	ROUND1(AX, BX, CX, DX, BP, 10)
	ROUND1(BP, AX, BX, CX, DX, 11)
	ROUND1(DX, BP, AX, BX, CX, 12)
	ROUND1(CX, DX, BP, AX, BX, 13)
	ROUND1(BX, CX, DX, BP, AX, 14)
	ROUND1(AX, BX, CX, DX, BP, 15)

	ROUND1x(BP, AX, BX, CX, DX, 16)
	ROUND1x(DX, BP, AX, BX, CX, 17)
	ROUND1x(CX, DX, BP, AX, BX, 18)
	ROUND1x(BX, CX, DX, BP, AX, 19)

	ROUND2(AX, BX, CX, DX, BP, 20)
	ROUND2(BP, AX, BX, CX, DX, 21)
	ROUND2(DX, BP, AX, BX, CX, 22)
	ROUND2(CX, DX, BP, AX, BX, 23)
	ROUND2(BX, CX, DX, BP, AX, 24)
	ROUND2(AX, BX, CX, DX, BP, 25)
	ROUND2(BP, AX, BX, CX, DX, 26)
	ROUND2(DX, BP, AX, BX, CX, 27)
	ROUND2(CX, DX, BP, AX, BX, 28)
	ROUND2(BX, CX, DX, BP, AX, 29)
	ROUND2(AX, BX, CX, DX, BP, 30)
	ROUND2(BP, AX, BX, CX, DX, 31)
	ROUND2(DX, BP, AX, BX, CX, 32)
	ROUND2(CX, DX, BP, AX, BX, 33)
	ROUND2(BX, CX, DX, BP, AX, 34)
	ROUND2(AX, BX, CX, DX, BP, 35)
	ROUND2(BP, AX, BX, CX, DX, 36)
	ROUND2(DX, BP, AX, BX, CX, 37)
	ROUND2(CX, DX, BP, AX, BX, 38)
	ROUND2(BX, CX, DX, BP, AX, 39)

	ROUND3(AX, BX, CX, DX, BP, 40)
	ROUND3(BP, AX, BX, CX, DX, 41)
	ROUND3(DX, BP, AX, BX, CX, 42)
	ROUND3(CX, DX, BP, AX, BX, 43)
	ROUND3(BX, CX, DX, BP, AX, 44)
	ROUND3(AX, BX, CX, DX, BP, 45)
	ROUND3(BP, AX, BX, CX, DX, 46)
	ROUND3(DX, BP, AX, BX, CX, 47)
	ROUND3(CX, DX, BP, AX, BX, 48)
	ROUND3(BX, CX, DX, BP, AX, 49)
	ROUND3(AX, BX, CX, DX, BP, 50)
	ROUND3(BP, AX, BX, CX, DX, 51)
	ROUND3(DX, BP, AX, BX, CX, 52)
	ROUND3(CX, DX, BP, AX, BX, 53)
	ROUND3(BX, CX, DX, BP, AX, 54)
	ROUND3(AX, BX, CX, DX, BP, 55)
	ROUND3(BP, AX, BX, CX, DX, 56)
	ROUND3(DX, BP, AX, BX, CX, 57)
	ROUND3(CX, DX, BP, AX, BX, 58)
	ROUND3(BX, CX, DX, BP, AX, 59)

	ROUND4(AX, BX, CX, DX, BP, 60)
	ROUND4(BP, AX, BX, CX, DX, 61)
	ROUND4(DX, BP, AX, BX, CX, 62)
	ROUND4(CX, DX, BP, AX, BX, 63)
	ROUND4(BX, CX, DX, BP, AX, 64)
	ROUND4(AX, BX, CX, DX, BP, 65)
	ROUND4(BP, AX, BX, CX, DX, 66)
	ROUND4(DX, BP, AX, BX, CX, 67)
	ROUND4(CX, DX, BP, AX, BX, 68)
	ROUND4(BX, CX, DX, BP, AX, 69)
	ROUND4(AX, BX, CX, DX, BP, 70)
	ROUND4(BP, AX, BX, CX, DX, 71)
	ROUND4(DX, BP, AX, BX, CX, 72)
	ROUND4(CX, DX, BP, AX, BX, 73)
	ROUND4(BX, CX, DX, BP, AX, 74)
	ROUND4(AX, BX, CX, DX, BP, 75)
	ROUND4(BP, AX, BX, CX, DX, 76)
	ROUND4(DX, BP, AX, BX, CX, 77)
	ROUND4(CX, DX, BP, AX, BX, 78)
	ROUND4(BX, CX, DX, BP, AX, 79)

	ADDL	R11, AX
	ADDL	R12, BX
	ADDL	R13, CX
	ADDL	R14, DX
	ADDL	R15, BP

	ADDQ	$64, SI
	CMPQ	SI, DI
	JB	loop

end:
	MOVQ	dig+0(FP), DI
	MOVL	AX, (0*4)(DI)
	MOVL	BX, (1*4)(DI)
	MOVL	CX, (2*4)(DI)
	MOVL	DX, (3*4)(DI)
	MOVL	BP, (4*4)(DI)
	RET


// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
// From http://software.intel.com/en-us/articles
// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
// This implementation is 2x unrolled, and interleaves vector instructions,
// used to precompute W, with scalar computation of current round
// for optimal scheduling.

// Trivial helper macros.
#define UPDATE_HASH(A,TB,C,D,E) \
	ADDL	(R9), A \
	MOVL	A, (R9) \
	ADDL	4(R9), TB \
	MOVL	TB, 4(R9) \
	ADDL	8(R9), C \
	MOVL	C, 8(R9) \
	ADDL	12(R9), D \
	MOVL	D, 12(R9) \
	ADDL	16(R9), E \
	MOVL	E, 16(R9)



// Helper macros for PRECALC, which does precomputations
#define PRECALC_0(OFFSET) \
	VMOVDQU   OFFSET(R10),X0

#define PRECALC_1(OFFSET) \
	VINSERTI128 $1, OFFSET(R13), Y0, Y0

#define PRECALC_2(YREG) \
	VPSHUFB Y10, Y0, YREG

#define PRECALC_4(YREG,K_OFFSET) \
	VPADDD K_OFFSET(R8), YREG, Y0

#define PRECALC_7(OFFSET) \
	VMOVDQU Y0, (OFFSET*2)(R14)


// Message scheduling pre-compute for rounds 0-15
// R13 is a pointer to even 64-byte block
// R10 is a pointer to odd 64-byte block
// R14 is a pointer to temp buffer
// X0 is used as temp register
// YREG is clobbered as part of computation
// OFFSET chooses 16 byte chunk within a block
// R8 is a pointer to constants block
// K_OFFSET chooses K constants relevant to this round
// X10 holds swap mask
#define PRECALC_00_15(OFFSET,YREG) \
	PRECALC_0(OFFSET) \
	PRECALC_1(OFFSET) \
	PRECALC_2(YREG) \
	PRECALC_4(YREG,0x0) \
	PRECALC_7(OFFSET)


// Helper macros for PRECALC_16_31
#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]

#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	VPXOR  REG_SUB_8, REG, REG \
	VPXOR  REG_SUB_16, Y0, Y0

#define PRECALC_18(REG) \
	VPXOR Y0, REG, REG \
	VPSLLDQ $12, REG, Y9

#define PRECALC_19(REG) \
	VPSLLD $1, REG, Y0 \
	VPSRLD $31, REG, REG

#define PRECALC_20(REG) \
	VPOR REG, Y0, Y0 \
	VPSLLD $2, Y9,  REG

#define PRECALC_21(REG) \
	VPSRLD $30, Y9, Y9 \
	VPXOR REG, Y0, Y0

#define PRECALC_23(REG,K_OFFSET,OFFSET) \
	VPXOR Y9, Y0, REG \
	VPADDD K_OFFSET(R8), REG, Y0 \
	VMOVDQU Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 16-31
// calculating last 32 w[i] values in 8 XMM registers
// pre-calculate K+w[i] values and store to mem
// for later load by ALU add instruction.
// "brute force" vectorization for rounds 16-31 only
// due to w[i]->w[i-3] dependency.
// clobbers 5 input ymm registers REG_SUB*
// uses X0 and X9 as temp registers
// As always, R8 is a pointer to constants block
// and R14 is a pointer to temp buffer
#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
	PRECALC_18(REG) \
	PRECALC_19(REG) \
	PRECALC_20(REG) \
	PRECALC_21(REG) \
	PRECALC_23(REG,K_OFFSET,OFFSET)


// Helper macros for PRECALC_32_79
#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0

#define PRECALC_33(REG_SUB_28,REG) \
	VPXOR REG_SUB_28, REG, REG

#define PRECALC_34(REG_SUB_16) \
	VPXOR REG_SUB_16, Y0, Y0

#define PRECALC_35(REG) \
	VPXOR Y0, REG, REG

#define PRECALC_36(REG) \
	VPSLLD $2, REG, Y0

#define PRECALC_37(REG) \
	VPSRLD $30, REG, REG \
	VPOR REG, Y0, REG

#define PRECALC_39(REG,K_OFFSET,OFFSET) \
	VPADDD K_OFFSET(R8), REG, Y0 \
	VMOVDQU Y0, (OFFSET)(R14)

// Message scheduling pre-compute for rounds 32-79
// In SHA-1 specification we have:
// w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
// Which is the same as:
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
// This allows for more efficient vectorization,
// since w[i]->w[i-3] dependency is broken
#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
	PRECALC_32(REG_SUB_8,REG_SUB_4) \
	PRECALC_33(REG_SUB_28,REG) \
	PRECALC_34(REG_SUB_16) \
	PRECALC_35(REG) \
	PRECALC_36(REG) \
	PRECALC_37(REG) \
	PRECALC_39(REG,K_OFFSET,OFFSET)

#define PRECALC \
	PRECALC_00_15(0,Y15) \
	PRECALC_00_15(0x10,Y14) \
	PRECALC_00_15(0x20,Y13) \
	PRECALC_00_15(0x30,Y12) \
	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)

// Macros calculating individual rounds have general form
// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
// CALC_ROUND_{PRE,POST} macros follow

#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
	ADDL OFFSET(R15),REG_E \
	ANDNL REG_C,REG_A,BP \
	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_B         // for next round

// Calculate F for the next round
#define CALC_F1_POST(REG_A,REG_B,REG_E) \
	ANDL REG_B,REG_A \             // b&c
	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
	LEAL (REG_E)(R12*1), REG_E     // E += A >>> 5


// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
#define CALC_0 \
	MOVL SI, BX \ // Precalculating first round
	RORXL $2, SI, SI \
	ANDNL AX, BX, BP \
	ANDL DI, BX \
	XORL BP, BX \
	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
	PRECALC_0(0x80) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_1 \
	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
	PRECALC_1(0x80) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_2 \
	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
	PRECALC_2(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_3 \
	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_4 \
	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
	PRECALC_4(Y15,0x0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_5 \
	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_6 \
	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_7 \
	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
	PRECALC_7(0x0) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_8 \
	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
	PRECALC_0(0x90) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_9 \
	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
	PRECALC_1(0x90) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_10 \
	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
	PRECALC_2(Y14) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_11 \
	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_12 \
	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
	PRECALC_4(Y14,0x0) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_13 \
	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_14 \
	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_15 \
	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
	PRECALC_7(0x10) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_16 \
	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
	PRECALC_0(0xa0) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_17 \
	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
	PRECALC_1(0xa0) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_18 \
	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
	PRECALC_2(Y13) \
	CALC_F1_POST(CX,SI,DX)


#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
	ADDL OFFSET(R15),REG_E \
	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_B         // for next round

#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
	XORL REG_B, REG_A \
	ADDL R12, REG_E \
        XORL REG_C, REG_A

#define CALC_19 \
	CALC_F2_PRE(0x8c,DX,CX,AX) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_20 \
	CALC_F2_PRE(0xa0,AX,DX,DI) \
	PRECALC_4(Y13,0x0) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_21 \
	CALC_F2_PRE(0xa4,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_22 \
	CALC_F2_PRE(0xa8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_23 \
	CALC_F2_PRE(0xac,BX,SI,CX) \
	PRECALC_7(0x20) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_24 \
	CALC_F2_PRE(0xc0,CX,BX,DX) \
	PRECALC_0(0xb0) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_25 \
	CALC_F2_PRE(0xc4,DX,CX,AX) \
	PRECALC_1(0xb0) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_26 \
	CALC_F2_PRE(0xc8,AX,DX,DI) \
	PRECALC_2(Y12) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_27 \
	CALC_F2_PRE(0xcc,DI,AX,SI) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_28 \
	CALC_F2_PRE(0xe0,SI,DI,BX) \
	PRECALC_4(Y12,0x0) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_29 \
	CALC_F2_PRE(0xe4,BX,SI,CX) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_30 \
	CALC_F2_PRE(0xe8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_31 \
	CALC_F2_PRE(0xec,DX,CX,AX) \
	PRECALC_7(0x30) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_32 \
	CALC_F2_PRE(0x100,AX,DX,DI) \
	PRECALC_16(Y15,Y14,Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_33 \
	CALC_F2_PRE(0x104,DI,AX,SI) \
	PRECALC_17(Y15,Y13,Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_34 \
	CALC_F2_PRE(0x108,SI,DI,BX) \
	PRECALC_18(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_35 \
	CALC_F2_PRE(0x10c,BX,SI,CX) \
	PRECALC_19(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_36 \
	CALC_F2_PRE(0x120,CX,BX,DX) \
	PRECALC_20(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_37 \
	CALC_F2_PRE(0x124,DX,CX,AX) \
	PRECALC_21(Y8) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_38 \
	CALC_F2_PRE(0x128,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)


#define CALC_F3_PRE(OFFSET,REG_E) \
	ADDL OFFSET(R15),REG_E

#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
	MOVL REG_B, BP \
	ORL  REG_A, BP \
	RORXL $0x1b, REG_A, R12 \
	RORXL $2, REG_A, REG_TB \
	ANDL REG_C, BP \		// Calculate F for the next round
	ANDL REG_B, REG_A \
	ORL  BP, REG_A \
	ADDL R12, REG_E

#define CALC_39 \
	CALC_F3_PRE(0x12c,SI) \
	PRECALC_23(Y8,0x0,0x80) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_40 \
	CALC_F3_PRE(0x140,BX) \
	PRECALC_16(Y14,Y13,Y8,Y7) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_41 \
	CALC_F3_PRE(0x144,CX) \
	PRECALC_17(Y14,Y12,Y7) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_42 \
	CALC_F3_PRE(0x148,DX) \
	PRECALC_18(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_43 \
	CALC_F3_PRE(0x14c,AX) \
	PRECALC_19(Y7) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_44 \
	CALC_F3_PRE(0x160,DI) \
	PRECALC_20(Y7) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_45 \
	CALC_F3_PRE(0x164,SI) \
	PRECALC_21(Y7) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_46 \
	CALC_F3_PRE(0x168,BX) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_47 \
	CALC_F3_PRE(0x16c,CX) \
	VPXOR Y9, Y0, Y7 \
	VPADDD 0x20(R8), Y7, Y0 \
	VMOVDQU Y0, 0xa0(R14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_48 \
	CALC_F3_PRE(0x180,DX) \
	PRECALC_16(Y13,Y12,Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_49 \
	CALC_F3_PRE(0x184,AX) \
	PRECALC_17(Y13,Y8,Y5) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_50 \
	CALC_F3_PRE(0x188,DI) \
	PRECALC_18(Y5) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_51 \
	CALC_F3_PRE(0x18c,SI) \
	PRECALC_19(Y5) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_52 \
	CALC_F3_PRE(0x1a0,BX) \
	PRECALC_20(Y5) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_53 \
	CALC_F3_PRE(0x1a4,CX) \
	PRECALC_21(Y5) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_54 \
	CALC_F3_PRE(0x1a8,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_55 \
	CALC_F3_PRE(0x1ac,AX) \
	PRECALC_23(Y5,0x20,0xc0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_56 \
	CALC_F3_PRE(0x1c0,DI) \
	PRECALC_16(Y12,Y8,Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_57 \
	CALC_F3_PRE(0x1c4,SI) \
	PRECALC_17(Y12,Y7,Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_58 \
	CALC_F3_PRE(0x1c8,BX) \
	PRECALC_18(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_59 \
	CALC_F2_PRE(0x1cc,BX,SI,CX) \
	PRECALC_19(Y3) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_60 \
	CALC_F2_PRE(0x1e0,CX,BX,DX) \
	PRECALC_20(Y3) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_61 \
	CALC_F2_PRE(0x1e4,DX,CX,AX) \
	PRECALC_21(Y3) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_62 \
	CALC_F2_PRE(0x1e8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_63 \
	CALC_F2_PRE(0x1ec,DI,AX,SI) \
	PRECALC_23(Y3,0x20,0xe0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_64 \
	CALC_F2_PRE(0x200,SI,DI,BX) \
	PRECALC_32(Y5,Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_65 \
	CALC_F2_PRE(0x204,BX,SI,CX) \
	PRECALC_33(Y14,Y15) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_66 \
	CALC_F2_PRE(0x208,CX,BX,DX) \
	PRECALC_34(Y8) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_67 \
	CALC_F2_PRE(0x20c,DX,CX,AX) \
	PRECALC_35(Y15) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_68 \
	CALC_F2_PRE(0x220,AX,DX,DI) \
	PRECALC_36(Y15) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_69 \
	CALC_F2_PRE(0x224,DI,AX,SI) \
	PRECALC_37(Y15) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_70 \
	CALC_F2_PRE(0x228,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_71 \
	CALC_F2_PRE(0x22c,BX,SI,CX) \
	PRECALC_39(Y15,0x20,0x100) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_72 \
	CALC_F2_PRE(0x240,CX,BX,DX) \
	PRECALC_32(Y3,Y15) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_73 \
	CALC_F2_PRE(0x244,DX,CX,AX) \
	PRECALC_33(Y13,Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_74 \
	CALC_F2_PRE(0x248,AX,DX,DI) \
	PRECALC_34(Y7) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_75 \
	CALC_F2_PRE(0x24c,DI,AX,SI) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_76 \
	CALC_F2_PRE(0x260,SI,DI,BX) \
	PRECALC_36(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_77 \
	CALC_F2_PRE(0x264,BX,SI,CX) \
	PRECALC_37(Y14) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_78 \
	CALC_F2_PRE(0x268,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_79 \
	ADDL 0x26c(R15), AX \
	LEAL (AX)(CX*1), AX \
	RORXL $0x1b, DX, R12 \
	PRECALC_39(Y14,0x20,0x120) \
	ADDL R12, AX

// Similar to CALC_0
#define CALC_80 \
	MOVL CX, DX \
	RORXL $2, CX, CX \
	ANDNL SI, DX, BP \
	ANDL BX, DX \
	XORL BP, DX \
	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
	PRECALC_32(Y15,Y14) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_81 \
	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
	PRECALC_33(Y12,Y13) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_82 \
	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
	PRECALC_34(Y5) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_83 \
	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
	PRECALC_35(Y13) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_84 \
	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
	PRECALC_36(Y13) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_85 \
	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
	PRECALC_37(Y13) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_86 \
	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_87 \
	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
	PRECALC_39(Y13,0x40,0x140) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_88 \
	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
	PRECALC_32(Y14,Y13) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_89 \
	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
	PRECALC_33(Y8,Y12) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_90 \
	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
	PRECALC_34(Y3) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_91 \
	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
	PRECALC_35(Y12) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_92 \
	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
	PRECALC_36(Y12) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_93 \
	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
	PRECALC_37(Y12) \
	CALC_F1_POST(DI,DX,SI)

#define CALC_94 \
	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
	CALC_F1_POST(SI,AX,BX)

#define CALC_95 \
	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
	PRECALC_39(Y12,0x40,0x160) \
	CALC_F1_POST(BX,DI,CX)

#define CALC_96 \
	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
	PRECALC_32(Y13,Y12) \
	CALC_F1_POST(CX,SI,DX)

#define CALC_97 \
	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
	PRECALC_33(Y7,Y8) \
	CALC_F1_POST(DX,BX,AX)

#define CALC_98 \
	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
	PRECALC_34(Y15) \
	CALC_F1_POST(AX,CX,DI)

#define CALC_99 \
	CALC_F2_PRE(0x9c,DI,AX,SI) \
	PRECALC_35(Y8) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_100 \
	CALC_F2_PRE(0xb0,SI,DI,BX) \
	PRECALC_36(Y8) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_101 \
	CALC_F2_PRE(0xb4,BX,SI,CX) \
	PRECALC_37(Y8) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_102 \
	CALC_F2_PRE(0xb8,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_103 \
	CALC_F2_PRE(0xbc,DX,CX,AX) \
	PRECALC_39(Y8,0x40,0x180) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_104 \
	CALC_F2_PRE(0xd0,AX,DX,DI) \
	PRECALC_32(Y12,Y8) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_105 \
	CALC_F2_PRE(0xd4,DI,AX,SI) \
	PRECALC_33(Y5,Y7) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_106 \
	CALC_F2_PRE(0xd8,SI,DI,BX) \
	PRECALC_34(Y14) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_107 \
	CALC_F2_PRE(0xdc,BX,SI,CX) \
	PRECALC_35(Y7) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_108 \
	CALC_F2_PRE(0xf0,CX,BX,DX) \
	PRECALC_36(Y7) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_109 \
	CALC_F2_PRE(0xf4,DX,CX,AX) \
	PRECALC_37(Y7) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_110 \
	CALC_F2_PRE(0xf8,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_111 \
	CALC_F2_PRE(0xfc,DI,AX,SI) \
	PRECALC_39(Y7,0x40,0x1a0) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_112 \
	CALC_F2_PRE(0x110,SI,DI,BX) \
	PRECALC_32(Y8,Y7) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_113 \
	CALC_F2_PRE(0x114,BX,SI,CX) \
	PRECALC_33(Y3,Y5) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_114 \
	CALC_F2_PRE(0x118,CX,BX,DX) \
	PRECALC_34(Y13) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_115 \
	CALC_F2_PRE(0x11c,DX,CX,AX) \
	PRECALC_35(Y5) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_116 \
	CALC_F2_PRE(0x130,AX,DX,DI) \
	PRECALC_36(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_117 \
	CALC_F2_PRE(0x134,DI,AX,SI) \
	PRECALC_37(Y5) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_118 \
	CALC_F2_PRE(0x138,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_119 \
	CALC_F3_PRE(0x13c,CX) \
	PRECALC_39(Y5,0x40,0x1c0) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_120 \
	CALC_F3_PRE(0x150,DX) \
	PRECALC_32(Y7,Y5) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_121 \
	CALC_F3_PRE(0x154,AX) \
	PRECALC_33(Y15,Y3) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_122 \
	CALC_F3_PRE(0x158,DI) \
	PRECALC_34(Y12) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_123 \
	CALC_F3_PRE(0x15c,SI) \
	PRECALC_35(Y3) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_124 \
	CALC_F3_PRE(0x170,BX) \
	PRECALC_36(Y3) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_125 \
	CALC_F3_PRE(0x174,CX) \
	PRECALC_37(Y3) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_126 \
	CALC_F3_PRE(0x178,DX) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_127 \
	CALC_F3_PRE(0x17c,AX) \
	PRECALC_39(Y3,0x60,0x1e0) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_128 \
	CALC_F3_PRE(0x190,DI) \
	PRECALC_32(Y5,Y3) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_129 \
	CALC_F3_PRE(0x194,SI) \
	PRECALC_33(Y14,Y15) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_130 \
	CALC_F3_PRE(0x198,BX) \
	PRECALC_34(Y8) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_131 \
	CALC_F3_PRE(0x19c,CX) \
	PRECALC_35(Y15) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_132 \
	CALC_F3_PRE(0x1b0,DX) \
	PRECALC_36(Y15) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_133 \
	CALC_F3_PRE(0x1b4,AX) \
	PRECALC_37(Y15) \
	CALC_F3_POST(DX,BX,SI,AX,CX)

#define CALC_134 \
	CALC_F3_PRE(0x1b8,DI) \
	CALC_F3_POST(AX,CX,BX,DI,DX)

#define CALC_135 \
	CALC_F3_PRE(0x1bc,SI) \
	PRECALC_39(Y15,0x60,0x200) \
	CALC_F3_POST(DI,DX,CX,SI,AX)

#define CALC_136 \
	CALC_F3_PRE(0x1d0,BX) \
	PRECALC_32(Y3,Y15) \
	CALC_F3_POST(SI,AX,DX,BX,DI)

#define CALC_137 \
	CALC_F3_PRE(0x1d4,CX) \
	PRECALC_33(Y13,Y14) \
	CALC_F3_POST(BX,DI,AX,CX,SI)

#define CALC_138 \
	CALC_F3_PRE(0x1d8,DX) \
	PRECALC_34(Y7) \
	CALC_F3_POST(CX,SI,DI,DX,BX)

#define CALC_139 \
	CALC_F2_PRE(0x1dc,DX,CX,AX) \
	PRECALC_35(Y14) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_140 \
	CALC_F2_PRE(0x1f0,AX,DX,DI) \
	PRECALC_36(Y14) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_141 \
	CALC_F2_PRE(0x1f4,DI,AX,SI) \
	PRECALC_37(Y14) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_142 \
	CALC_F2_PRE(0x1f8,SI,DI,BX) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_143 \
	CALC_F2_PRE(0x1fc,BX,SI,CX) \
	PRECALC_39(Y14,0x60,0x220) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_144 \
	CALC_F2_PRE(0x210,CX,BX,DX) \
	PRECALC_32(Y15,Y14) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_145 \
	CALC_F2_PRE(0x214,DX,CX,AX) \
	PRECALC_33(Y12,Y13) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_146 \
	CALC_F2_PRE(0x218,AX,DX,DI) \
	PRECALC_34(Y5) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_147 \
	CALC_F2_PRE(0x21c,DI,AX,SI) \
	PRECALC_35(Y13) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_148 \
	CALC_F2_PRE(0x230,SI,DI,BX) \
	PRECALC_36(Y13) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_149 \
	CALC_F2_PRE(0x234,BX,SI,CX) \
	PRECALC_37(Y13) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_150 \
	CALC_F2_PRE(0x238,CX,BX,DX) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_151 \
	CALC_F2_PRE(0x23c,DX,CX,AX) \
	PRECALC_39(Y13,0x60,0x240) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_152 \
	CALC_F2_PRE(0x250,AX,DX,DI) \
	PRECALC_32(Y14,Y13) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_153 \
	CALC_F2_PRE(0x254,DI,AX,SI) \
	PRECALC_33(Y8,Y12) \
	CALC_F2_POST(DI,DX,CX,SI)

#define CALC_154 \
	CALC_F2_PRE(0x258,SI,DI,BX) \
	PRECALC_34(Y3) \
	CALC_F2_POST(SI,AX,DX,BX)

#define CALC_155 \
	CALC_F2_PRE(0x25c,BX,SI,CX) \
	PRECALC_35(Y12) \
	CALC_F2_POST(BX,DI,AX,CX)

#define CALC_156 \
	CALC_F2_PRE(0x270,CX,BX,DX) \
	PRECALC_36(Y12) \
	CALC_F2_POST(CX,SI,DI,DX)

#define CALC_157 \
	CALC_F2_PRE(0x274,DX,CX,AX) \
	PRECALC_37(Y12) \
	CALC_F2_POST(DX,BX,SI,AX)

#define CALC_158 \
	CALC_F2_PRE(0x278,AX,DX,DI) \
	CALC_F2_POST(AX,CX,BX,DI)

#define CALC_159 \
	ADDL 0x27c(R15),SI \
	LEAL (SI)(AX*1), SI \
	RORXL $0x1b, DI, R12 \
	PRECALC_39(Y12,0x60,0x260) \
	ADDL R12, SI



#define CALC \
	MOVL	(R9), CX \
	MOVL	4(R9), SI \
	MOVL	8(R9), DI \
	MOVL	12(R9), AX \
	MOVL	16(R9), DX \
	MOVQ    SP, R14 \
	LEAQ    (2*4*80+32)(SP), R15 \
	PRECALC \ // Precalc WK for first 2 blocks
	XCHGQ   R15, R14 \
loop: \  // this loops is unrolled
	CMPQ    R10, R8 \ // we use R8 value (set below) as a signal of a last block
	JNE	begin \
	VZEROUPPER \
	RET \
begin: \
	CALC_0 \
	CALC_1 \
	CALC_2 \
	CALC_3 \
	CALC_4 \
	CALC_5 \
	CALC_6 \
	CALC_7 \
	CALC_8 \
	CALC_9 \
	CALC_10 \
	CALC_11 \
	CALC_12 \
	CALC_13 \
	CALC_14 \
	CALC_15 \
	CALC_16 \
	CALC_17 \
	CALC_18 \
	CALC_19 \
	CALC_20 \
	CALC_21 \
	CALC_22 \
	CALC_23 \
	CALC_24 \
	CALC_25 \
	CALC_26 \
	CALC_27 \
	CALC_28 \
	CALC_29 \
	CALC_30 \
	CALC_31 \
	CALC_32 \
	CALC_33 \
	CALC_34 \
	CALC_35 \
	CALC_36 \
	CALC_37 \
	CALC_38 \
	CALC_39 \
	CALC_40 \
	CALC_41 \
	CALC_42 \
	CALC_43 \
	CALC_44 \
	CALC_45 \
	CALC_46 \
	CALC_47 \
	CALC_48 \
	CALC_49 \
	CALC_50 \
	CALC_51 \
	CALC_52 \
	CALC_53 \
	CALC_54 \
	CALC_55 \
	CALC_56 \
	CALC_57 \
	CALC_58 \
	CALC_59 \
	ADDQ $128, R10 \ // move to next even-64-byte block
	CMPQ R10, R11 \ // is current block the last one?
	CMOVQCC R8, R10 \ // signal the last iteration smartly
	CALC_60 \
	CALC_61 \
	CALC_62 \
	CALC_63 \
	CALC_64 \
	CALC_65 \
	CALC_66 \
	CALC_67 \
	CALC_68 \
	CALC_69 \
	CALC_70 \
	CALC_71 \
	CALC_72 \
	CALC_73 \
	CALC_74 \
	CALC_75 \
	CALC_76 \
	CALC_77 \
	CALC_78 \
	CALC_79 \
	UPDATE_HASH(AX,DX,BX,SI,DI) \
	CMPQ R10, R8 \ // is current block the last one?
	JE loop\
	MOVL DX, CX \
	CALC_80 \
	CALC_81 \
	CALC_82 \
	CALC_83 \
	CALC_84 \
	CALC_85 \
	CALC_86 \
	CALC_87 \
	CALC_88 \
	CALC_89 \
	CALC_90 \
	CALC_91 \
	CALC_92 \
	CALC_93 \
	CALC_94 \
	CALC_95 \
	CALC_96 \
	CALC_97 \
	CALC_98 \
	CALC_99 \
	CALC_100 \
	CALC_101 \
	CALC_102 \
	CALC_103 \
	CALC_104 \
	CALC_105 \
	CALC_106 \
	CALC_107 \
	CALC_108 \
	CALC_109 \
	CALC_110 \
	CALC_111 \
	CALC_112 \
	CALC_113 \
	CALC_114 \
	CALC_115 \
	CALC_116 \
	CALC_117 \
	CALC_118 \
	CALC_119 \
	CALC_120 \
	CALC_121 \
	CALC_122 \
	CALC_123 \
	CALC_124 \
	CALC_125 \
	CALC_126 \
	CALC_127 \
	CALC_128 \
	CALC_129 \
	CALC_130 \
	CALC_131 \
	CALC_132 \
	CALC_133 \
	CALC_134 \
	CALC_135 \
	CALC_136 \
	CALC_137 \
	CALC_138 \
	CALC_139 \
	ADDQ $128, R13 \ //move to next even-64-byte block
	CMPQ R13, R11 \ //is current block the last one?
	CMOVQCC R8, R10 \
	CALC_140 \
	CALC_141 \
	CALC_142 \
	CALC_143 \
	CALC_144 \
	CALC_145 \
	CALC_146 \
	CALC_147 \
	CALC_148 \
	CALC_149 \
	CALC_150 \
	CALC_151 \
	CALC_152 \
	CALC_153 \
	CALC_154 \
	CALC_155 \
	CALC_156 \
	CALC_157 \
	CALC_158 \
	CALC_159 \
	UPDATE_HASH(SI,DI,DX,CX,BX) \
	MOVL	SI, R12 \ //Reset state for  AVX2 reg permutation
	MOVL	DI, SI \
	MOVL	DX, DI \
	MOVL	BX, DX \
	MOVL	CX, AX \
	MOVL	R12, CX \
	XCHGQ   R15, R14 \
	JMP     loop



TEXT ·blockAVX2(SB),$1408-32

	MOVQ	dig+0(FP),	DI
	MOVQ	p_base+8(FP),	SI
	MOVQ	p_len+16(FP),	DX
	SHRQ	$6,		DX
	SHLQ	$6,		DX

	MOVQ	$K_XMM_AR<>(SB), R8

	MOVQ	DI, R9
	MOVQ	SI, R10
	LEAQ	64(SI), R13

	ADDQ	SI, DX
	ADDQ	$64, DX
	MOVQ	DX, R11

	CMPQ	R13, R11
	CMOVQCC	R8, R13

	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10

	CALC // RET is inside macros

DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128

DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32