diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c --- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100 +++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100 @@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip, iret = IntP4; break; case 3: - case 4: + case 4: ; case 6: iret = IntP4E; break; default: diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h --- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100 +++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100 @@ -126,7 +126,7 @@ #define CPAT Mjoin(C_ATL_, PRE); #ifndef ATL_MaxMalloc - #define ATL_MaxMalloc 67108864 + #define ATL_MaxMalloc XXX_MaxMalloc_XXX #endif typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c --- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100 +++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100 @@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK) { NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm); if (SCALAR_IS_ZERO(beta)) - Mjoin(PATL,gezero)(M, N, C, ldc); + /* Mjoin(PATL,gezero)(M, N, C, ldc); */ + { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); } } if (nblk) { diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c --- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100 +++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100 @@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx) const int ldc2 = (ldc-M)<<1; int i, j; - if (ialp == ATL_rzero && ibet == ATL_rzero) +/* + * Cannot read C if BETA is 0 + */ + if (rbet == ATL_rzero && ibet == ATL_rzero) + { + if (ialp == ATL_rzero) /* alpha is a real number */ + { + if (ralp == ATL_rone) /* alpha = 1.0 */ + { + for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) + { + for (i=0; i < M; i++, C += 2) + { + *C = R[i]; + C[1] = I[i]; + } + } + } + else + { + for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) + { + for (i=0; i < M; i++, C += 2) + { + *C = ralp * R[i]; + C[1] = ralp * I[i]; + } + } + } + } + else /* alpha is a complex number */ + { + for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) + { + for (i=0; i < M; i++, C += 2) + { + ra = R[i]; ia = I[i]; + C[0] = ralp * ra - ialp * ia; + C[1] = ralp * ia + ialp * ra; + } + } + } + } +/* + * If alpha and beta are both real numbers + */ + else if (ialp == ATL_rzero && ibet == ATL_rzero) { if (ralp == ATL_rone && rbet == ATL_rone) { diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c --- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100 +++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100 @@ -27,6 +27,13 @@ * POSSIBILITY OF SUCH DAMAGE. * */ +#if KB > 84 + #error "KB cannot exceed 84!" +#endif +#if (KB/4)*4 != KB + #error "KB must be a multiple of 4!" +#endif + #ifndef ATL_GAS_x8664 #error "This kernel requires x86-64 assembly!" #endif @@ -58,25 +65,25 @@ * Integer register usage shown be these defines */ #define pA %rcx -#define pA10 %rbx -#define ldab %rbp -#define mldab %rdx +#define pA10 %rbx +#define ldab %rbp +#define mldab %rdx #define mldab5 %rax #define pB %rdi #define pC %rsi #define incCn %r10 #define stM %r9 #define stN %r11 -#define pfA %r8 -#define pA5 pA -#define pB0 pB +#define pfA %r8 +#define pA5 pA +#define pB0 pB #if MB == 0 - #define stM0 %r12 - #define incAm %r13 + #define stM0 %r12 + #define incAm %r13 #endif /* rax used in 32/64 conversion */ -#define NBso (KB*4) +#define NBso (KB*4) #define MBKBso (MB*KB*4) #define NB2so (NBso+NBso) #define NB3so (NBso+NBso+NBso) @@ -95,22 +102,22 @@ /* * SSE2 register usage shown be these defines */ -#define rA0 %xmm0 -#define rB0 %xmm1 -#define rC0 %xmm2 -#define rC1 %xmm3 -#define rC2 %xmm4 -#define rC3 %xmm5 -#define rC4 %xmm6 -#define rC5 %xmm7 -#define rC6 %xmm8 -#define rC7 %xmm9 -#define rC8 %xmm10 -#define rC9 %xmm11 -#define rC10 %xmm12 -#define rC11 %xmm13 -#define rC12 %xmm14 -#define rC13 %xmm15 +#define rA0 %xmm0 +#define rB0 %xmm1 +#define rC0 %xmm2 +#define rC1 %xmm3 +#define rC2 %xmm4 +#define rC3 %xmm5 +#define rC4 %xmm6 +#define rC5 %xmm7 +#define rC6 %xmm8 +#define rC7 %xmm9 +#define rC8 %xmm10 +#define rC9 %xmm11 +#define rC10 %xmm12 +#define rC11 %xmm13 +#define rC12 %xmm14 +#define rC13 %xmm15 /* * Prefetch defines */ @@ -127,99 +134,99 @@ #if MB != 0 #define incAm $MBKBso-NB14so+176 #endif - .text + .text .global ATL_asmdecor(ATL_USERMM) ATL_asmdecor(ATL_USERMM): /* * Save callee-saved iregs */ - movq %rbp, -8(%rsp) - movq %rbx, -16(%rsp) + movq %rbp, -8(%rsp) + movq %rbx, -16(%rsp) #if MB == 0 - movq %r12, -32(%rsp) - movq %r13, -40(%rsp) + movq %r12, -32(%rsp) + movq %r13, -40(%rsp) #endif #ifdef BETAX #define BOF -56 - movss %xmm1, BOF(%rsp) - movss %xmm1, BOF+4(%rsp) - movss %xmm1, BOF+8(%rsp) - movss %xmm1, BOF+12(%rsp) + movss %xmm1, BOF(%rsp) + movss %xmm1, BOF+4(%rsp) + movss %xmm1, BOF+8(%rsp) + movss %xmm1, BOF+12(%rsp) #endif /* * pA already comes in right reg * Initialize pB = B; pC = C; NBso = NB * sizeof; */ - movq %rsi, stN - movq %rdi, %rax - movq 16(%rsp), pC - prefC((pC)) - prefC(64(pC)) - movq %r9, pB - prefB((pB)) - prefB(64(pB)) - movq %rax, stM + movq %rsi, stN + movq %rdi, %rax + movq 16(%rsp), pC + prefC((pC)) + prefC(64(pC)) + movq %r9, pB + prefB((pB)) + prefB(64(pB)) + movq %rax, stM /* * stM = pA + NBNBso; stN = pB + NBNBso; */ #if MB == 0 - movq stM, pfA - imulq $NBso, pfA - prefB(128(pB)) - movq pfA, incAm - addq pA5, pfA - addq $176-NB14so, incAm + movq stM, pfA + imulq $NBso, pfA + prefB(128(pB)) + movq pfA, incAm + addq pA5, pfA + addq $176-NB14so, incAm #else - movq $MBKBso, pfA - addq pA5, pfA - prefB(128(pB)) + movq $MBKBso, pfA + addq pA5, pfA + prefB(128(pB)) #endif /* * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof */ - movl 24(%rsp), %eax - cltq - movq %rax, incCn - subq stM, incCn - addq $14, incCn + movl 24(%rsp), %eax + cltq + movq %rax, incCn + subq stM, incCn + addq $14, incCn #ifdef SREAL - shl $2, incCn + shl $2, incCn #else - shl $3, incCn - prefC(128(pC)) - prefC(192(pC)) + shl $3, incCn + prefC(128(pC)) + prefC(192(pC)) #endif /* * Find M/14 if MB is not set */ #if MB == 0 - cmp $84, stM - jne MB_LT84 -/* movq $84/14, stM */ - movq $6, stM + cmp $84, stM + jne MB_LT84 +/* movq $84/14, stM */ + movq $6, stM MBFOUND: - subq $1, stM - movq stM, stM0 + subq $1, stM + movq stM, stM0 #endif - addq $120, pA5 - addq $120, pB0 - movq $KB*4, ldab - movq $-KB*5*4, mldab5 - movq $-KB*4, mldab - subq mldab5, pA5 - lea KB*4(pA5, ldab,4), pA10 -/* movq $NB, stN */ + addq $120, pA5 + addq $120, pB0 + movq $KB*4, ldab + movq $-KB*5*4, mldab5 + movq $-KB*4, mldab + subq mldab5, pA5 + lea KB*4(pA5, ldab,4), pA10 +/* movq $NB, stN */ UNLOOP: #if MB == 0 - movq stM0, stM - cmp $0, stM - je MLAST + movq stM0, stM + cmp $0, stM + je MLAST #else #ifdef ATL_DivAns - movq $ATL_DivAns-1, stM + movq $ATL_DivAns-1, stM #else - movq $MB/14-1, stM + movq $MB/14-1, stM #endif #endif #if MB == 0 || MB > 14 @@ -227,992 +234,992 @@ UMLOOP: /* * rC[0-13] = pC[0-13] * beta */ - ALIGN16 + ALIGN16 /*UKLOOP: */ #ifdef BETA1 - movaps 0-120(pA10,mldab5,2), rC0 - movaps 0-120(pB0), rB0 - mulps rB0, rC0 - addss (pC), rC0 - movaps 0-120(pA5, mldab,4), rC1 - mulps rB0, rC1 - addss CMUL(4)(pC), rC1 - movaps 0-120(pA10, mldab,8), rC2 - mulps rB0, rC2 - addss CMUL(8)(pC), rC2 - movaps 0-120(pA5, mldab,2), rC3 - mulps rB0, rC3 - addss CMUL(12)(pC), rC3 - movaps 0-120(pA5, mldab), rC4 - mulps rB0, rC4 - addss CMUL(16)(pC), rC4 - movaps 0-120(pA5), rC5 - mulps rB0, rC5 - addss CMUL(20)(pC), rC5 - movaps 0-120(pA5, ldab), rC6 - mulps rB0, rC6 - addss CMUL(24)(pC), rC6 - movaps 0-120(pA5, ldab,2), rC7 - mulps rB0, rC7 - addss CMUL(28)(pC), rC7 - movaps 0-120(pA10, mldab,2), rC8 - mulps rB0, rC8 - addss CMUL(32)(pC), rC8 - movaps 0-120(pA5,ldab,4), rC9 - mulps rB0, rC9 - addss CMUL(36)(pC), rC9 - movaps 0-120(pA10), rC10 - mulps rB0, rC10 - addss CMUL(40)(pC), rC10 - movaps 0-120(pA10,ldab), rC11 - mulps rB0, rC11 - addss CMUL(44)(pC), rC11 - movaps 0-120(pA10,ldab,2), rC12 - mulps rB0, rC12 - addss CMUL(48)(pC), rC12 - movaps 0-120(pA5,ldab,8), rC13 - mulps rB0, rC13 - addss CMUL(52)(pC), rC13 + movaps 0-120(pA10,mldab5,2), rC0 + movaps 0-120(pB0), rB0 + mulps rB0, rC0 + addss (pC), rC0 + movaps 0-120(pA5, mldab,4), rC1 + mulps rB0, rC1 + addss CMUL(4)(pC), rC1 + movaps 0-120(pA10, mldab,8), rC2 + mulps rB0, rC2 + addss CMUL(8)(pC), rC2 + movaps 0-120(pA5, mldab,2), rC3 + mulps rB0, rC3 + addss CMUL(12)(pC), rC3 + movaps 0-120(pA5, mldab), rC4 + mulps rB0, rC4 + addss CMUL(16)(pC), rC4 + movaps 0-120(pA5), rC5 + mulps rB0, rC5 + addss CMUL(20)(pC), rC5 + movaps 0-120(pA5, ldab), rC6 + mulps rB0, rC6 + addss CMUL(24)(pC), rC6 + movaps 0-120(pA5, ldab,2), rC7 + mulps rB0, rC7 + addss CMUL(28)(pC), rC7 + movaps 0-120(pA10, mldab,2), rC8 + mulps rB0, rC8 + addss CMUL(32)(pC), rC8 + movaps 0-120(pA5,ldab,4), rC9 + mulps rB0, rC9 + addss CMUL(36)(pC), rC9 + movaps 0-120(pA10), rC10 + mulps rB0, rC10 + addss CMUL(40)(pC), rC10 + movaps 0-120(pA10,ldab), rC11 + mulps rB0, rC11 + addss CMUL(44)(pC), rC11 + movaps 0-120(pA10,ldab,2), rC12 + mulps rB0, rC12 + addss CMUL(48)(pC), rC12 + movaps 0-120(pA5,ldab,8), rC13 + mulps rB0, rC13 + addss CMUL(52)(pC), rC13 #else - movaps 0-120(pA10,mldab5,2), rC0 - movaps 0-120(pB0), rC13 - mulps rC13, rC0 - movaps 0-120(pA5, mldab,4), rC1 - mulps rC13, rC1 - movaps 0-120(pA10, mldab,8), rC2 - mulps rC13, rC2 - movaps 0-120(pA5, mldab,2), rC3 - mulps rC13, rC3 - movaps 0-120(pA5, mldab), rC4 - mulps rC13, rC4 - movaps 0-120(pA5), rC5 - mulps rC13, rC5 - movaps 0-120(pA5, ldab), rC6 - mulps rC13, rC6 - movaps 0-120(pA5, ldab,2), rC7 - mulps rC13, rC7 - movaps 0-120(pA10, mldab,2), rC8 - mulps rC13, rC8 - movaps 0-120(pA5,ldab,4), rC9 - mulps rC13, rC9 - movaps 0-120(pA10), rC10 - mulps rC13, rC10 - movaps 0-120(pA10,ldab), rC11 - mulps rC13, rC11 - movaps 0-120(pA10,ldab,2), rC12 - mulps rC13, rC12 - mulps 0-120(pA5,ldab,8), rC13 + movaps 0-120(pA10,mldab5,2), rC0 + movaps 0-120(pB0), rC13 + mulps rC13, rC0 + movaps 0-120(pA5, mldab,4), rC1 + mulps rC13, rC1 + movaps 0-120(pA10, mldab,8), rC2 + mulps rC13, rC2 + movaps 0-120(pA5, mldab,2), rC3 + mulps rC13, rC3 + movaps 0-120(pA5, mldab), rC4 + mulps rC13, rC4 + movaps 0-120(pA5), rC5 + mulps rC13, rC5 + movaps 0-120(pA5, ldab), rC6 + mulps rC13, rC6 + movaps 0-120(pA5, ldab,2), rC7 + mulps rC13, rC7 + movaps 0-120(pA10, mldab,2), rC8 + mulps rC13, rC8 + movaps 0-120(pA5,ldab,4), rC9 + mulps rC13, rC9 + movaps 0-120(pA10), rC10 + mulps rC13, rC10 + movaps 0-120(pA10,ldab), rC11 + mulps rC13, rC11 + movaps 0-120(pA10,ldab,2), rC12 + mulps rC13, rC12 + mulps 0-120(pA5,ldab,8), rC13 #endif #if KB > 4 - movaps 16-120(pA10,mldab5,2), rA0 - movaps 16-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 16-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 16-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 16-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 16-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 16-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 16-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 16-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 16-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 16-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 16-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 16-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 16-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 16-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 16-120(pA10,mldab5,2), rA0 + movaps 16-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 16-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 16-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 16-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 16-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 16-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 16-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 16-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 16-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 16-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 16-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 16-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 16-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 16-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 8 - movaps 32-120(pA10,mldab5,2), rA0 - movaps 32-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 32-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 32-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 32-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 32-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 32-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 32-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 32-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 32-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 32-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 32-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 32-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 32-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 32-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 32-120(pA10,mldab5,2), rA0 + movaps 32-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 32-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 32-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 32-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 32-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 32-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 32-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 32-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 32-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 32-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 32-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 32-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 32-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 32-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 12 - movaps 48-120(pA10,mldab5,2), rA0 - movaps 48-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 48-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 48-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 48-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 48-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 48-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 48-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 48-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 48-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 48-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 48-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 48-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 48-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 48-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 48-120(pA10,mldab5,2), rA0 + movaps 48-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 48-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 48-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 48-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 48-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 48-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 48-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 48-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 48-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 48-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 48-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 48-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 48-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 48-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 16 - movaps 64-120(pA10,mldab5,2), rA0 - movaps 64-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 64-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 64-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 64-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 64-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 64-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 64-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 64-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 64-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 64-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 64-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 64-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 64-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 64-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 64-120(pA10,mldab5,2), rA0 + movaps 64-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 64-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 64-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 64-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 64-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 64-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 64-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 64-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 64-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 64-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 64-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 64-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 64-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 64-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 20 - movaps 80-120(pA10,mldab5,2), rA0 - movaps 80-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 80-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 80-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 80-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 80-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 80-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 80-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 80-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 80-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 80-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 80-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 80-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 80-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 80-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 80-120(pA10,mldab5,2), rA0 + movaps 80-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 80-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 80-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 80-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 80-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 80-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 80-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 80-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 80-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 80-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 80-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 80-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 80-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 80-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 24 - movaps 96-120(pA10,mldab5,2), rA0 - movaps 96-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 96-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 96-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 96-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 96-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 96-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 96-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 96-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 96-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 96-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 96-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 96-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 96-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 96-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 96-120(pA10,mldab5,2), rA0 + movaps 96-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 96-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 96-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 96-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 96-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 96-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 96-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 96-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 96-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 96-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 96-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 96-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 96-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 96-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 28 - movaps 112-120(pA10,mldab5,2), rA0 - movaps 112-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 112-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 112-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 112-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 112-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 112-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 112-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 112-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 112-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 112-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 112-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 112-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 112-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 112-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 112-120(pA10,mldab5,2), rA0 + movaps 112-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 112-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 112-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 112-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 112-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 112-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 112-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 112-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 112-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 112-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 112-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 112-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 112-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 112-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #ifndef SREAL - pref2((pfA)) - pref2(64(pfA)) + pref2((pfA)) + pref2(64(pfA)) #endif #if KB > 32 - movaps 128-120(pA10,mldab5,2), rA0 - movaps 128-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 128-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 128-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 128-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 128-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 128-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 128-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 128-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 128-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 128-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 128-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 128-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 128-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 128-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 128-120(pA10,mldab5,2), rA0 + movaps 128-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 128-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 128-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 128-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 128-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 128-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 128-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 128-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 128-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 128-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 128-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 128-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 128-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 128-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 36 - movaps 144-120(pA10,mldab5,2), rA0 - movaps 144-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 144-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 144-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 144-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 144-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 144-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 144-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 144-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 144-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 144-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 144-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 144-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 144-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 144-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 144-120(pA10,mldab5,2), rA0 + movaps 144-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 144-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 144-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 144-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 144-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 144-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 144-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 144-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 144-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 144-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 144-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 144-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 144-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 144-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 40 - movaps 160-120(pA10,mldab5,2), rA0 - movaps 160-120(pB0), rB0 - mulps rB0, rA0 - addq $176, pB0 - addps rA0, rC0 - movaps 160-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 160-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 160-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 160-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 160-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 160-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 160-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 160-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 160-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 160-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 160-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 160-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addq $176, pA10 - addps rA0, rC12 - mulps 160-120(pA5,ldab,8), rB0 - addps rB0, rC13 - addq $176, pA5 + movaps 160-120(pA10,mldab5,2), rA0 + movaps 160-120(pB0), rB0 + mulps rB0, rA0 + addq $176, pB0 + addps rA0, rC0 + movaps 160-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 160-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 160-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 160-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 160-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 160-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 160-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 160-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 160-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 160-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 160-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 160-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addq $176, pA10 + addps rA0, rC12 + mulps 160-120(pA5,ldab,8), rB0 + addps rB0, rC13 + addq $176, pA5 #else - addq $176, pB0 - addq $176, pA10 - addq $176, pA5 + addq $176, pB0 + addq $176, pA10 + addq $176, pA5 #endif #if KB > 44 - movaps 0-120(pA10,mldab5,2), rA0 - movaps 0-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 0-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 0-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 0-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 0-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 0-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 0-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 0-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 0-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 0-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 0-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 0-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 0-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 0-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 0-120(pA10,mldab5,2), rA0 + movaps 0-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 0-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 0-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 0-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 0-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 0-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 0-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 0-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 0-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 0-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 0-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 0-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 0-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 0-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 48 - movaps 16-120(pA10,mldab5,2), rA0 - movaps 16-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 16-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 16-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 16-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 16-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 16-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 16-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 16-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 16-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 16-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 16-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 16-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 16-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 16-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 16-120(pA10,mldab5,2), rA0 + movaps 16-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 16-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 16-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 16-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 16-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 16-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 16-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 16-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 16-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 16-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 16-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 16-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 16-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 16-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 52 - movaps 32-120(pA10,mldab5,2), rA0 - movaps 32-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 32-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 32-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 32-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 32-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 32-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 32-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 32-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 32-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 32-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 32-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 32-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 32-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 32-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 32-120(pA10,mldab5,2), rA0 + movaps 32-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 32-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 32-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 32-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 32-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 32-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 32-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 32-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 32-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 32-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 32-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 32-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 32-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 32-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 56 - movaps 48-120(pA10,mldab5,2), rA0 - movaps 48-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 48-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 48-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 48-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 48-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 48-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 48-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 48-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 48-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 48-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 48-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 48-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 48-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 48-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 48-120(pA10,mldab5,2), rA0 + movaps 48-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 48-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 48-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 48-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 48-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 48-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 48-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 48-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 48-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 48-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 48-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 48-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 48-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 48-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 60 - movaps 64-120(pA10,mldab5,2), rA0 - movaps 64-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 64-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 64-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 64-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 64-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 64-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 64-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 64-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 64-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 64-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 64-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 64-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 64-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 64-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 64-120(pA10,mldab5,2), rA0 + movaps 64-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 64-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 64-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 64-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 64-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 64-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 64-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 64-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 64-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 64-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 64-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 64-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 64-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 64-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 64 - movaps 80-120(pA10,mldab5,2), rA0 - movaps 80-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 80-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 80-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 80-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 80-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 80-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 80-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 80-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 80-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 80-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 80-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 80-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 80-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 80-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 80-120(pA10,mldab5,2), rA0 + movaps 80-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 80-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 80-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 80-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 80-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 80-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 80-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 80-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 80-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 80-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 80-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 80-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 80-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 80-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 68 - movaps 96-120(pA10,mldab5,2), rA0 - movaps 96-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 96-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 96-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 96-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 96-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 96-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 96-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 96-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 96-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 96-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 96-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 96-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 96-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 96-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 96-120(pA10,mldab5,2), rA0 + movaps 96-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 96-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 96-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 96-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 96-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 96-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 96-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 96-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 96-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 96-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 96-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 96-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 96-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 96-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 72 - movaps 112-120(pA10,mldab5,2), rA0 - movaps 112-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 112-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 112-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 112-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 112-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 112-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 112-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 112-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 112-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 112-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 112-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 112-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 112-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 112-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 112-120(pA10,mldab5,2), rA0 + movaps 112-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 112-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 112-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 112-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 112-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 112-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 112-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 112-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 112-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 112-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 112-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 112-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 112-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 112-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 76 - movaps 128-120(pA10,mldab5,2), rA0 - movaps 128-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 128-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 128-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 128-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 128-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 128-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 128-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 128-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 128-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 128-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 128-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 128-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 128-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 128-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 128-120(pA10,mldab5,2), rA0 + movaps 128-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 128-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 128-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 128-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 128-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 128-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 128-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 128-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 128-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 128-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 128-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 128-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 128-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 128-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 80 - movaps 144-120(pA10,mldab5,2), rA0 - movaps 144-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 144-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 144-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 144-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 144-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 144-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 144-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 144-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 144-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 144-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 144-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 144-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 144-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 144-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 144-120(pA10,mldab5,2), rA0 + movaps 144-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 144-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 144-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 144-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 144-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 144-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 144-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 144-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 144-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 144-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 144-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 144-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 144-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 144-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif /*UKLOOP */ @@ -1220,234 +1227,234 @@ UMLOOP: * Get these bastard things summed up correctly */ - /* rC0 = c0a c0b c0c c0d */ - /* rC1 = c1a c1b c1c c1d */ - /* rC2 = c2a c2b c2c c2d */ - /* rC3 = c3a c3b c3c c3d */ + /* rC0 = c0a c0b c0c c0d */ + /* rC1 = c1a c1b c1c c1d */ + /* rC2 = c2a c2b c2c c2d */ + /* rC3 = c3a c3b c3c c3d */ /* */ - movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ - prefC((pC)) - prefC(64(pC)) - movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ - unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ - unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ - unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ - movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ - unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ - movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ - movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ - movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ - addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ - movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ - movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ - movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ - addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ - movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ - addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ - - - /* rC4 = c4a c4b c4c c4d */ - /* rC5 = c5a c5b c5c c5d */ - /* rC6 = c6a c6b c6c c6d */ - /* rC7 = c7a c7b c7c c7d */ - /* rC8 = c08a c08b c08c c08d */ - /* rC9 = c09a c09b c09c c09d */ - /* rC10 = c10a c10b c10c c10d */ - /* rC11 = c11a c11b c11c c11d */ - /* rC12 = c12a c12b c12c c12d */ - /* rC13 = c13a c13b c13c c13d */ + movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ + prefC((pC)) + prefC(64(pC)) + movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ + unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ + unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ + unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ + movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ + unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ + movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ + movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ + movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ + addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ + movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ + movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ + movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ + addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ + movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ + addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ + + + /* rC4 = c4a c4b c4c c4d */ + /* rC5 = c5a c5b c5c c5d */ + /* rC6 = c6a c6b c6c c6d */ + /* rC7 = c7a c7b c7c c7d */ + /* rC8 = c08a c08b c08c c08d */ + /* rC9 = c09a c09b c09c c09d */ + /* rC10 = c10a c10b c10c c10d */ + /* rC11 = c11a c11b c11c c11d */ + /* rC12 = c12a c12b c12c c12d */ + /* rC13 = c13a c13b c13c c13d */ /* */ - movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ - prefC(128(pC)) + movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ + prefC(128(pC)) #ifdef SREAL - pref2((pfA)) + pref2((pfA)) #else - prefC(192(pC)) + prefC(192(pC)) #endif - movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ - movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ - unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ - unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ - unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ - unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ - unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ - movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ - unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ - movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ - movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ - unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ - movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ - movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ - addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ + movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ + movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ + unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ + unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ + unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ + unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ + unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ + movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ + unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ + movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ + movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ + unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ + movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ + movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ + addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ #ifdef BETAX #ifdef SREAL - movups (pC), rA0 - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - movups 16(pC), rC4 - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movups 32(pC), rC5 - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - movlps 48(pC), rC1 - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - pref2(64(pfA)) - mulps BOF(%rsp), rA0 - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - mulps BOF(%rsp), rC4 - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - mulps BOF(%rsp), rC5 - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - mulps BOF(%rsp), rC1 + movups (pC), rA0 + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + movups 16(pC), rC4 + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movups 32(pC), rC5 + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + movlps 48(pC), rC1 + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + pref2(64(pfA)) + mulps BOF(%rsp), rA0 + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + mulps BOF(%rsp), rC4 + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + mulps BOF(%rsp), rC5 + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + mulps BOF(%rsp), rC1 /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - addps rA0, rC3 - addq $68, pfA - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - addps rC4, rC7 - addps rC5, rC11 - addps rC1, rC12 + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + addps rA0, rC3 + addq $68, pfA + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + addps rC4, rC7 + addps rC5, rC11 + addps rC1, rC12 #else /* BETA = X, complex type */ - movups (pC), rA0 - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - movups 16(pC), rC4 - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ - movups 32(pC), rC4 /* rC4 = c4 X c5 X */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movups 48(pC), rC5 /* rC5 = c6 X c7 X */ - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ - movups 64(pC), rC5 /* rC5 = c8 X c9 X */ - movups 80(pC), rC1 /* rC1 = c10 X c11 X */ - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - movss 96(pC), rC1 - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movss 104(pC), rB0 - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - unpcklps rB0, rC1 - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - prefC(256(pC)) - mulps BOF(%rsp), rA0 - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - mulps BOF(%rsp), rC4 - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - mulps BOF(%rsp), rC5 - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - mulps BOF(%rsp), rC1 + movups (pC), rA0 + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + movups 16(pC), rC4 + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ + movups 32(pC), rC4 /* rC4 = c4 X c5 X */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movups 48(pC), rC5 /* rC5 = c6 X c7 X */ + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ + movups 64(pC), rC5 /* rC5 = c8 X c9 X */ + movups 80(pC), rC1 /* rC1 = c10 X c11 X */ + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + movss 96(pC), rC1 + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movss 104(pC), rB0 + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + unpcklps rB0, rC1 + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + prefC(256(pC)) + mulps BOF(%rsp), rA0 + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + mulps BOF(%rsp), rC4 + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + mulps BOF(%rsp), rC5 + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + mulps BOF(%rsp), rC1 /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - addps rA0, rC3 - prefC(192(pC)) - addq $68, pfA - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - addps rC4, rC7 - addps rC5, rC11 - addps rC1, rC12 + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + addps rA0, rC3 + prefC(192(pC)) + addq $68, pfA + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + addps rC4, rC7 + addps rC5, rC11 + addps rC1, rC12 #endif #else - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ #ifdef SREAL - pref2(64(pfA)) + pref2(64(pfA)) #else - prefC(256(pC)) + prefC(256(pC)) #endif - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ #ifndef SREAL - prefC(192(pC)) + prefC(192(pC)) #endif - addq $68, pfA - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + addq $68, pfA + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ #endif /* * Write results back to C; pC += 14; */ #ifdef SREAL - movups rC3, (pC) - movups rC7, 16(pC) - movups rC11, 32(pC) - movlps rC12, 48(pC) - addq $56, pC + movups rC3, (pC) + movups rC7, 16(pC) + movups rC11, 32(pC) + movlps rC12, 48(pC) + addq $56, pC #else - movss rC3, (pC) - movss rC7, 32(pC) - movhlps rC3, rC0 - movhlps rC7, rC6 - movss rC0, 16(pC) - movss rC6, 48(pC) - shufps $0x55, rC3, rC3 - shufps $0x55, rC7, rC7 - movss rC3, 8(pC) - movss rC7, 40(pC) - shufps $0x55, rC0, rC0 - shufps $0x55, rC6, rC6 - movss rC0, 24(pC) - movss rC6, 56(pC) - - movss rC11, 64(pC) - movhlps rC11, rC2 - movss rC12, 96(pC) - movss rC2, 80(pC) - shufps $0x55, rC11, rC11 - shufps $0x55, rC12, rC12 - movss rC11, 72(pC) - shufps $0x55, rC2, rC2 - movss rC12, 104(pC) - movss rC2, 88(pC) + movss rC3, (pC) + movss rC7, 32(pC) + movhlps rC3, rC0 + movhlps rC7, rC6 + movss rC0, 16(pC) + movss rC6, 48(pC) + shufps $0x55, rC3, rC3 + shufps $0x55, rC7, rC7 + movss rC3, 8(pC) + movss rC7, 40(pC) + shufps $0x55, rC0, rC0 + shufps $0x55, rC6, rC6 + movss rC0, 24(pC) + movss rC6, 56(pC) + + movss rC11, 64(pC) + movhlps rC11, rC2 + movss rC12, 96(pC) + movss rC2, 80(pC) + shufps $0x55, rC11, rC11 + shufps $0x55, rC12, rC12 + movss rC11, 72(pC) + shufps $0x55, rC2, rC2 + movss rC12, 104(pC) + movss rC2, 88(pC) - addq $112, pC + addq $112, pC #endif /* * Write results back to C */ - addq $NB14so-176, pA5 - addq $NB14so-176, pA10 - subq $176, pB0 + addq $NB14so-176, pA5 + addq $NB14so-176, pA10 + subq $176, pB0 /* * pC += 14; pA += 14*NB; pB -= NB; */ /* * while (pA != stM); */ - subq $1, stM - jne UMLOOP + subq $1, stM + jne UMLOOP #endif /* @@ -1459,994 +1466,994 @@ MLAST: #endif /*UKLOOP: */ #ifdef BETA1 - movaps 0-120(pA10,mldab5,2), rC0 - movaps 0-120(pB0), rB0 - mulps rB0, rC0 - addss (pC), rC0 - movaps 0-120(pA5, mldab,4), rC1 - mulps rB0, rC1 - addss CMUL(4)(pC), rC1 - movaps 0-120(pA10, mldab,8), rC2 - mulps rB0, rC2 - addss CMUL(8)(pC), rC2 - movaps 0-120(pA5, mldab,2), rC3 - mulps rB0, rC3 - addss CMUL(12)(pC), rC3 - movaps 0-120(pA5, mldab), rC4 - mulps rB0, rC4 - addss CMUL(16)(pC), rC4 - movaps 0-120(pA5), rC5 - mulps rB0, rC5 - addss CMUL(20)(pC), rC5 - movaps 0-120(pA5, ldab), rC6 - mulps rB0, rC6 - addss CMUL(24)(pC), rC6 - movaps 0-120(pA5, ldab,2), rC7 - mulps rB0, rC7 - addss CMUL(28)(pC), rC7 - movaps 0-120(pA10, mldab,2), rC8 - mulps rB0, rC8 - addss CMUL(32)(pC), rC8 - movaps 0-120(pA5,ldab,4), rC9 - mulps rB0, rC9 - addss CMUL(36)(pC), rC9 - movaps 0-120(pA10), rC10 - mulps rB0, rC10 - addss CMUL(40)(pC), rC10 - movaps 0-120(pA10,ldab), rC11 - mulps rB0, rC11 - addss CMUL(44)(pC), rC11 - movaps 0-120(pA10,ldab,2), rC12 - mulps rB0, rC12 - addss CMUL(48)(pC), rC12 - movaps 0-120(pA5,ldab,8), rC13 - mulps rB0, rC13 - addss CMUL(52)(pC), rC13 + movaps 0-120(pA10,mldab5,2), rC0 + movaps 0-120(pB0), rB0 + mulps rB0, rC0 + addss (pC), rC0 + movaps 0-120(pA5, mldab,4), rC1 + mulps rB0, rC1 + addss CMUL(4)(pC), rC1 + movaps 0-120(pA10, mldab,8), rC2 + mulps rB0, rC2 + addss CMUL(8)(pC), rC2 + movaps 0-120(pA5, mldab,2), rC3 + mulps rB0, rC3 + addss CMUL(12)(pC), rC3 + movaps 0-120(pA5, mldab), rC4 + mulps rB0, rC4 + addss CMUL(16)(pC), rC4 + movaps 0-120(pA5), rC5 + mulps rB0, rC5 + addss CMUL(20)(pC), rC5 + movaps 0-120(pA5, ldab), rC6 + mulps rB0, rC6 + addss CMUL(24)(pC), rC6 + movaps 0-120(pA5, ldab,2), rC7 + mulps rB0, rC7 + addss CMUL(28)(pC), rC7 + movaps 0-120(pA10, mldab,2), rC8 + mulps rB0, rC8 + addss CMUL(32)(pC), rC8 + movaps 0-120(pA5,ldab,4), rC9 + mulps rB0, rC9 + addss CMUL(36)(pC), rC9 + movaps 0-120(pA10), rC10 + mulps rB0, rC10 + addss CMUL(40)(pC), rC10 + movaps 0-120(pA10,ldab), rC11 + mulps rB0, rC11 + addss CMUL(44)(pC), rC11 + movaps 0-120(pA10,ldab,2), rC12 + mulps rB0, rC12 + addss CMUL(48)(pC), rC12 + movaps 0-120(pA5,ldab,8), rC13 + mulps rB0, rC13 + addss CMUL(52)(pC), rC13 #else - movaps 0-120(pA10,mldab5,2), rC0 - movaps 0-120(pB0), rC13 - mulps rC13, rC0 - movaps 0-120(pA5, mldab,4), rC1 - mulps rC13, rC1 - movaps 0-120(pA10, mldab,8), rC2 - mulps rC13, rC2 - movaps 0-120(pA5, mldab,2), rC3 - mulps rC13, rC3 - movaps 0-120(pA5, mldab), rC4 - mulps rC13, rC4 - movaps 0-120(pA5), rC5 - mulps rC13, rC5 - movaps 0-120(pA5, ldab), rC6 - mulps rC13, rC6 - movaps 0-120(pA5, ldab,2), rC7 - mulps rC13, rC7 - movaps 0-120(pA10, mldab,2), rC8 - mulps rC13, rC8 - movaps 0-120(pA5,ldab,4), rC9 - mulps rC13, rC9 - movaps 0-120(pA10), rC10 - mulps rC13, rC10 - movaps 0-120(pA10,ldab), rC11 - mulps rC13, rC11 - movaps 0-120(pA10,ldab,2), rC12 - mulps rC13, rC12 - mulps 0-120(pA5,ldab,8), rC13 + movaps 0-120(pA10,mldab5,2), rC0 + movaps 0-120(pB0), rC13 + mulps rC13, rC0 + movaps 0-120(pA5, mldab,4), rC1 + mulps rC13, rC1 + movaps 0-120(pA10, mldab,8), rC2 + mulps rC13, rC2 + movaps 0-120(pA5, mldab,2), rC3 + mulps rC13, rC3 + movaps 0-120(pA5, mldab), rC4 + mulps rC13, rC4 + movaps 0-120(pA5), rC5 + mulps rC13, rC5 + movaps 0-120(pA5, ldab), rC6 + mulps rC13, rC6 + movaps 0-120(pA5, ldab,2), rC7 + mulps rC13, rC7 + movaps 0-120(pA10, mldab,2), rC8 + mulps rC13, rC8 + movaps 0-120(pA5,ldab,4), rC9 + mulps rC13, rC9 + movaps 0-120(pA10), rC10 + mulps rC13, rC10 + movaps 0-120(pA10,ldab), rC11 + mulps rC13, rC11 + movaps 0-120(pA10,ldab,2), rC12 + mulps rC13, rC12 + mulps 0-120(pA5,ldab,8), rC13 #endif #if KB > 4 - movaps 16-120(pA10,mldab5,2), rA0 - movaps 16-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 16-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 16-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 16-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 16-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 16-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 16-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 16-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 16-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 16-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 16-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 16-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 16-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 16-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 16-120(pA10,mldab5,2), rA0 + movaps 16-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 16-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 16-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 16-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 16-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 16-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 16-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 16-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 16-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 16-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 16-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 16-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 16-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 16-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 8 - movaps 32-120(pA10,mldab5,2), rA0 - movaps 32-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 32-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 32-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 32-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 32-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 32-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 32-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 32-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 32-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 32-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 32-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 32-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 32-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 32-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 32-120(pA10,mldab5,2), rA0 + movaps 32-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 32-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 32-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 32-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 32-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 32-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 32-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 32-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 32-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 32-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 32-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 32-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 32-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 32-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 12 - movaps 48-120(pA10,mldab5,2), rA0 - movaps 48-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 48-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 48-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 48-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 48-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 48-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 48-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 48-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 48-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 48-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 48-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 48-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 48-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 48-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 48-120(pA10,mldab5,2), rA0 + movaps 48-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 48-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 48-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 48-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 48-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 48-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 48-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 48-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 48-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 48-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 48-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 48-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 48-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 48-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 16 - movaps 64-120(pA10,mldab5,2), rA0 - movaps 64-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 64-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 64-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 64-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 64-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 64-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 64-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 64-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 64-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 64-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 64-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 64-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 64-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 64-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 64-120(pA10,mldab5,2), rA0 + movaps 64-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 64-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 64-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 64-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 64-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 64-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 64-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 64-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 64-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 64-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 64-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 64-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 64-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 64-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 20 - movaps 80-120(pA10,mldab5,2), rA0 - movaps 80-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 80-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 80-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 80-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 80-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 80-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 80-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 80-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 80-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 80-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 80-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 80-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 80-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 80-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 80-120(pA10,mldab5,2), rA0 + movaps 80-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 80-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 80-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 80-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 80-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 80-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 80-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 80-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 80-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 80-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 80-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 80-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 80-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 80-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 24 - movaps 96-120(pA10,mldab5,2), rA0 - movaps 96-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 96-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 96-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 96-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 96-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 96-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 96-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 96-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 96-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 96-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 96-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 96-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 96-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 96-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 96-120(pA10,mldab5,2), rA0 + movaps 96-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 96-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 96-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 96-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 96-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 96-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 96-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 96-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 96-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 96-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 96-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 96-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 96-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 96-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 28 - movaps 112-120(pA10,mldab5,2), rA0 - movaps 112-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 112-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 112-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 112-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 112-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 112-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 112-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 112-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 112-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 112-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 112-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 112-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 112-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 112-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 112-120(pA10,mldab5,2), rA0 + movaps 112-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 112-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 112-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 112-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 112-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 112-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 112-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 112-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 112-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 112-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 112-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 112-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 112-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 112-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 32 - movaps 128-120(pA10,mldab5,2), rA0 - movaps 128-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 128-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 128-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 128-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 128-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 128-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 128-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 128-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 128-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 128-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 128-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 128-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 128-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 128-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 128-120(pA10,mldab5,2), rA0 + movaps 128-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 128-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 128-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 128-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 128-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 128-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 128-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 128-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 128-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 128-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 128-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 128-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 128-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 128-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 36 - movaps 144-120(pA10,mldab5,2), rA0 - movaps 144-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 144-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 144-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 144-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 144-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 144-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 144-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 144-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 144-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 144-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 144-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 144-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 144-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 144-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 144-120(pA10,mldab5,2), rA0 + movaps 144-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 144-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 144-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 144-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 144-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 144-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 144-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 144-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 144-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 144-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 144-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 144-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 144-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 144-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif - prefB((pB,ldab)) - prefB(64(pB,ldab)) + prefB((pB,ldab)) + prefB(64(pB,ldab)) #if KB > 40 - movaps 160-120(pA10,mldab5,2), rA0 - movaps 160-120(pB0), rB0 - mulps rB0, rA0 - addq $176, pB0 - addps rA0, rC0 - movaps 160-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 160-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 160-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 160-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 160-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 160-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 160-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 160-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 160-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 160-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 160-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 160-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addq $176, pA10 - addps rA0, rC12 - mulps 160-120(pA5,ldab,8), rB0 - addps rB0, rC13 - addq $176, pA5 + movaps 160-120(pA10,mldab5,2), rA0 + movaps 160-120(pB0), rB0 + mulps rB0, rA0 + addq $176, pB0 + addps rA0, rC0 + movaps 160-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 160-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 160-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 160-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 160-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 160-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 160-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 160-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 160-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 160-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 160-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 160-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addq $176, pA10 + addps rA0, rC12 + mulps 160-120(pA5,ldab,8), rB0 + addps rB0, rC13 + addq $176, pA5 #else - addq $176, pB0 - addq $176, pA10 - addq $176, pA5 + addq $176, pB0 + addq $176, pA10 + addq $176, pA5 #endif #if KB > 44 - movaps 0-120(pA10,mldab5,2), rA0 - movaps 0-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 0-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 0-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 0-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 0-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 0-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 0-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 0-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 0-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 0-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 0-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 0-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 0-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 0-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 0-120(pA10,mldab5,2), rA0 + movaps 0-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 0-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 0-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 0-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 0-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 0-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 0-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 0-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 0-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 0-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 0-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 0-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 0-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 0-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 48 - movaps 16-120(pA10,mldab5,2), rA0 - movaps 16-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 16-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 16-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 16-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 16-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 16-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 16-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 16-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 16-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 16-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 16-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 16-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 16-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 16-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 16-120(pA10,mldab5,2), rA0 + movaps 16-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 16-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 16-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 16-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 16-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 16-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 16-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 16-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 16-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 16-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 16-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 16-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 16-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 16-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 52 - movaps 32-120(pA10,mldab5,2), rA0 - movaps 32-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 32-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 32-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 32-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 32-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 32-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 32-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 32-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 32-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 32-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 32-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 32-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 32-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 32-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 32-120(pA10,mldab5,2), rA0 + movaps 32-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 32-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 32-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 32-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 32-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 32-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 32-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 32-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 32-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 32-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 32-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 32-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 32-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 32-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 56 - movaps 48-120(pA10,mldab5,2), rA0 - movaps 48-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 48-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 48-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 48-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 48-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 48-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 48-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 48-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 48-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 48-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 48-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 48-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 48-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 48-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 48-120(pA10,mldab5,2), rA0 + movaps 48-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 48-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 48-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 48-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 48-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 48-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 48-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 48-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 48-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 48-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 48-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 48-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 48-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 48-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 60 - movaps 64-120(pA10,mldab5,2), rA0 - movaps 64-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 64-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 64-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 64-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 64-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 64-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 64-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 64-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 64-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 64-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 64-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 64-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 64-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 64-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 64-120(pA10,mldab5,2), rA0 + movaps 64-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 64-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 64-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 64-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 64-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 64-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 64-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 64-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 64-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 64-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 64-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 64-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 64-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 64-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif - prefB(128-176(pB,ldab)) - prefB(192-176(pB,ldab)) + prefB(128-176(pB,ldab)) + prefB(192-176(pB,ldab)) #if KB > 64 - movaps 80-120(pA10,mldab5,2), rA0 - movaps 80-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 80-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 80-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 80-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 80-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 80-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 80-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 80-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 80-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 80-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 80-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 80-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 80-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 80-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 80-120(pA10,mldab5,2), rA0 + movaps 80-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 80-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 80-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 80-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 80-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 80-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 80-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 80-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 80-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 80-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 80-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 80-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 80-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 80-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 68 - movaps 96-120(pA10,mldab5,2), rA0 - movaps 96-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 96-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 96-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 96-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 96-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 96-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 96-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 96-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 96-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 96-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 96-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 96-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 96-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 96-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 96-120(pA10,mldab5,2), rA0 + movaps 96-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 96-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 96-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 96-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 96-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 96-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 96-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 96-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 96-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 96-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 96-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 96-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 96-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 96-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 72 - movaps 112-120(pA10,mldab5,2), rA0 - movaps 112-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 112-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 112-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 112-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 112-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 112-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 112-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 112-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 112-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 112-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 112-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 112-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 112-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 112-120(pA5,ldab,8), rB0 - prefC((pC)) - prefC((pC,incCn)) - addps rB0, rC13 + movaps 112-120(pA10,mldab5,2), rA0 + movaps 112-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 112-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 112-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 112-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 112-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 112-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 112-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 112-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 112-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 112-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 112-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 112-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 112-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 112-120(pA5,ldab,8), rB0 + prefC((pC)) + prefC((pC,incCn)) + addps rB0, rC13 #else - prefC((pC)) - prefC((pC,incCn)) + prefC((pC)) + prefC((pC,incCn)) #endif #if KB > 76 - movaps 128-120(pA10,mldab5,2), rA0 - movaps 128-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 128-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 128-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 128-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 128-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 128-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 128-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 128-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 128-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 128-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 128-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 128-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 128-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 128-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 128-120(pA10,mldab5,2), rA0 + movaps 128-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 128-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 128-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 128-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 128-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 128-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 128-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 128-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 128-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 128-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 128-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 128-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 128-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 128-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif #if KB > 80 - movaps 144-120(pA10,mldab5,2), rA0 - movaps 144-120(pB0), rB0 - mulps rB0, rA0 - addps rA0, rC0 - movaps 144-120(pA5, mldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC1 - movaps 144-120(pA10, mldab,8), rA0 - mulps rB0, rA0 - addps rA0, rC2 - movaps 144-120(pA5, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC3 - movaps 144-120(pA5, mldab), rA0 - mulps rB0, rA0 - addps rA0, rC4 - movaps 144-120(pA5), rA0 - mulps rB0, rA0 - addps rA0, rC5 - movaps 144-120(pA5, ldab), rA0 - mulps rB0, rA0 - addps rA0, rC6 - movaps 144-120(pA5, ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC7 - movaps 144-120(pA10, mldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC8 - movaps 144-120(pA5,ldab,4), rA0 - mulps rB0, rA0 - addps rA0, rC9 - movaps 144-120(pA10), rA0 - mulps rB0, rA0 - addps rA0, rC10 - movaps 144-120(pA10,ldab), rA0 - mulps rB0, rA0 - addps rA0, rC11 - movaps 144-120(pA10,ldab,2), rA0 - mulps rB0, rA0 - addps rA0, rC12 - mulps 144-120(pA5,ldab,8), rB0 - addps rB0, rC13 + movaps 144-120(pA10,mldab5,2), rA0 + movaps 144-120(pB0), rB0 + mulps rB0, rA0 + addps rA0, rC0 + movaps 144-120(pA5, mldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC1 + movaps 144-120(pA10, mldab,8), rA0 + mulps rB0, rA0 + addps rA0, rC2 + movaps 144-120(pA5, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC3 + movaps 144-120(pA5, mldab), rA0 + mulps rB0, rA0 + addps rA0, rC4 + movaps 144-120(pA5), rA0 + mulps rB0, rA0 + addps rA0, rC5 + movaps 144-120(pA5, ldab), rA0 + mulps rB0, rA0 + addps rA0, rC6 + movaps 144-120(pA5, ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC7 + movaps 144-120(pA10, mldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC8 + movaps 144-120(pA5,ldab,4), rA0 + mulps rB0, rA0 + addps rA0, rC9 + movaps 144-120(pA10), rA0 + mulps rB0, rA0 + addps rA0, rC10 + movaps 144-120(pA10,ldab), rA0 + mulps rB0, rA0 + addps rA0, rC11 + movaps 144-120(pA10,ldab,2), rA0 + mulps rB0, rA0 + addps rA0, rC12 + mulps 144-120(pA5,ldab,8), rB0 + addps rB0, rC13 #endif /*UKLOOP */ @@ -2454,202 +2461,202 @@ MLAST: * Get these bastard things summed up correctly */ - /* rC0 = c0a c0b c0c c0d */ - /* rC1 = c1a c1b c1c c1d */ - /* rC2 = c2a c2b c2c c2d */ - /* rC3 = c3a c3b c3c c3d */ + /* rC0 = c0a c0b c0c c0d */ + /* rC1 = c1a c1b c1c c1d */ + /* rC2 = c2a c2b c2c c2d */ + /* rC3 = c3a c3b c3c c3d */ /* */ - movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ - prefC(64(pC,incCn)) - prefB(256-176(pB,ldab)) - movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ - unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ - unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ - unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ - movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ - unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ - movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ - movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ - movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ - addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ - movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ - movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ - movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ - addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ - movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ - addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ - - - /* rC4 = c4a c4b c4c c4d */ - /* rC5 = c5a c5b c5c c5d */ - /* rC6 = c6a c6b c6c c6d */ - /* rC7 = c7a c7b c7c c7d */ - /* rC8 = c08a c08b c08c c08d */ - /* rC9 = c09a c09b c09c c09d */ - /* rC10 = c10a c10b c10c c10d */ - /* rC11 = c11a c11b c11c c11d */ - /* rC12 = c12a c12b c12c c12d */ - /* rC13 = c13a c13b c13c c13d */ + movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ + prefC(64(pC,incCn)) + prefB(256-176(pB,ldab)) + movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ + unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ + unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ + unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ + movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ + unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ + movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ + movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ + movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ + addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ + movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ + movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ + movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ + addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ + movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ + addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ + + + /* rC4 = c4a c4b c4c c4d */ + /* rC5 = c5a c5b c5c c5d */ + /* rC6 = c6a c6b c6c c6d */ + /* rC7 = c7a c7b c7c c7d */ + /* rC8 = c08a c08b c08c c08d */ + /* rC9 = c09a c09b c09c c09d */ + /* rC10 = c10a c10b c10c c10d */ + /* rC11 = c11a c11b c11c c11d */ + /* rC12 = c12a c12b c12c c12d */ + /* rC13 = c13a c13b c13c c13d */ /* */ - movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ - movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ - movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ - unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ - unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ - unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ - unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ - unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ - movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ - unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ - movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ - movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ - unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ - movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ - movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ - addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ + movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ + movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ + movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ + unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ + unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ + unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ + unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ + unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ + movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ + unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ + movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ + movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ + unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ + movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ + movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ + addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ #ifdef BETAX #ifdef SREAL - movups (pC), rA0 - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - movups 16(pC), rC4 - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movups 32(pC), rC5 - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - movlps 48(pC), rC1 - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - mulps BOF(%rsp), rA0 - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - mulps BOF(%rsp), rC4 - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - mulps BOF(%rsp), rC5 - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - mulps BOF(%rsp), rC1 + movups (pC), rA0 + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + movups 16(pC), rC4 + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movups 32(pC), rC5 + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + movlps 48(pC), rC1 + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + mulps BOF(%rsp), rA0 + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + mulps BOF(%rsp), rC4 + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + mulps BOF(%rsp), rC5 + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + mulps BOF(%rsp), rC1 /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - addps rA0, rC3 - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - addps rC4, rC7 - addps rC5, rC11 - prefB(320-176(pB,ldab)) - addps rC1, rC12 + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + addps rA0, rC3 + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + addps rC4, rC7 + addps rC5, rC11 + prefB(320-176(pB,ldab)) + addps rC1, rC12 #else /* BETA = X, complex type */ - movups (pC), rA0 - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - movups 16(pC), rC4 - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ - movups 32(pC), rC4 /* rC4 = c4 X c5 X */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movups 48(pC), rC5 /* rC5 = c6 X c7 X */ - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ - movups 64(pC), rC5 /* rC5 = c8 X c9 X */ - movups 80(pC), rC1 /* rC1 = c10 X c11 X */ - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - movss 96(pC), rC1 - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movss 104(pC), rB0 - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - unpcklps rB0, rC1 - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - mulps BOF(%rsp), rA0 - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - mulps BOF(%rsp), rC4 - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - mulps BOF(%rsp), rC5 - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - mulps BOF(%rsp), rC1 + movups (pC), rA0 + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + movups 16(pC), rC4 + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ + movups 32(pC), rC4 /* rC4 = c4 X c5 X */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movups 48(pC), rC5 /* rC5 = c6 X c7 X */ + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ + movups 64(pC), rC5 /* rC5 = c8 X c9 X */ + movups 80(pC), rC1 /* rC1 = c10 X c11 X */ + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + movss 96(pC), rC1 + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movss 104(pC), rB0 + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + unpcklps rB0, rC1 + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + mulps BOF(%rsp), rA0 + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + mulps BOF(%rsp), rC4 + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + mulps BOF(%rsp), rC5 + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + mulps BOF(%rsp), rC1 /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - addps rA0, rC3 - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - addps rC4, rC7 - addps rC5, rC11 - prefB(320-176(pB,ldab)) - addps rC1, rC12 + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + addps rA0, rC3 + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + addps rC4, rC7 + addps rC5, rC11 + prefB(320-176(pB,ldab)) + addps rC1, rC12 #endif #else - movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ - unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ - movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ - movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ - movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ - movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ - unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ - addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ - addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ - movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ - unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ - movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ - addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ - addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ - addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ + movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ + unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ + movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ + movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ + movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ + movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ + unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ + addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ + addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ + movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ + unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ + movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ + addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ + addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ + addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ + addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ /* */ - movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - prefB(320-176(pB,ldab)) - addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ + movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ + prefB(320-176(pB,ldab)) + addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ #endif /* * Write results back to C; pC += 14; */ #ifdef SREAL - movups rC3, (pC) - movups rC7, 16(pC) - movups rC11, 32(pC) - movlps rC12, 48(pC) -/* addq $56, pC */ + movups rC3, (pC) + movups rC7, 16(pC) + movups rC11, 32(pC) + movlps rC12, 48(pC) +/* addq $56, pC */ #else - movss rC3, (pC) - movss rC7, 32(pC) - movhlps rC3, rC0 - movhlps rC7, rC6 - movss rC0, 16(pC) - movss rC6, 48(pC) - shufps $0x55, rC3, rC3 - shufps $0x55, rC7, rC7 - movss rC3, 8(pC) - movss rC7, 40(pC) - shufps $0x55, rC0, rC0 - shufps $0x55, rC6, rC6 - movss rC0, 24(pC) - movss rC6, 56(pC) - - movss rC11, 64(pC) - movhlps rC11, rC2 - movss rC12, 96(pC) - movss rC2, 80(pC) - shufps $0x55, rC11, rC11 - shufps $0x55, rC12, rC12 - movss rC11, 72(pC) - shufps $0x55, rC2, rC2 - movss rC12, 104(pC) - movss rC2, 88(pC) + movss rC3, (pC) + movss rC7, 32(pC) + movhlps rC3, rC0 + movhlps rC7, rC6 + movss rC0, 16(pC) + movss rC6, 48(pC) + shufps $0x55, rC3, rC3 + shufps $0x55, rC7, rC7 + movss rC3, 8(pC) + movss rC7, 40(pC) + shufps $0x55, rC0, rC0 + shufps $0x55, rC6, rC6 + movss rC0, 24(pC) + movss rC6, 56(pC) + + movss rC11, 64(pC) + movhlps rC11, rC2 + movss rC12, 96(pC) + movss rC2, 80(pC) + shufps $0x55, rC11, rC11 + shufps $0x55, rC12, rC12 + movss rC11, 72(pC) + shufps $0x55, rC2, rC2 + movss rC12, 104(pC) + movss rC2, 88(pC) -/* addq $112, pC */ +/* addq $112, pC */ #endif /* * Write results back to C @@ -2660,55 +2667,55 @@ MLAST: /* * while (pA != stM); */ -/* subq $1, stM */ -/* jne UMLOOP */ +/* subq $1, stM */ +/* jne UMLOOP */ /* * pC += 14; pA += 14*NB; pB -= NB; */ -/* subq $MBKBso-NB14so+176, pA5 */ -/* subq $MBKBso-NB14so+176, pA10 */ - subq incAm, pA5 - subq incAm, pA10 - addq $NBso-176, pB0 +/* subq $MBKBso-NB14so+176, pA5 */ +/* subq $MBKBso-NB14so+176, pA10 */ + subq incAm, pA5 + subq incAm, pA10 + addq $NBso-176, pB0 /* * while (pA != stM); */ -/* subq $1, stM */ -/* jne UMLOOP */ +/* subq $1, stM */ +/* jne UMLOOP */ /* * pC += incCn; pA -= NBNB; pB += NB; */ - addq incCn, pC + addq incCn, pC /* * while (pB != stN); */ - sub $1, stN - jne UNLOOP + sub $1, stN + jne UNLOOP /* * Restore callee-saved iregs */ DONE: - movq -8(%rsp), %rbp - movq -16(%rsp), %rbx + movq -8(%rsp), %rbp + movq -16(%rsp), %rbx #if MB == 0 - movq -32(%rsp), %r12 - movq -40(%rsp), %r13 + movq -32(%rsp), %r12 + movq -40(%rsp), %r13 #endif - ret + ret #if MB == 0 MB_LT84: - cmp $70, stM - jne MB_LT70 -/* movq $70/14, stM */ - movq $5, stM - jmp MBFOUND + cmp $70, stM + jne MB_LT70 +/* movq $70/14, stM */ + movq $5, stM + jmp MBFOUND MB_LT70: - cmp $56, stM - jne MB_LT56 -/* movq $56/14, stM */ - movq $4, stM - jmp MBFOUND + cmp $56, stM + jne MB_LT56 +/* movq $56/14, stM */ + movq $4, stM + jmp MBFOUND MB_LT56: cmp $42, stM jne MB_LT42 diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c --- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100 +++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100 @@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i /* * Handle all special alpha cases */ - fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); + /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); fprintf(fpout, "%s{\n", spc); if (pre == 'c' || pre == 'z') { @@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i } else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc); fprintf(fpout, "%s return;\n", spc); - fprintf(fpout, "%s}\n", spc); + fprintf(fpout, "%s}\n", spc); */ GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib); GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib); if (pre == 'c' || pre == 'z')