1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* multiplication in galois field with reduction */
#ifdef __x86_64__
__attribute__((target("sse2,pclmul")))
#endif
inline void gfmul (__m128i a, __m128i b, __m128i *res){
__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
mul128(a, b, &tmp3, &tmp6);
tmp7 = _mm_srli_epi32(tmp6, 31);
tmp8 = _mm_srli_epi32(tmp6, 30);
tmp9 = _mm_srli_epi32(tmp6, 25);
tmp7 = _mm_xor_si128(tmp7, tmp8);
tmp7 = _mm_xor_si128(tmp7, tmp9);
tmp8 = _mm_shuffle_epi32(tmp7, 147);

tmp7 = _mm_and_si128(XMMMASK, tmp8);
tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
tmp3 = _mm_xor_si128(tmp3, tmp8);
tmp6 = _mm_xor_si128(tmp6, tmp7);
tmp10 = _mm_slli_epi32(tmp6, 1);
tmp3 = _mm_xor_si128(tmp3, tmp10);
tmp11 = _mm_slli_epi32(tmp6, 2);
tmp3 = _mm_xor_si128(tmp3, tmp11);
tmp12 = _mm_slli_epi32(tmp6, 7);
tmp3 = _mm_xor_si128(tmp3, tmp12);

*res = _mm_xor_si128(tmp3, tmp6);
}
阅读全文 »

事情要从这段代码说起:

1
2
3
4
5
// gcc test.c -o test -fwrapv
int64_t multimod_fast(int64_t a, int64_t b, int64_t m) {
int64_t t = (a * b - (int64_t)((double)a * b / m) * m) % m;
return t < 0 ? t + m : t;
}

昨天晚上,有人给我发了这段据说可以替代快速乘的代码,让我解释这段代码的正确性。这段代码可以把时间复杂度从降到。显然,我们都知道int64_tdouble之间的强制转换会丢失精度,因此我对这段代码的正确性产生了怀疑。

阅读全文 »
0%