现充|junyu33

High-performance implementation of multiplication in finite fields on the Intel instruction set

/* multiplication in galois field with reduction */
/*
 * gfmul: multiply a and b in GF(2^128), reducing the 256-bit carry-less
 * product modulo the irreducible polynomial x^128 + x^7 + x^2 + x + 1.
 *
 *   a, b : field elements; bit i of the 128-bit value is the coefficient of x^i
 *   res  : out-parameter receiving the reduced 128-bit product
 *
 * Fixes vs. the original:
 *  - `static` added: a bare C99 `inline` definition provides no external
 *    symbol, so a non-inlined call (e.g. at -O0) fails to link.
 *  - forward declaration of mul128 added, since mul128 is defined later
 *    in this file and was previously used without any declaration.
 */
static void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2);

#ifdef __x86_64__
__attribute__((target("sse2,pclmul")))
#endif
static inline void gfmul (__m128i a, __m128i b, __m128i *res){
	__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
	/* mask selecting only the lowest 32-bit lane of a 128-bit vector */
	__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
	/* 256-bit carry-less product: low half -> tmp3, high half -> tmp6 */
	mul128(a, b, &tmp3, &tmp6);
	/* tmp7 = T = srli(tmp6,31) ^ srli(tmp6,30) ^ srli(tmp6,25): per 32-bit
	 * lane, the bits that the lanewise <<1, <<2, <<7 folds below shift out. */
	tmp7 = _mm_srli_epi32(tmp6, 31);
	tmp8 = _mm_srli_epi32(tmp6, 30);
	tmp9 = _mm_srli_epi32(tmp6, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	/* rotate the 128-bit value left by 32 bits (lane k -> lane k+1, lane 3
	 * wraps to lane 0); imm 147 = 0b10'01'00'11 */
	tmp8 = _mm_shuffle_epi32(tmp7, 147);

	tmp7 = _mm_and_si128(XMMMASK, tmp8);    /* tmp7 = T >> 96 (wrapped lane) */
	tmp8 = _mm_andnot_si128(XMMMASK, tmp8); /* tmp8 = T << 32 (other lanes) */
	tmp3 = _mm_xor_si128(tmp3, tmp8);       /* tmp3 = (T << 32) ^ tmp3 */
	tmp6 = _mm_xor_si128(tmp6, tmp7);       /* tmp6 = (T >> 96) ^ tmp6 */
	/* Fold the (corrected) high half back in as tmp6 * (x + x^2 + x^7);
	 * together with the unshifted tmp6 XORed in at the end this implements
	 * x^128 == x^7 + x^2 + x + 1 (mod the field polynomial). */
	tmp10 = _mm_slli_epi32(tmp6, 1);
	tmp3 = _mm_xor_si128(tmp3, tmp10);
	tmp11 = _mm_slli_epi32(tmp6, 2);
	tmp3 = _mm_xor_si128(tmp3, tmp11);
	tmp12 = _mm_slli_epi32(tmp6, 7);
	tmp3 = _mm_xor_si128(tmp3, tmp12);

	*res = _mm_xor_si128(tmp3, tmp6);
}

Prerequisite

The irreducible polynomial here is $x^{128}+x^{7}+x^{2}+x+1$.

mul128

First, we discuss the multiplication of two 128-bit polynomials without reduction. For convenience, we convert polynomials into their corresponding binary numbers. For example, x4+x3+x+1 corresponds to the binary 0b11011, which is 27.

Intel has an instruction specifically for multiplication in $\mathbb{F}_{2^n}$ (i.e., carry-less multiplication): `__m128i _mm_clmulepi64_si128(__m128i a, __m128i b, const int imm8);`. The immediate `imm8` selects which 64-bit half of each operand is multiplied: bit 0 picks the low (0) or high (1) half of `a`, and bit 4 does the same for `b`.

We can implement 128-bit carry-less multiplication using 64-bit carry-less multiplication with the following steps:

/* multiplication in galois field without reduction */
/*
 * mul128: 128x128 -> 256-bit carry-less (schoolbook) product using four
 * 64-bit PCLMULQDQ multiplies.
 *
 *   a, b  : 128-bit polynomial operands
 *   res1  : receives the low 128 bits of the 256-bit product
 *   res2  : receives the high 128 bits
 *
 * Fixes vs. the original:
 *  - the `#ifdef __x86_64__` guard around the target attribute was missing
 *    its `#endif`, which broke compilation of the whole file;
 *  - `static` added: a bare C99 `inline` definition provides no external
 *    symbol, so a non-inlined call fails to link.
 */
#ifdef __x86_64__
__attribute__((target("sse2,pclmul")))
#endif
static inline void mul128(__m128i a, __m128i b, __m128i *res1, __m128i *res2) {
	__m128i tmp3, tmp4, tmp5, tmp6;
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00); /* a_lo * b_lo */
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10); /* a_lo * b_hi */
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01); /* a_hi * b_lo */
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11); /* a_hi * b_hi */

	/* Combine the two middle (cross) terms, then split the 128-bit sum:
	 * its low 64 bits belong to the high half of the low 128-bit word,
	 * its high 64 bits to the low half of the high 128-bit word. */
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8); /* byte-shift: cross_lo << 64 */
	tmp4 = _mm_srli_si128(tmp4, 8); /* byte-shift: cross_hi >> 64 */
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);
	// initial mul now in tmp3 (low), tmp6 (high)
	*res1 = tmp3;
	*res2 = tmp6;
}

Below is the proof of this algorithm:

Proof

Let the high and low parts of the multiplicand A be a1 and a2, respectively, and similarly, the high and low parts of the multiplier B be b1 and b2. In the first half of the algorithm, we have:

$tmp6 = a_1 b_1$
$tmp4 = a_2 b_1$
$tmp5 = a_1 b_2$
$tmp3 = a_2 b_2$

(Per the `imm8` semantics, `0x10` multiplies the low half of `a` by the high half of `b`, so $tmp4 = a_2 b_1$, and `0x01` gives $tmp5 = a_1 b_2$; since the two cross terms are only ever XORed together, their order is immaterial.)

In the second half, we have:

$tmp5 \leftarrow 2^{64}\big((tmp4 + tmp5) \bmod 2^{64}\big)$
$tmp4 \leftarrow \big\lfloor (tmp4 + tmp5)/2^{64} \big\rfloor$
$tmp3 \leftarrow 2^{64}\big((tmp4 + tmp5) \bmod 2^{64}\big) + tmp3$
$tmp6 \leftarrow \big\lfloor (tmp4 + tmp5)/2^{64} \big\rfloor + tmp6$

(All additions here are coefficient-wise over $\mathbb{F}_2$, i.e. XOR; $tmp4$ and $tmp5$ on the right-hand sides denote their values before this step.)

Therefore:

$2^{128}\,res2 + res1$
$= 2^{128}\,tmp6' + tmp3'$
$= 2^{128}\,tmp6 + 2^{128}\big\lfloor (tmp4+tmp5)/2^{64} \big\rfloor + 2^{64}\big((tmp4+tmp5) \bmod 2^{64}\big) + tmp3$
$= 2^{128}\,tmp6 + 2^{64}(tmp4+tmp5) + tmp3$

(using $2^{64}\lfloor y/2^{64}\rfloor + (y \bmod 2^{64}) = y$ with $y = tmp4+tmp5$; primes denote the final values.)

And:

$AB$
$= (2^{64}a_1 + a_2)(2^{64}b_1 + b_2)$
$= 2^{128}a_1 b_1 + 2^{64}(a_1 b_2 + a_2 b_1) + a_2 b_2$
$= 2^{128}\,tmp6 + 2^{64}(tmp4+tmp5) + tmp3$

Hence, $2^{128}\,res2 + res1 = AB$.

gfmul

In practice, we also need to reduce the result after carry-less multiplication modulo $x^{128}+x^{7}+x^{2}+x+1$. Since SageMath has specialized tools for finite field operations, we can write a SageMath program to verify the result:

# --- SageMath verification script ---
# NOTE: this is Sage, not plain Python: the generator syntax `P.<x> = ...`
# and the `^` exponentiation operator require the Sage preparser.

# Define the polynomial ring over GF(2)
P.<x> = PolynomialRing(GF(2))

# Define the modulus polynomial — the same irreducible polynomial used by
# the C implementation: x^128 + x^7 + x^2 + x + 1
mod_poly = x^128 + x^7 + x^2 + x + 1

# Define the finite field GF(2^128)
K.<a> = GF(2**128, modulus=mod_poly)

# Convert an integer to a polynomial: bit i of n becomes the coefficient of x^i
def int_to_poly(n):
    bin_str = bin(n)[2:]  # Convert integer to binary string, remove '0b' prefix
    # reversed() puts the least-significant bit first, pairing bit i with x^i
    return sum([int(bit) * x^i for i, bit in enumerate(reversed(bin_str))])

# Convert a polynomial back to an integer (inverse of int_to_poly;
# p.list() yields coefficients from degree 0 upward)
def poly_to_int(p):
    return sum([int(coeff) * (2**i) for i, coeff in enumerate(p.list())])

# Example: Elements represented as integers
f_int, g_int = 98195696920426533817649554218743231661, 43027262476631949179376797970948942433

# Convert to polynomials
f_poly = int_to_poly(f_int)
g_poly = int_to_poly(g_int)

# Perform operations in GF(2^128)
f = K(f_poly)
g = K(g_poly)

# Example operations: addition is coefficient-wise XOR; multiplication is
# reduced modulo mod_poly by the field arithmetic itself
add_result = f + g
mul_result = f * g

# Convert results back to integers
add_result_int = poly_to_int(add_result.polynomial())
mul_result_int = poly_to_int(mul_result.polynomial())

print("f (as int) =", f_int)
print("g (as int) =", g_int)
print("f + g (as int) =", add_result_int)
print("f * g (as int) =", mul_result_int)
# Expected output of a run:
'''
f (as int) = 98195696920426533817649554218743231661
g (as int) = 43027262476631949179376797970948942433
f + g (as int) = 140241067422779931769655503331313065676
f * g (as int) = 30853704161780158484268560045100192027
'''

Once we have a proper validation program, we can analyze and debug the code provided at the beginning of the article.

Proof

Let us first analyze how to represent the reduced product $a \cdot b$ in $\mathbb{F}_{2^{128}}$ using bitwise operations on $tmp3$ and $tmp6$:

tmp3, tmp6 = mul128(a, b)

// Assume T = ((tmp6 >> 127) ^ (tmp6 >> 126) ^ (tmp6 >> 121))

// Here, nr() denotes "not reduced"
res = nr(tmp3 ^ ((tmp6) << 128))
    = tmp3 ^ nr(tmp6 << 128)
    = tmp3 ^ nr(tmp6) ^ nr(tmp6 << 1) ^ nr(tmp6 << 2) ^ nr(tmp6 << 7)
    = tmp3 ^ nr(tmp6) ^ (nr(tmp6) << 1) ^ (nr(tmp6) << 2) ^ (nr(tmp6) << 7)
    // Since tmp6 >> 128 is definitely 0, it is omitted here
    = tmp3 ^ (tmp6 ^ T)
           ^ ((tmp6 ^ T) << 1)
           ^ ((tmp6 ^ T) << 2)
           ^ ((tmp6 ^ T) << 7)

However, for computational convenience, we take T = ((tmp6 >> 31) ^ (tmp6 >> 30) ^ (tmp6 >> 25)). Therefore, we have:

def gfmul(a, b):
    """Multiply a and b in GF(2^128), reduced modulo x^128 + x^7 + x^2 + x + 1.

    Pure-Python reference model of the SSE routine: arbitrary-precision ints
    stand in for the 128-bit registers, with bit i <-> coefficient of x^i.
    """
    def mul128(a, b):
        # 128x128 -> 256-bit carry-less product (addition is XOR), returned
        # as (low 128 bits, high 128 bits); models the C mul128 above.
        prod = 0
        for i in range(128):
            if (b >> i) & 1:
                prod ^= a << i
        return prod & ((1 << 128) - 1), prod >> 128

    tmp3, tmp6 = mul128(a, b)
    # (T >> 96) equals (tmp6 >> 127) ^ (tmp6 >> 126) ^ (tmp6 >> 121), the
    # overflow bits produced by the <<1, <<2, <<7 folds below.
    T = ((tmp6 >> 31) ^ (tmp6 >> 30) ^ (tmp6 >> 25))

    res = tmp3 ^ \
      (T >> 96) ^ tmp6 ^ \
      (((T >> 96) ^ tmp6) << 1) ^ \
      (((T >> 96) ^ tmp6) << 2) ^ \
      (((T >> 96) ^ tmp6) << 7)
    # Truncate to 128 bits with a bitmask. The original `% (2 ** 128 - 1)`
    # was a bug: modulo 2^128 - 1 is not bit truncation (the corrected
    # version later in the article already uses `&`).
    return res & (2 ** 128 - 1)

After comparison with sagemath, their outputs are completely consistent.

Instruction Set Implementation

However, in 128-bit numbers, there is no direct operation to shift a 128-bit number right or left. Using C language's << or >> operators will only shift the four 32-bit vectors that make up __m128i together, thus failing to achieve the correct result.

In fact, these two operators correspond to the _mm_slli_epi32 and _mm_srli_epi32 instructions. The syntax of the former is as follows, and the latter is similar:

__m128i _mm_slli_epi32(__m128i a, int imm8);

The _mm_slli_epi32 instruction performs a logical left shift on each 32-bit integer in register a, with the number of bits to shift specified by the immediate value imm8. All integers are shifted left by the same number of bits, with the high bits shifted out being discarded and the low bits filled with zeros.

Through relevant derivations, for any 128-bit number x, we have the following identity (0<i<32):

x << i === _mm_slli_epi32(x, i) ^ (_mm_srli_epi32(x, 32-i) << 32)

The proof is relatively straightforward and is omitted here.

Therefore, we can modify the original function:

def gfmul(a, b):
    # Pure-Python model of the SSE implementation; arbitrary-precision ints
    # stand in for the 128-bit registers. Requires mul128 (defined above).
    tmp3, tmp6 = mul128(a, b)
    T = ((tmp6 >> 31) ^ (tmp6 >> 30) ^ (tmp6 >> 25))
    # Lanewise SSE equivalent:
    # T = (srli(tmp6, 31) ^ srli(tmp6, 30) ^ srli(tmp6, 25))

    res = tmp3 ^ \
      (T >> 96) ^ tmp6 ^ \
      (((T >> 96) ^ tmp6) << 1) ^ \
      (((T >> 96) ^ tmp6) << 2) ^ \
      (((T >> 96) ^ tmp6) << 7)

    # Rewriting each full-width shift with the identity
    #   x << i == slli(x, i) ^ (srli(x, 32-i) << 32)   (0 < i < 32)
    # gives the lanewise form:
    #   res = tmp3 ^ tmp6 ^
    #         slli(tmp6, 1) ^ (srli(tmp6, 31) << 32) ^
    #         slli(tmp6, 2) ^ (srli(tmp6, 30) << 32) ^
    #         slli(tmp6, 7) ^ (srli(tmp6, 25) << 32) ^
    #         (T >> 96) ^ ((T >> 96) << 1) ^ ((T >> 96) << 2) ^ ((T >> 96) << 7)
    # Here, since (T >> 96) << 7 itself has at most 14 significant bits,
    # which is less than 32, the slli/srli split is not needed for the
    # (T >> 96) terms. Regrouping (the srli terms reassemble into T << 32):
    #   res = (T >> 96) ^ tmp6 ^
    #         slli(tmp6 ^ (T >> 96), 1) ^
    #         slli(tmp6 ^ (T >> 96), 2) ^
    #         slli(tmp6 ^ (T >> 96), 7) ^
    #         (T << 32) ^ tmp3

    return res & (2 ** 128 - 1)

Now we can return to the source code mentioned at the beginning of this article:

/* Annotated copy of the gfmul from the top of the article: each step is
 * mapped to the term it computes in the derivation above. */
void gfmul (__m128i a, __m128i b, __m128i *res){
	__m128i tmp3, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
	/* mask selecting only the lowest 32-bit lane */
	__m128i XMMMASK = _mm_setr_epi32(0xffffffff, 0x0, 0x0, 0x0);
	mul128(a, b, &tmp3, &tmp6);

	tmp7 = _mm_srli_epi32(tmp6, 31);
	tmp8 = _mm_srli_epi32(tmp6, 30);
	tmp9 = _mm_srli_epi32(tmp6, 25);
	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	// tmp7 = T = (srli(tmp6, 31) ^ srli(tmp6, 30) ^ srli(tmp6, 25))
	// imm 147 rotates the 128-bit value left by 32 bits
	// (lane k -> lane k+1, lane 3 wraps to lane 0)
	tmp8 = _mm_shuffle_epi32(tmp7, 147);

	tmp7 = _mm_and_si128(XMMMASK, tmp8);
	// tmp7 = T >> 96  (only the wrapped-around lane)
	tmp8 = _mm_andnot_si128(XMMMASK, tmp8);
	// tmp8 = T << 32  (the remaining three lanes)

	tmp3 = _mm_xor_si128(tmp3, tmp8);
	// tmp3 = ((T << 32) ^ tmp3)
	tmp6 = _mm_xor_si128(tmp6, tmp7);
	// tmp6 = ((T >> 96) ^ tmp6)

	tmp10 = _mm_slli_epi32(tmp6, 1);
	tmp3 = _mm_xor_si128(tmp3, tmp10);
	tmp11 = _mm_slli_epi32(tmp6, 2);
	tmp3 = _mm_xor_si128(tmp3, tmp11);
	tmp12 = _mm_slli_epi32(tmp6, 7);
	tmp3 = _mm_xor_si128(tmp3, tmp12);

	*res = _mm_xor_si128(tmp3, tmp6);
	// res = ((T << 32) ^ tmp3)
	//     ^ slli(((T >> 96) ^ tmp6), 1)
	//     ^ slli(((T >> 96) ^ tmp6), 2)
	//     ^ slli(((T >> 96) ^ tmp6), 7)
	//     ^ ((T >> 96) ^ tmp6)
	// NOTE(review): the original comment ended each line with a backslash,
	// splicing the following code-less lines into one comment (-Wcomment);
	// rewritten as plain `//` lines.
}

Thus, we have achieved the goal of performing multiplication in $\mathbb{F}_{2^{128}}$ using Intel instruction sets.