Rasagar/Library/PackageCache/com.unity.collections/Unity.Collections/xxHash3.AVX2.cs
2024-08-26 23:07:20 +03:00

136 lines
5.6 KiB
C#

using Unity.Burst.Intrinsics;
using Unity.Collections.LowLevel.Unsafe;
namespace Unity.Collections
{
[GenerateTestsForBurstCompatibility]
public static partial class xxHash3
{
/// <summary>
/// AVX2 path of the long-input loop: accumulates <paramref name="length"/> bytes of
/// <paramref name="input"/> into <paramref name="acc"/> and, when <paramref name="dest"/> is not null,
/// copies the input bytes to <paramref name="dest"/> along the way.
/// </summary>
/// <param name="acc">Accumulator state; 64 bytes are read and written through this pointer.</param>
/// <param name="input">Start of the data to process.</param>
/// <param name="dest">Optional copy destination (may be null). Must be able to hold <paramref name="length"/> bytes.</param>
/// <param name="length">Number of input bytes. NOTE(review): the final stripe is read from
/// <c>input + length - STRIPE_LEN</c>, so this presumably requires length &gt;= STRIPE_LEN - confirm against callers.</param>
/// <param name="secret">Secret key material; offsets up to SECRET_KEY_SIZE are read from it.</param>
/// <param name="isHash64">Forwarded unchanged to <c>Avx2Accumulate</c>; not otherwise used here.</param>
internal static unsafe void Avx2HashLongInternalLoop(ulong* acc, byte* input, byte* dest, long length, byte* secret, int isHash64)
{
    if (X86.Avx2.IsAvx2Supported)
    {
        // Whole blocks. (length - 1) keeps at least the final byte out of the block/stripe
        // loops because the last STRIPE_LEN bytes are always accumulated separately below.
        var nb_blocks = (length - 1) / BLOCK_LEN;

        // The index is a long so that n * BLOCK_LEN is computed in 64 bits; a 32-bit
        // product would wrap once the input reaches 2 GiB.
        for (long n = 0; n < nb_blocks; n++)
        {
            var blockOffset = n * BLOCK_LEN;
            Avx2Accumulate(acc, input + blockOffset, dest == null ? null : dest + blockOffset, secret, NB_ROUNDS, isHash64);
            Avx2ScrambleAcc(acc, secret + SECRET_KEY_SIZE - STRIPE_LEN);
        }

        // Full stripes of the last, partial block.
        var blocksLength = nb_blocks * BLOCK_LEN;
        var nbStripes = ((length - 1) - blocksLength) / STRIPE_LEN;
        Avx2Accumulate(acc, input + blocksLength, dest == null ? null : dest + blocksLength, secret, nbStripes, isHash64);

        // Final stripe: the last STRIPE_LEN bytes of the input. It may overlap bytes that were
        // already processed, so it is never stored to dest from here (dest argument is null).
        var p = input + length - STRIPE_LEN;
        Avx2Accumulate512(acc, p, null, secret + SECRET_KEY_SIZE - STRIPE_LEN - SECRET_LASTACC_START);

        if (dest != null)
        {
            // Copy whatever the stripe loops above did not store. This is 1..STRIPE_LEN bytes for
            // a positive length; using length % STRIPE_LEN here would yield 0 and drop the whole
            // last stripe whenever length is an exact multiple of STRIPE_LEN.
            var copied = blocksLength + nbStripes * STRIPE_LEN;
            var remaining = length - copied;
            if (remaining > 0)
            {
                UnsafeUtility.MemCpy(dest + copied, input + copied, remaining);
            }
        }
    }
}
/// <summary>
/// AVX2 scramble step: rewrites the 64 bytes at <paramref name="acc"/> in place, mixing them with
/// 64 bytes read from <paramref name="secret"/>.
/// </summary>
/// <param name="acc">Accumulator state, handled as two 256-bit banks.</param>
/// <param name="secret">Key material; two unaligned 256-bit loads are performed from this address.</param>
internal static unsafe void Avx2ScrambleAcc(ulong* acc, byte* secret)
{
    if (X86.Avx2.IsAvx2Supported)
    {
        var accBanks = (v256*) acc;
        var secretBanks = (v256*) secret;
        var primeLanes = X86.Avx.mm256_set1_epi32(unchecked((int) PRIME32_1));

        // Both 256-bit banks go through the same sequence, first bank 0 and then bank 1.
        for (var bank = 0; bank < 2; bank++)
        {
            var current = accBanks[bank];

            // acc ^ (acc >> 47), then xor with the matching secret bank.
            var folded = X86.Avx2.mm256_xor_si256(current, X86.Avx2.mm256_srli_epi64(current, 47));
            var keyed = X86.Avx2.mm256_xor_si256(folded, X86.Avx.mm256_loadu_si256(secretBanks + bank));

            // Move the upper 32 bits of every 64-bit lane into the lower position so that
            // mul_epu32 can multiply them as well.
            var keyedHigh = X86.Avx2.mm256_shuffle_epi32(keyed, X86.Sse.SHUFFLE(0, 3, 0, 1));

            // Two 32x32 products recombined as low + (high << 32).
            var lowProduct = X86.Avx2.mm256_mul_epu32(keyed, primeLanes);
            var highProduct = X86.Avx2.mm256_mul_epu32(keyedHigh, primeLanes);
            accBanks[bank] = X86.Avx2.mm256_add_epi64(lowProduct, X86.Avx2.mm256_slli_epi64(highProduct, 32));
        }
    }
}
/// <summary>
/// Runs <c>Avx2Accumulate512</c> over <paramref name="nbStripes"/> consecutive stripes of
/// <paramref name="input"/>, advancing the input (and <paramref name="dest"/>, when not null) by
/// STRIPE_LEN and the secret by SECRET_CONSUME_RATE for each stripe.
/// </summary>
/// <param name="acc">Accumulator state passed through to every stripe.</param>
/// <param name="input">Address of the first stripe.</param>
/// <param name="dest">Optional copy destination for the stripes; null disables the copy.</param>
/// <param name="secret">Key material for the first stripe.</param>
/// <param name="nbStripes">Number of stripes to process; zero or less does nothing.</param>
/// <param name="isHash64">Not used by this method.</param>
internal static unsafe void Avx2Accumulate(ulong* acc, byte* input, byte* dest, byte* secret, long nbStripes,
    int isHash64)
{
    if (X86.Avx2.IsAvx2Supported)
    {
        var stripe = 0;
        while (stripe < nbStripes)
        {
            var stripeOffset = stripe * STRIPE_LEN;

            // Only derive a destination address when the caller actually asked for a copy.
            byte* stripeDest = null;
            if (dest != null)
            {
                stripeDest = dest + stripeOffset;
            }

            Avx2Accumulate512(acc, input + stripeOffset, stripeDest, secret + stripe * SECRET_CONSUME_RATE);
            stripe++;
        }
    }
}
/// <summary>
/// Accumulates one 64-byte stripe: reads 64 bytes from <paramref name="input"/> and from
/// <paramref name="secret"/>, updates the 64 bytes at <paramref name="acc"/>, and, when
/// <paramref name="dest"/> is not null, stores the unmodified input bytes to it.
/// </summary>
/// <param name="acc">Accumulator state, handled as two 256-bit banks.</param>
/// <param name="input">Stripe to consume (unaligned loads).</param>
/// <param name="dest">Optional destination receiving a copy of the stripe; may be null.</param>
/// <param name="secret">Key material for this stripe (unaligned loads).</param>
internal static unsafe void Avx2Accumulate512(ulong* acc, byte* input, byte* dest, byte* secret)
{
    if (X86.Avx2.IsAvx2Supported)
    {
        var accBanks = (v256*) acc;
        var secretBanks = (v256*) secret;
        var inputBanks = (v256*) input;
        var destBanks = (v256*) dest;

        // Both 256-bit banks go through the same sequence, first bank 0 and then bank 1.
        for (var bank = 0; bank < 2; bank++)
        {
            var data = X86.Avx.mm256_loadu_si256(inputBanks + bank);
            var key = X86.Avx.mm256_loadu_si256(secretBanks + bank);
            var mixed = X86.Avx2.mm256_xor_si256(data, key);

            // Copy-through of the raw input, 32 bytes per bank.
            if (destBanks != null)
            {
                X86.Avx.mm256_storeu_si256(destBanks + bank, data);
            }

            // Upper 32 bits of each 64-bit lane moved down, so mul_epu32 yields low32 * high32.
            var mixedHigh = X86.Avx2.mm256_shuffle_epi32(mixed, X86.Sse.SHUFFLE(0, 3, 0, 1));
            var lanesProduct = X86.Avx2.mm256_mul_epu32(mixed, mixedHigh);

            // Input with adjacent 64-bit lanes swapped, added to the accumulator with the product.
            var swappedData = X86.Avx2.mm256_shuffle_epi32(data, X86.Sse.SHUFFLE(1, 0, 3, 2));
            var withData = X86.Avx2.mm256_add_epi64(accBanks[bank], swappedData);
            accBanks[bank] = X86.Avx2.mm256_add_epi64(lanesProduct, withData);
        }
    }
}
}
}