forked from BilalY/Rasagar
136 lines
5.6 KiB
C#
136 lines
5.6 KiB
C#
|
using Unity.Burst.Intrinsics;
|
||
|
using Unity.Collections.LowLevel.Unsafe;
|
||
|
|
||
|
namespace Unity.Collections
|
||
|
{
|
||
|
[GenerateTestsForBurstCompatibility]
|
||
|
public static partial class xxHash3
|
||
|
{
|
||
|
internal static unsafe void Avx2HashLongInternalLoop(ulong* acc, byte* input, byte* dest, long length, byte* secret, int isHash64)
|
||
|
{
|
||
|
if (X86.Avx2.IsAvx2Supported)
|
||
|
{
|
||
|
// Process packets of 512 bits
|
||
|
var nb_blocks = (length-1) / BLOCK_LEN;
|
||
|
for (int n = 0; n < nb_blocks; n++)
|
||
|
{
|
||
|
Avx2Accumulate(acc, input + n * BLOCK_LEN, dest == null ? null : dest + n * BLOCK_LEN, secret, NB_ROUNDS, isHash64);
|
||
|
Avx2ScrambleAcc(acc, secret + SECRET_KEY_SIZE - STRIPE_LEN);
|
||
|
}
|
||
|
|
||
|
var nbStripes = ((length-1) - (BLOCK_LEN * nb_blocks)) / STRIPE_LEN;
|
||
|
Avx2Accumulate(acc, input + nb_blocks * BLOCK_LEN, dest == null ? null : dest + nb_blocks * BLOCK_LEN, secret, nbStripes, isHash64);
|
||
|
|
||
|
var p = input + length - STRIPE_LEN;
|
||
|
Avx2Accumulate512(acc, p, null, secret + SECRET_KEY_SIZE - STRIPE_LEN - SECRET_LASTACC_START);
|
||
|
|
||
|
if (dest != null)
|
||
|
{
|
||
|
var remaining = length % STRIPE_LEN;
|
||
|
if (remaining != 0)
|
||
|
{
|
||
|
UnsafeUtility.MemCpy(dest + length - remaining, input + length - remaining, remaining);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
internal static unsafe void Avx2ScrambleAcc(ulong* acc, byte* secret)
|
||
|
{
|
||
|
if (X86.Avx2.IsAvx2Supported)
|
||
|
{
|
||
|
var xAcc = (v256*) acc;
|
||
|
var xSecret = (v256*) secret;
|
||
|
var prime32 = X86.Avx.mm256_set1_epi32(unchecked((int) PRIME32_1));
|
||
|
|
||
|
// First bank
|
||
|
var acc_vec = xAcc[0];
|
||
|
var shifted = X86.Avx2.mm256_srli_epi64(acc_vec, 47);
|
||
|
var data_vec = X86.Avx2.mm256_xor_si256(acc_vec, shifted);
|
||
|
|
||
|
var key_vec = X86.Avx.mm256_loadu_si256(xSecret + 0);
|
||
|
var data_key = X86.Avx2.mm256_xor_si256(data_vec, key_vec);
|
||
|
|
||
|
var data_key_hi = X86.Avx2.mm256_shuffle_epi32(data_key, X86.Sse.SHUFFLE(0, 3, 0, 1));
|
||
|
var prod_lo = X86.Avx2.mm256_mul_epu32(data_key, prime32);
|
||
|
var prod_hi = X86.Avx2.mm256_mul_epu32(data_key_hi, prime32);
|
||
|
|
||
|
xAcc[0] = X86.Avx2.mm256_add_epi64(prod_lo, X86.Avx2.mm256_slli_epi64(prod_hi, 32));
|
||
|
|
||
|
// Second bank
|
||
|
acc_vec = xAcc[1];
|
||
|
shifted = X86.Avx2.mm256_srli_epi64(acc_vec, 47);
|
||
|
data_vec = X86.Avx2.mm256_xor_si256(acc_vec, shifted);
|
||
|
|
||
|
key_vec = X86.Avx.mm256_loadu_si256(xSecret + 1);
|
||
|
data_key = X86.Avx2.mm256_xor_si256(data_vec, key_vec);
|
||
|
|
||
|
data_key_hi = X86.Avx2.mm256_shuffle_epi32(data_key, X86.Sse.SHUFFLE(0, 3, 0, 1));
|
||
|
prod_lo = X86.Avx2.mm256_mul_epu32(data_key, prime32);
|
||
|
prod_hi = X86.Avx2.mm256_mul_epu32(data_key_hi, prime32);
|
||
|
|
||
|
xAcc[1] = X86.Avx2.mm256_add_epi64(prod_lo, X86.Avx2.mm256_slli_epi64(prod_hi, 32));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
internal static unsafe void Avx2Accumulate(ulong* acc, byte* input, byte* dest, byte* secret, long nbStripes,
|
||
|
int isHash64)
|
||
|
{
|
||
|
if (X86.Avx2.IsAvx2Supported)
|
||
|
{
|
||
|
for (var n = 0; n < nbStripes; n++)
|
||
|
{
|
||
|
var xInput = input + n * STRIPE_LEN;
|
||
|
Avx2Accumulate512(acc, xInput, dest == null ? null : dest + n * STRIPE_LEN,
|
||
|
secret + n * SECRET_CONSUME_RATE);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
internal static unsafe void Avx2Accumulate512(ulong* acc, byte* input, byte* dest, byte* secret)
|
||
|
{
|
||
|
if (X86.Avx2.IsAvx2Supported)
|
||
|
{
|
||
|
var xAcc = (v256*) acc;
|
||
|
var xSecret = (v256*) secret;
|
||
|
var xInput = (v256*) input;
|
||
|
|
||
|
// First bank
|
||
|
var data_vec = X86.Avx.mm256_loadu_si256(xInput + 0);
|
||
|
var key_vec = X86.Avx.mm256_loadu_si256(xSecret + 0);
|
||
|
var data_key = X86.Avx2.mm256_xor_si256(data_vec, key_vec);
|
||
|
|
||
|
if (dest != null)
|
||
|
{
|
||
|
X86.Avx.mm256_storeu_si256(dest, data_vec);
|
||
|
}
|
||
|
|
||
|
var data_key_lo = X86.Avx2.mm256_shuffle_epi32(data_key, X86.Sse.SHUFFLE(0, 3, 0, 1));
|
||
|
var product = X86.Avx2.mm256_mul_epu32(data_key, data_key_lo);
|
||
|
var data_swap= X86.Avx2.mm256_shuffle_epi32(data_vec, X86.Sse.SHUFFLE(1, 0, 3, 2));
|
||
|
var sum= X86.Avx2.mm256_add_epi64(xAcc[0], data_swap);
|
||
|
|
||
|
xAcc[0] = X86.Avx2.mm256_add_epi64(product, sum);
|
||
|
|
||
|
// Second bank
|
||
|
data_vec = X86.Avx.mm256_loadu_si256(xInput + 1);
|
||
|
key_vec = X86.Avx.mm256_loadu_si256(xSecret + 1);
|
||
|
data_key = X86.Avx2.mm256_xor_si256(data_vec, key_vec);
|
||
|
|
||
|
if (dest != null)
|
||
|
{
|
||
|
X86.Avx.mm256_storeu_si256(dest + 32, data_vec);
|
||
|
}
|
||
|
|
||
|
data_key_lo = X86.Avx2.mm256_shuffle_epi32(data_key, X86.Sse.SHUFFLE(0, 3, 0, 1));
|
||
|
product = X86.Avx2.mm256_mul_epu32(data_key, data_key_lo);
|
||
|
data_swap = X86.Avx2.mm256_shuffle_epi32(data_vec, X86.Sse.SHUFFLE(1, 0, 3, 2));
|
||
|
sum = X86.Avx2.mm256_add_epi64(xAcc[1], data_swap);
|
||
|
|
||
|
xAcc[1] = X86.Avx2.mm256_add_epi64(product, sum);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|