Caution: This page is about SHA512 instructions of Intel x86/x64 processor extension. They do not work on processors that do not support this feature.
The original specification of SHA512 is FIPS 180-4, issued by a government agency of the United States.
The following is the algorithm to calculate the hash value from the pre-processed 128-BYTE (16-QWORD) message blocks, defined on pages 24 and 25 of FIPS 180-4.


VSHA512MSG1 and VSHA512MSG2 instructions help the calculation of A above.
VSHA512RNDS2 instruction helps the processing of B above.
The expression of A is the calculation of QWORD values to be appended to the pre-processed 16-QWORD message block, to generate the 80-QWORD W array.
To calculate the SHA hash on x86/x64, the byte order within each QWORD has to be swapped, because SHA calculations are based on big-endian. SHA512 instructions do not do the swapping automatically.
The expression of A specifies to add W[t-16], W[t-15], W[t-7] and W[t-2] and set the sum to W[t]. Before the addition, W[t-15] and W[t-2] have to be converted by the sigma functions defined in FIPS 180-4.

SHA512 instructions make it possible to do this, four elements at a time.
VSHA512MSG1 instruction does the sigma conversion and addition of 1 in the illustration above.
VSHA512MSG2 instruction does the sigma conversion and addition of 3.
Ordinary SIMD VPADDQ instruction can do the addition of 2.
Does the sigma and addition of 1, and returns the result in (3).
Does the sigma and addition of 3, and returns the result in (3).
Inputs: (1) = result of 2. (2) = the elements of the W array.
B loops through all 80 elements of the W array to update eight state variables a, b, c, d, e, f, g and h.
Each execution of the VSHA512RNDS2 instruction processes 2 rounds of the loop.
Does 2 rounds of B loop to calculate updated state variables.
Inputs: (1) = the state variables c, d, g, h. (2) = the state variables a, b, e, f. (3) = elements of the W array + elements of the K array.
Output: (4) = the state variables a', b', e', f' updated after 2 rounds of the loop.
The K array is an 80-QWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to the corresponding element of the W array to make the input data (3).
The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data (2) (the state variables a, b, e, f before the 2 rounds).
#pragma once
#include <intrin.h>
// Incremental SHA-512 hasher built on the x86/x64 SHA512 instruction extension.
//
// Usage: construct, call Update() any number of times with consecutive pieces
// of the message, then call Final() once to obtain the digest.
//
// NOTE(review): the class holds __m256i members, so it is presumably
// over-aligned (32 bytes); confirm that heap allocations of this type are
// suitably aligned with the toolchain/language version in use.
class SHA512H
{
protected:
// Message block
// Size of one SHA-512 message block in bytes.
static const size_t MBYTES = 128;
static const size_t LENGTH_LENGTH = 16; // length (in byte) of the total length (in bit)
// Holds the bytes of a not-yet-complete message block between Update() calls.
unsigned char msgbuf[MBYTES];
size_t msgbuf_count; // length (in byte) of the data currently in the message block
unsigned __int64 total_count; // total length (in byte) of the message
// Intermediate hash
// The eight 64-bit hash words are kept in two 256-bit values; the words are
// listed from the most significant QWORD to the least significant QWORD.
__m256i h0145; // h0:h1:h4:h5
__m256i h2367; // h2:h3:h6:h7
public:
// Constructs a hasher ready to accept message data.
SHA512H() { Initialize(); }
~SHA512H() {}
// Appends `length` bytes starting at `buf` to the message being hashed.
void Update(const void* buf, size_t length);
// Completes the hash computation and writes the result to `digest`.
void Final(void* digest);
protected:
// Sets the intermediate hash to its initial value and clears the counters.
void Initialize();
// Folds one MBYTES-byte message block at `msg` into the intermediate hash.
void ProcessMsgBlock(const unsigned char* msg);
};
#include <memory.h> #include "SHA512H.h" // K Array (see FIPS 180-4 4.2.3) static const union { unsigned __int64 qw[80]; __m256i y[20]; } K = { 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; // Initial 
hash value (see FIPS 180-4 5.3.5) #define H0 0x6a09e667f3bcc908ULL #define H1 0xbb67ae8584caa73bULL #define H2 0x3c6ef372fe94f82bULL #define H3 0xa54ff53a5f1d36f1ULL #define H4 0x510e527fade682d1ULL #define H5 0x9b05688c2b3e6c1fULL #define H6 0x1f83d9abfb41bd6bULL #define H7 0x5be0cd19137e2179ULL void SHA512H::Initialize() { h0145 = _mm256_set_epi64x(H0, H1, H4, H5); h2367 = _mm256_set_epi64x(H2, H3, H6, H7); msgbuf_count = 0; total_count = 0; } void SHA512H::Update(const void* buf, size_t length) { const unsigned char* p = (const unsigned char*)buf; total_count += length; // If any bytes are left in the message buffer, // fullfill the block first if (msgbuf_count) { size_t c = MBYTES - msgbuf_count; if (length < c) { memcpy(msgbuf + msgbuf_count, p, length); msgbuf_count += length; return; } else { memcpy(msgbuf + msgbuf_count, p, c); p += c; length -= c; ProcessMsgBlock(msgbuf); msgbuf_count = 0; } } // When we reach here, we have no data left in the message buffer while (length >= MBYTES) { // No need to copy into the internal message block ProcessMsgBlock(p); p += MBYTES; length -= MBYTES; } // Leave the remaining bytes in the message buffer if (length) { memcpy(msgbuf, p, length); msgbuf_count = length; } } void SHA512H::Final(void* digest) { // Add the terminating bit msgbuf[msgbuf_count++] = 0x80; // Need to set total length in the last LENGTH_LENGTH-byte of the block. 
// If there is no room for the length, process this block first if (msgbuf_count + LENGTH_LENGTH > MBYTES) { // Fill zeros and process memset(msgbuf + msgbuf_count, 0, MBYTES - msgbuf_count); ProcessMsgBlock(msgbuf); msgbuf_count = 0; } // Fill zeros before the last LENGTH_LENGTH-byte of the block memset(msgbuf + msgbuf_count, 0, MBYTES - LENGTH_LENGTH - msgbuf_count); // Set the length of the message in big-endian __m128i tmp = _mm_loadl_epi64((__m128i*) & total_count); // convert # of bytes to # of bits __m128i tmpL = _mm_slli_epi64(tmp, 3); // lower 64-bit __m128i tmpH = _mm_srli_epi64(tmp, 64 - 3);// upper 64-bit tmp = _mm_unpacklo_epi64(tmpL, tmpH); const __m128i total_count_byteswapindex = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); tmp = _mm_shuffle_epi8(tmp, total_count_byteswapindex); // convert to big endian _mm_storeu_si128((__m128i*)(msgbuf + MBYTES - LENGTH_LENGTH), tmp); // Process the last block ProcessMsgBlock(msgbuf); // Get the resulting hash value. 
// h0:h1 : h4:h5 // h2:h3 : h6:h7 // | // V // h2:h3 : h0:h1 // h6:h7 : h4:h5 __m256i h2301 = _mm256_permute2x128_si256(h2367, h0145, 0x13); __m256i h6745 = _mm256_permute2x128_si256(h2367, h0145, 0x02); // Swap the byte order in each lane const __m256i byteswapindex = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m256i h3210reverse = _mm256_shuffle_epi8(h2301, byteswapindex); __m256i h7654reverse = _mm256_shuffle_epi8(h6745, byteswapindex); __m256i* digestY = (__m256i*)digest; _mm256_storeu_si256(digestY, h3210reverse); _mm256_storeu_si256(digestY + 1, h7654reverse); } void SHA512H::ProcessMsgBlock(const unsigned char* msg) { // Cyclic W array // We keep the W array content cyclically in 4 variables // Initially: // cw0 = w3 : w2 : w1 : w0 // cw1 = w7 : w6 : w5 : w4 // cw2 = w11 : w10 : w9 : w8 // cw3 = w15 : w14 : w13 : w12 const __m256i byteswapindex = _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); const __m256i* msgy = (const __m256i*)msg; __m256i cw0 = _mm256_shuffle_epi8(_mm256_loadu_si256(msgy), byteswapindex); __m256i cw1 = _mm256_shuffle_epi8(_mm256_loadu_si256(msgy + 1), byteswapindex); __m256i cw2 = _mm256_shuffle_epi8(_mm256_loadu_si256(msgy + 2), byteswapindex); __m256i cw3 = _mm256_shuffle_epi8(_mm256_loadu_si256(msgy + 3), byteswapindex); // Advance W array cycle // Inputs: // CW0 = w[t-13] : w[t-14] : w[t-15] : w[t-16] // CW1 = w[t-9] : w[t-10] : w[t-11] : w[t-12] // CW2 = w[t-5] : w[t-6] : w[t-7] : w[t-8] // CW3 = w[t-1] : w[t-2] : w[t-3] : w[t-4] // Outputs: // CW1 = w[t-9] : w[t-10] : w[t-11] : w[t-12] // CW2 = w[t-5] : w[t-6] : w[t-7] : w[t-8] // CW3 = w[t-1] : w[t-2] : w[t-3] : w[t-4] // CW0 = w[t+3] : w[t+2] : w[t+1] : w[t] __m256i temp_c; #define CYCLE_W(CW0, CW1, CW2, CW3) \ CW0 = _mm256_sha512msg1_epi64(CW0, _mm256_extracti128_si256(CW1, 0)); \ temp_c = _mm256_blend_epi32(CW2, CW3, 
0x03); /* w[t-5]:w[t-6]:w[t-7]:w[t-4] */ \ temp_c = _mm256_permute4x64_epi64(temp_c, 0x39); /* 00 11 10 01b */ \ CW0 = _mm256_add_epi64(CW0, temp_c); /* add w[t-4]:w[t-5]:w[t-6]:w[t-7] */ \ CW0 = _mm256_sha512msg2_epi64(CW0, CW3); __m256i state1 = h0145; // a:b:e:f __m256i state2 = h2367; // c:d:g:h __m256i tmp; /* w0 - w3 */ #define SHA512_ROUNDS_4(cwN, n) \ tmp = _mm256_add_epi64(cwN, K.y[n]); /* w3+K3 : w2+K2 : w1+K1 : w0+K0 */ \ state2 = _mm256_sha512rnds2_epi64(state2, state1, _mm256_extracti128_si256(tmp, 0));\ state1 = _mm256_sha512rnds2_epi64(state1, state2, _mm256_extracti128_si256(tmp, 1)); /* w0 - w3 */ SHA512_ROUNDS_4(cw0, 0); /* w4 - w7 */ SHA512_ROUNDS_4(cw1, 1); /* w8 - w11 */ SHA512_ROUNDS_4(cw2, 2); /* w12 - w15 */ SHA512_ROUNDS_4(cw3, 3); /* w16 - w19 */ CYCLE_W(cw0, cw1, cw2, cw3); /* cw0 = w19 : w18 : w17 : w16 */ SHA512_ROUNDS_4(cw0, 4); /* w20 - w23 */ CYCLE_W(cw1, cw2, cw3, cw0); /* cw1 = w23 : w22 : w21 : w20 */ SHA512_ROUNDS_4(cw1, 5); /* w24 - w27 */ CYCLE_W(cw2, cw3, cw0, cw1); /* cw2 = w27 : w26 : w25 : w24 */ SHA512_ROUNDS_4(cw2, 6); /* w28 - w31 */ CYCLE_W(cw3, cw0, cw1, cw2); /* cw3 = w31 : w30 : w29 : w28 */ SHA512_ROUNDS_4(cw3, 7); /* w32 - w35 */ CYCLE_W(cw0, cw1, cw2, cw3); /* cw0 = w35 : w34 : w33 : w32 */ SHA512_ROUNDS_4(cw0, 8); /* w36 - w39 */ CYCLE_W(cw1, cw2, cw3, cw0); /* cw1 = w39 : w38 : w37 : w36 */ SHA512_ROUNDS_4(cw1, 9); /* w40 - w43 */ CYCLE_W(cw2, cw3, cw0, cw1); /* cw2 = w43 : w42 : w41 : w40 */ SHA512_ROUNDS_4(cw2, 10); /* w44 - w47 */ CYCLE_W(cw3, cw0, cw1, cw2); /* cw3 = w47 : w46 : w45 : w44 */ SHA512_ROUNDS_4(cw3, 11); /* w48 - w51 */ CYCLE_W(cw0, cw1, cw2, cw3); /* cw0 = w51 : w50 : w49 : w48 */ SHA512_ROUNDS_4(cw0, 12); /* w52 - w55 */ CYCLE_W(cw1, cw2, cw3, cw0); /* cw1 = w55 : w54 : w53 : w52 */ SHA512_ROUNDS_4(cw1, 13); /* w56 - w59 */ CYCLE_W(cw2, cw3, cw0, cw1); /* cw2 = w59 : w58 : w57 : w56 */ SHA512_ROUNDS_4(cw2, 14); /* w60 - w63 */ CYCLE_W(cw3, cw0, cw1, cw2); /* cw3 = w63 : w62 : w61 : w60 */ 
SHA512_ROUNDS_4(cw3, 15); /* w64 - w67 */ CYCLE_W(cw0, cw1, cw2, cw3); /* cw0 = w67 : w66 : w65 : w64 */ SHA512_ROUNDS_4(cw0, 16); /* w68 - w71 */ CYCLE_W(cw1, cw2, cw3, cw0); /* cw1 = w71 : w70 : w69 : w68 */ SHA512_ROUNDS_4(cw1, 17); /* w72 - w75 */ CYCLE_W(cw2, cw3, cw0, cw1); /* cw2 = w75 : w74 : w73 : w72 */ SHA512_ROUNDS_4(cw2, 18); /* w76 - w79 */ CYCLE_W(cw3, cw0, cw1, cw2); /* cw3 = w79 : w78 : w77 : w76 */ SHA512_ROUNDS_4(cw3, 19); // Add to the intermediate hash h0145 = _mm256_add_epi64(state1, h0145); h2367 = _mm256_add_epi64(state2, h2367); }