/*
 * sha2-asm 0.6.4
 *
 * Assembly implementation of SHA-2 compression functions.
 * Documentation
 */
/* 
 * SHA-256 hash in x86-64 assembly
 * 
 * Copyright (c) 2015 Project Nayuki. (MIT License)
 * https://www.nayuki.io/page/fast-sha2-hashes-in-x86-assembly
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 * - The above copyright notice and this permission notice shall be included in
 *   all copies or substantial portions of the Software.
 * - The Software is provided "as is", without warranty of any kind, express or
 *   implied, including but not limited to the warranties of merchantability,
 *   fitness for a particular purpose and noninfringement. In no event shall the
 *   authors or copyright holders be liable for any claim, damages or other
 *   liability, whether in an action of contract, tort or otherwise, arising from,
 *   out of or in connection with the Software or the use or other dealings in the
 *   Software.
 */


/*
 * void sha256_compress(uint32_t state[8], const uint8_t block[64])
 *
 * Runs the SHA-256 compression function over one 64-byte block and adds the
 * result into state[0..7] in place.
 *
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * ABI:    System V AMD64 -- state pointer in rdi, block pointer in rsi.
 * Saved:  rbx and r10-r15 are parked in xmm0-xmm6 and restored before return.
 * Clobb:  rax, rcx, rdx, r8, r9, xmm0-xmm6, flags.
 * Stack:  allocates 64 bytes of scratch on the stack; makes no calls.
 */
.text
.p2align 4
#ifdef __APPLE__
.globl _sha256_compress
_sha256_compress:
#else
.globl sha256_compress
#ifdef __ELF__
/* Give the symbol function type so ELF tools do not see it as NOTYPE */
.type sha256_compress, @function
#endif
sha256_compress:
#endif
    /*
     * Storage usage:
     *   Bytes  Location  Description
     *       4  eax       Temporary for calculation per round
     *       4  ebx       Temporary for calculation per round
     *       4  ecx       Temporary for calculation per round
     *       4  edx       Temporary for calculation per round
     *       8  rsi       Base address of block array argument (read-only)
     *       8  rdi       Base address of state array argument (the pointer is
     *                    read-only; the array it points to is updated at the end)
     *       8  rsp       x86-64 stack pointer
     *       4  r8d       SHA-256 state variable A
     *       4  r9d       SHA-256 state variable B
     *       4  r10d      SHA-256 state variable C
     *       4  r11d      SHA-256 state variable D
     *       4  r12d      SHA-256 state variable E
     *       4  r13d      SHA-256 state variable F
     *       4  r14d      SHA-256 state variable G
     *       4  r15d      SHA-256 state variable H
     *      64  [rsp+0]   Circular buffer of most recent 16 key schedule items, 4 bytes each
     *      16  xmm0      Caller's value of r10 (only low 64 bits are used)
     *      16  xmm1      Caller's value of r11 (only low 64 bits are used)
     *      16  xmm2      Caller's value of r12 (only low 64 bits are used)
     *      16  xmm3      Caller's value of r13 (only low 64 bits are used)
     *      16  xmm4      Caller's value of r14 (only low 64 bits are used)
     *      16  xmm5      Caller's value of r15 (only low 64 bits are used)
     *      16  xmm6      Caller's value of rbx (only low 64 bits are used)
     *
     * The A..H assignment above is the one in effect at round 0 and again after
     * round 63; in between, the round macros are invoked with the register list
     * rotated by one position per round.
     */
    
    /*
     * SCHED(i): memory operand for slot (i mod 16) of the key-schedule ring
     * buffer at [rsp+0]. The buffer holds 16 entries of 4 bytes each; the index
     * is masked with 0xF, so SCHED(i-16), SCHED(i-15), SCHED(i-7) and SCHED(i-2)
     * address the words stored that many rounds earlier.
     */
    #define SCHED(i)  (((i)&0xF)*4)(%rsp)
    
    /*
     * ROUNDa(i, a..h, k): one round whose schedule word comes straight from the
     * input block (invoked for i = 0..15).
     *   - loads the 32-bit word at byte offset i*4 from the block (rsi),
     *   - byte-swaps it (SHA-256 message words are big-endian),
     *   - stores it in schedule slot i and leaves it in ebx for ROUNDTAIL.
     * a..h are 32-bit register names without the '%' prefix; k is the round
     * constant. Clobbers ebx plus everything ROUNDTAIL clobbers.
     */
    #define ROUNDa(i, a, b, c, d, e, f, g, h, k)  \
        movl    (i*4)(%rsi), %ebx;  \
        bswapl  %ebx;               \
        movl    %ebx, SCHED(i);     \
        ROUNDTAIL(a, b, c, d, e, f, g, h, k)
    
    /*
     * ROUNDb(i, a..h, k): one round whose schedule word is derived from earlier
     * schedule words (invoked for i = 16..63):
     *
     *   w[i] = w[i-16] + w[i-7] + s0(w[i-15]) + s1(w[i-2])
     *   s0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
     *   s1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
     *
     * The result is written to schedule slot i (overwriting w[i-16], which
     * shares that slot) and left in ebx for ROUNDTAIL.
     * Clobbers eax, ebx, ecx, edx, flags, plus everything ROUNDTAIL clobbers.
     */
    #define ROUNDb(i, a, b, c, d, e, f, g, h, k)  \
        /* eax = w[i-15]; ebx = w[i-16] + w[i-7] */ \
        movl  SCHED(i-15), %eax;  \
        movl  SCHED(i-16), %ebx;  \
        addl  SCHED(i- 7), %ebx;  \
        /* eax = s0(w[i-15]), using ecx and edx as copies; ebx += eax */ \
        movl  %eax, %ecx;         \
        movl  %eax, %edx;         \
        rorl  $18, %ecx;          \
        shrl  $3, %edx;           \
        rorl  $7, %eax;           \
        xorl  %edx, %ecx;         \
        xorl  %ecx, %eax;         \
        addl  %eax, %ebx;         \
        /* eax = s1(w[i-2]), same pattern; ebx += eax */ \
        movl  SCHED(i- 2), %eax;  \
        movl  %eax, %ecx;         \
        movl  %eax, %edx;         \
        rorl  $19, %ecx;          \
        shrl  $10, %edx;          \
        rorl  $17, %eax;          \
        xorl  %edx, %ecx;         \
        xorl  %ecx, %eax;         \
        addl  %eax, %ebx;         \
        /* Store w[i] into the ring buffer; ebx still holds it for ROUNDTAIL */ \
        movl  %ebx, SCHED(i);     \
        ROUNDTAIL(a, b, c, d, e, f, g, h, k)
    
    /*
     * ROUNDTAIL(a..h, k): the state update shared by ROUNDa and ROUNDb.
     * Expects the current schedule word in ebx and computes:
     *
     *   h += w + S1(e) + Ch(e,f,g) + k        (Part 0)
     *   d += h                                (Part 1)
     *   h += S0(a) + Maj(a,b,c)               (Part 2)
     *
     *   S1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25)
     *   S0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22)
     *   Ch    = ((f ^ g) & e) ^ g
     *   Maj   = ((c | b) & a) | (c & b)
     *
     * Only d and h are written; no values are moved between state registers.
     * The callers instead pass the register list rotated by one position on
     * each successive round.
     * Clobbers eax, ecx, flags. Reads ebx without modifying it.
     */
    #define ROUNDTAIL(a, b, c, d, e, f, g, h, k)  \
        /* Part 0 */               \
        /* See Intel's "Fast SHA-256 Implementations" for the ROR transformation */ \
        /* eax = ror(ror(ror(e,14) ^ e, 5) ^ e, 6) = S1(e), using one register */ \
        movl  %e, %eax;            \
        rorl  $14, %eax;            \
        xorl  %e, %eax;            \
        rorl  $5, %eax;            \
        xorl  %e, %eax;            \
        rorl  $6, %eax;            \
        addl  %ebx, %h;            \
        /* ecx = Ch(e,f,g) */      \
        movl  %g, %ecx;            \
        xorl  %f, %ecx;            \
        andl  %e, %ecx;            \
        xorl  %g, %ecx;            \
        /* eax = S1 + Ch + k in one lea; k is the 32-bit displacement */ \
        leal  k(%rax,%rcx), %eax;  \
        addl  %eax, %h;            \
        /* Part 1 */               \
        addl  %h, %d;              \
        /* Part 2 */               \
        /* See Intel's "Fast SHA-256 Implementations" for the ROR transformation */ \
        /* eax = ror(ror(ror(a,9) ^ a, 11) ^ a, 2) = S0(a) */ \
        movl  %a, %eax;            \
        rorl  $9, %eax;            \
        xorl  %a, %eax;            \
        rorl  $11, %eax;           \
        xorl  %a, %eax;            \
        rorl  $2, %eax;            \
        movl  %c, %ecx;            \
        addl  %eax, %h;            \
        /* eax = (c | b) & a, ecx = c & b; their OR is Maj(a,b,c) */ \
        movl  %c, %eax;            \
        orl   %b, %eax;            \
        andl  %b, %ecx;            \
        andl  %a, %eax;            \
        orl   %ecx, %eax;          \
        addl  %eax, %h;
    
    /*
     * Save registers, allocate scratch space.
     * The general-purpose registers are parked in xmm0-xmm6 instead of being
     * pushed; the 64 bytes reserved on the stack hold the key-schedule ring
     * buffer addressed through SCHED().
     * r10 and r11 are caller-saved under the System V AMD64 ABI, so preserving
     * them is more than the ABI requires; it is kept so that the routine's
     * register contract stays unchanged.
     */
    movq  %r10, %xmm0
    movq  %r11, %xmm1
    movq  %r12, %xmm2
    movq  %r13, %xmm3
    movq  %r14, %xmm4
    movq  %r15, %xmm5
    movq  %rbx, %xmm6
    subq  $64, %rsp
    
    /* Load state: the eight 32-bit words of state[] become working variables a..h */
    movl   0(%rdi), %r8d   /* a */
    movl   4(%rdi), %r9d   /* b */
    movl   8(%rdi), %r10d  /* c */
    movl  12(%rdi), %r11d  /* d */
    movl  16(%rdi), %r12d  /* e */
    movl  20(%rdi), %r13d  /* f */
    movl  24(%rdi), %r14d  /* g */
    movl  28(%rdi), %r15d  /* h */
    
    /*
     * Do 64 rounds of hashing.
     * Rounds 0..15 take their schedule word from the block (ROUNDa); rounds
     * 16..63 derive it from the ring buffer (ROUNDb).
     * The register list is rotated right by one position on every line, so the
     * register updated as "h" in one round is passed as "a" in the next and no
     * register-to-register moves are needed. After 64 rounds (a multiple of 8)
     * the mapping is back to a=r8d .. h=r15d.
     * The last argument is the SHA-256 round constant, written as a signed
     * 32-bit value (negative when bit 31 is set) because it is used as the
     * displacement of a leal inside ROUNDTAIL.
     */
    ROUNDa( 0, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d,  0x428A2F98)
    ROUNDa( 1, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d,  0x71374491)
    ROUNDa( 2, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d, -0x4A3F0431)
    ROUNDa( 3, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d, -0x164A245B)
    ROUNDa( 4, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d,  0x3956C25B)
    ROUNDa( 5, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d,  0x59F111F1)
    ROUNDa( 6, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d , -0x6DC07D5C)
    ROUNDa( 7, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d , -0x54E3A12B)
    ROUNDa( 8, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d, -0x27F85568)
    ROUNDa( 9, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d,  0x12835B01)
    ROUNDa(10, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d,  0x243185BE)
    ROUNDa(11, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d,  0x550C7DC3)
    ROUNDa(12, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d,  0x72BE5D74)
    ROUNDa(13, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d, -0x7F214E02)
    ROUNDa(14, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d , -0x6423F959)
    ROUNDa(15, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d , -0x3E640E8C)
    ROUNDb(16, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d, -0x1B64963F)
    ROUNDb(17, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d, -0x1041B87A)
    ROUNDb(18, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d,  0x0FC19DC6)
    ROUNDb(19, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d,  0x240CA1CC)
    ROUNDb(20, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d,  0x2DE92C6F)
    ROUNDb(21, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d,  0x4A7484AA)
    ROUNDb(22, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d ,  0x5CB0A9DC)
    ROUNDb(23, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d ,  0x76F988DA)
    ROUNDb(24, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d, -0x67C1AEAE)
    ROUNDb(25, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d, -0x57CE3993)
    ROUNDb(26, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d, -0x4FFCD838)
    ROUNDb(27, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d, -0x40A68039)
    ROUNDb(28, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d, -0x391FF40D)
    ROUNDb(29, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d, -0x2A586EB9)
    ROUNDb(30, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d ,  0x06CA6351)
    ROUNDb(31, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d ,  0x14292967)
    ROUNDb(32, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d,  0x27B70A85)
    ROUNDb(33, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d,  0x2E1B2138)
    ROUNDb(34, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d,  0x4D2C6DFC)
    ROUNDb(35, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d,  0x53380D13)
    ROUNDb(36, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d,  0x650A7354)
    ROUNDb(37, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d,  0x766A0ABB)
    ROUNDb(38, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d , -0x7E3D36D2)
    ROUNDb(39, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d , -0x6D8DD37B)
    ROUNDb(40, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d, -0x5D40175F)
    ROUNDb(41, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d, -0x57E599B5)
    ROUNDb(42, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d, -0x3DB47490)
    ROUNDb(43, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d, -0x3893AE5D)
    ROUNDb(44, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d, -0x2E6D17E7)
    ROUNDb(45, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d, -0x2966F9DC)
    ROUNDb(46, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d , -0x0BF1CA7B)
    ROUNDb(47, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d ,  0x106AA070)
    ROUNDb(48, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d,  0x19A4C116)
    ROUNDb(49, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d,  0x1E376C08)
    ROUNDb(50, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d,  0x2748774C)
    ROUNDb(51, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d,  0x34B0BCB5)
    ROUNDb(52, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d,  0x391C0CB3)
    ROUNDb(53, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d,  0x4ED8AA4A)
    ROUNDb(54, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d ,  0x5B9CCA4F)
    ROUNDb(55, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d ,  0x682E6FF3)
    ROUNDb(56, r8d , r9d , r10d, r11d, r12d, r13d, r14d, r15d,  0x748F82EE)
    ROUNDb(57, r15d, r8d , r9d , r10d, r11d, r12d, r13d, r14d,  0x78A5636F)
    ROUNDb(58, r14d, r15d, r8d , r9d , r10d, r11d, r12d, r13d, -0x7B3787EC)
    ROUNDb(59, r13d, r14d, r15d, r8d , r9d , r10d, r11d, r12d, -0x7338FDF8)
    ROUNDb(60, r12d, r13d, r14d, r15d, r8d , r9d , r10d, r11d, -0x6F410006)
    ROUNDb(61, r11d, r12d, r13d, r14d, r15d, r8d , r9d , r10d, -0x5BAF9315)
    ROUNDb(62, r10d, r11d, r12d, r13d, r14d, r15d, r8d , r9d , -0x41065C09)
    ROUNDb(63, r9d , r10d, r11d, r12d, r13d, r14d, r15d, r8d , -0x398E870E)
    
    /* Add to state: state[j] += working variable j, updating the array in place */
    addl  %r8d ,  0(%rdi)
    addl  %r9d ,  4(%rdi)
    addl  %r10d,  8(%rdi)
    addl  %r11d, 12(%rdi)
    addl  %r12d, 16(%rdi)
    addl  %r13d, 20(%rdi)
    addl  %r14d, 24(%rdi)
    addl  %r15d, 28(%rdi)
    
    /* Restore registers from xmm0-xmm6 and release the 64-byte scratch area */
    movq  %xmm0, %r10
    movq  %xmm1, %r11
    movq  %xmm2, %r12
    movq  %xmm3, %r13
    movq  %xmm4, %r14
    movq  %xmm5, %r15
    movq  %xmm6, %rbx
    addq  $64, %rsp
    retq

#ifdef __ELF__
/* Record the routine's size in the ELF symbol table (the ELF symbol has no leading underscore) */
.size sha256_compress, .-sha256_compress
/* Empty marker section: tells the linker this object does not need an executable stack */
.section .note.GNU-stack,"",@progbits
#endif