ams: support building unit test programs on windows/linux/macos

2022-03-06 12:08:20 -08:00
parent 9a38be201a
commit 64a97576d0
756 changed files with 33359 additions and 9372 deletions
--- a/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp
@@ -14,15 +14,111 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <vapours.hpp>
+#if defined(ATMOSPHERE_IS_STRATOSPHERE)
+#include <arm_neon.h>
+#endif

 namespace ams::crypto::impl {

-#ifdef ATMOSPHERE_IS_STRATOSPHERE
+#if defined(ATMOSPHERE_IS_STRATOSPHERE)
+

    namespace {

-        constexpr bool IsSupportedKeySize(size_t size) {
-            return size == 16 || size == 24 || size == 32;
+        /* Helper macros to setup for inline AES asm */
+        #define AES_ENC_DEC_SETUP_VARS()                                                       \
+            const auto *ctx = reinterpret_cast<const RoundKeyHelper<KeySize> *>(m_round_keys); \
+            static_assert(sizeof(*ctx) == sizeof(m_round_keys));                               \
+                                                                                               \
+            uint8x16_t tmp = vld1q_u8((const uint8_t *)src);                                   \
+            uint8x16_t tmp2
+
+        #define AES_ENC_DEC_OUTPUT_VARS() \
+            [tmp]"+w"(tmp), [tmp2]"=w"(tmp2)
+
+        #define AES_ENC_DEC_STORE_RESULT() \
+            vst1q_u8((uint8_t *)dst, tmp)
+
+        /* Helper macros to do AES encryption, via inline asm. */
+        #define AES_ENC_ROUND(n)                  \
+            "ldr %q[tmp2], %[round_key_" #n "]\n" \
+            "aese %[tmp].16b, %[tmp2].16b\n"      \
+            "aesmc %[tmp].16b, %[tmp].16b\n"
+
+        #define AES_ENC_FINAL_ROUND()                  \
+            "ldr %q[tmp2], %[round_key_second_last]\n" \
+            "aese %[tmp].16b, %[tmp2].16b\n"           \
+            "ldr %q[tmp2], %[round_key_last]\n"        \
+            "eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
+
+        #define AES_ENC_INPUT_ROUND_KEY(num_rounds, n) \
+            [round_key_##n]"m"(ctx->round_keys[(n-1)])
+
+        #define AES_ENC_INPUT_LAST_ROUND_KEYS(num_rounds)                 \
+            [round_key_second_last]"m"(ctx->round_keys[(num_rounds - 1)]), \
+            [round_key_last]"m"(ctx->round_keys[(num_rounds)])
+
+        /* Helper macros to do AES decryption, via inline asm. */
+        #define AES_DEC_ROUND(n)                  \
+            "ldr %q[tmp2], %[round_key_" #n "]\n" \
+            "aesd %[tmp].16b, %[tmp2].16b\n"      \
+            "aesimc %[tmp].16b, %[tmp].16b\n"
+
+        #define AES_DEC_FINAL_ROUND()                  \
+            "ldr %q[tmp2], %[round_key_second_last]\n" \
+            "aesd %[tmp].16b, %[tmp2].16b\n"           \
+            "ldr %q[tmp2], %[round_key_last]\n"        \
+            "eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
+
+        #define AES_DEC_INPUT_ROUND_KEY(num_rounds, n) \
+            [round_key_##n]"m"(ctx->round_keys[(num_rounds + 1 - n)])
+
+        #define AES_DEC_INPUT_LAST_ROUND_KEYS(num_rounds)   \
+            [round_key_second_last]"m"(ctx->round_keys[1]), \
+            [round_key_last]"m"(ctx->round_keys[0])
+
+
+        constexpr const u8 RoundKeyRcon0[] = {
+            0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
+            0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
+        };
+
+        constexpr const u8 SubBytesTable[0x100] = {
+            0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+            0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+            0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+            0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+            0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+            0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+            0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+            0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+            0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+            0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+            0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+            0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+            0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+            0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+            0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+            0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
+        };
+
+        constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
+        constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
+        constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
+        constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
+
+        constexpr u32 SubBytesAndRotate(u32 v) {
+            return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
+        }
+
+        constexpr u32 SubBytes(u32 v) {
+            return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
        }

    }
@@ -34,78 +130,360 @@ namespace ams::crypto::impl {

    template<size_t KeySize>
    void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
-        static_assert(IsSupportedKeySize(KeySize));
+        /* Check pre-conditions. */
        AMS_ASSERT(key_size == KeySize);
        AMS_UNUSED(key_size);

-        if constexpr (KeySize == 16) {
-            /* Aes 128. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
-            aes128ContextCreate(reinterpret_cast<Aes128Context *>(m_round_keys), key, is_encrypt);
-        } else if constexpr (KeySize == 24) {
-            /* Aes 192. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
-            aes192ContextCreate(reinterpret_cast<Aes192Context *>(m_round_keys), key, is_encrypt);
-        } else if constexpr (KeySize == 32) {
-            /* Aes 256. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
-            aes256ContextCreate(reinterpret_cast<Aes256Context *>(m_round_keys), key, is_encrypt);
-        } else {
-            /* Invalid key size. */
-            static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
+        /* Set up key. */
+        u32 *dst = m_round_keys;
+        std::memcpy(dst, key, KeySize);
+
+        /* Perform key scheduling. */
+        constexpr auto InitialKeyWords = KeySize / sizeof(u32);
+        u32 tmp = dst[InitialKeyWords - 1];
+
+        for (auto i = InitialKeyWords; i < (RoundCount  + 1) * 4; ++i) {
+            const auto idx_in_key = i % InitialKeyWords;
+            if (idx_in_key == 0) {
+                /* At start of key word, we need to handle sub/rotate/rcon. */
+                tmp = SubBytesAndRotate(tmp);
+                tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
+            } else if ((InitialKeyWords > 6) && idx_in_key == 4) {
+                /* Halfway into a 256-bit key word, we need to do an additional subbytes. */
+                tmp = SubBytes(tmp);
+            }
+
+            /* Set the key word. */
+            tmp ^= dst[i - InitialKeyWords];
+            dst[i] = tmp;
+        }
+
+        /* If decrypting, perform inverse mix columns on all round keys. */
+        if (!is_encrypt) {
+            auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
+
+            for (auto i = 1; i < RoundCount; ++i) {
+                vst1q_u8(key8, vaesimcq_u8(vld1q_u8(key8)));
+                key8 += BlockSize;
+            }
        }
    }

    template<size_t KeySize>
    void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
-        static_assert(IsSupportedKeySize(KeySize));
-        AMS_ASSERT(src_size >= BlockSize);
-        AMS_ASSERT(dst_size >= BlockSize);
        AMS_UNUSED(src_size, dst_size);

-        if constexpr (KeySize == 16) {
-            /* Aes 128. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
-            aes128EncryptBlock(reinterpret_cast<const Aes128Context *>(m_round_keys), dst, src);
-        } else if constexpr (KeySize == 24) {
-            /* Aes 192. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
-            aes192EncryptBlock(reinterpret_cast<const Aes192Context *>(m_round_keys), dst, src);
-        } else if constexpr (KeySize == 32) {
-            /* Aes 256. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
-            aes256EncryptBlock(reinterpret_cast<const Aes256Context *>(m_round_keys), dst, src);
-        } else {
-            /* Invalid key size. */
-            static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
+        /* Get the key. */
+        const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys);
+
+        /* Read the block. */
+        uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
+
+        /* Encrypt block. */
+        for (auto round = 1; round < RoundCount; ++round) {
+            /* Do aes round. */
+            block = vaeseq_u8(block, vld1q_u8(key8));
+            key8 += BlockSize;
+
+            /* Do mix columns. */
+            block = vaesmcq_u8(block);
        }
+
+        /* Do last aes round. */
+        block = vaeseq_u8(block, vld1q_u8(key8));
+        key8 += BlockSize;
+
+        /* Add the final round key. */
+        block = veorq_u8(block, vld1q_u8(key8));
+
+        /* Store the block. */
+        vst1q_u8(static_cast<u8 *>(dst), block);
    }

    template<size_t KeySize>
    void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
-        static_assert(IsSupportedKeySize(KeySize));
-        AMS_ASSERT(src_size >= BlockSize);
-        AMS_ASSERT(dst_size >= BlockSize);
        AMS_UNUSED(src_size, dst_size);

-        if constexpr (KeySize == 16) {
-            /* Aes 128. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
-            aes128DecryptBlock(reinterpret_cast<const Aes128Context *>(m_round_keys), dst, src);
-        } else if constexpr (KeySize == 24) {
-            /* Aes 192. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
-            aes192DecryptBlock(reinterpret_cast<const Aes192Context *>(m_round_keys), dst, src);
-        } else if constexpr (KeySize == 32) {
-            /* Aes 256. */
-            static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
-            aes256DecryptBlock(reinterpret_cast<const Aes256Context *>(m_round_keys), dst, src);
-        } else {
-            /* Invalid key size. */
-            static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
+        /* Get the key. */
+        const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount + BlockSize);
+
+        /* Read the block. */
+        uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
+
+        /* Encrypt block. */
+        for (auto round = RoundCount; round > 1; --round) {
+            /* Do aes round. */
+            block = vaesdq_u8(block, vld1q_u8(key8));
+            key8 -= BlockSize;
+
+            /* Do mix columns. */
+            block = vaesimcq_u8(block);
        }
+
+        /* Do last aes round. */
+        block = vaesdq_u8(block, vld1q_u8(key8));
+        key8 -= BlockSize;
+
+        /* Add the first round key. */
+        block = veorq_u8(block, vld1q_u8(key8));
+
+        /* Store the block. */
+        vst1q_u8(static_cast<u8 *>(dst), block);
    }

+    /* Specializations when building specifically for cortex-a57 (or for apple M* processors). */
+    #if defined(ATMOSPHERE_CPU_CORTEX_A57) || defined(ATMOSPHERE_OS_MACOS)
+
+    namespace {
+
+        template<size_t KeySize>
+        struct RoundKeyHelper {
+            u8 round_keys[AesImpl<KeySize>::RoundCount + 1][AesImpl<KeySize>::BlockSize];
+        };
+
+    }
+
+    template<>
+    void AesImpl<16>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(1)
+            AES_ENC_ROUND(2)
+            AES_ENC_ROUND(3)
+            AES_ENC_ROUND(4)
+            AES_ENC_ROUND(5)
+            AES_ENC_ROUND(6)
+            AES_ENC_ROUND(7)
+            AES_ENC_ROUND(8)
+            AES_ENC_ROUND(9)
+            AES_ENC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    template<>
+    void AesImpl<24>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(1)
+            AES_ENC_ROUND(2)
+            AES_ENC_ROUND(3)
+            AES_ENC_ROUND(4)
+            AES_ENC_ROUND(5)
+            AES_ENC_ROUND(6)
+            AES_ENC_ROUND(7)
+            AES_ENC_ROUND(8)
+            AES_ENC_ROUND(9)
+            AES_ENC_ROUND(10)
+            AES_ENC_ROUND(11)
+            AES_ENC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
+              AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    template<>
+    void AesImpl<32>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_ENC_ROUND(1)
+            AES_ENC_ROUND(2)
+            AES_ENC_ROUND(3)
+            AES_ENC_ROUND(4)
+            AES_ENC_ROUND(5)
+            AES_ENC_ROUND(6)
+            AES_ENC_ROUND(7)
+            AES_ENC_ROUND(8)
+            AES_ENC_ROUND(9)
+            AES_ENC_ROUND(10)
+            AES_ENC_ROUND(11)
+            AES_ENC_ROUND(12)
+            AES_ENC_ROUND(13)
+            AES_ENC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 12),
+              AES_ENC_INPUT_ROUND_KEY(RoundCount, 13),
+              AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    template<>
+    void AesImpl<16>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(1)
+            AES_DEC_ROUND(2)
+            AES_DEC_ROUND(3)
+            AES_DEC_ROUND(4)
+            AES_DEC_ROUND(5)
+            AES_DEC_ROUND(6)
+            AES_DEC_ROUND(7)
+            AES_DEC_ROUND(8)
+            AES_DEC_ROUND(9)
+            AES_DEC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    template<>
+    void AesImpl<24>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(1)
+            AES_DEC_ROUND(2)
+            AES_DEC_ROUND(3)
+            AES_DEC_ROUND(4)
+            AES_DEC_ROUND(5)
+            AES_DEC_ROUND(6)
+            AES_DEC_ROUND(7)
+            AES_DEC_ROUND(8)
+            AES_DEC_ROUND(9)
+            AES_DEC_ROUND(10)
+            AES_DEC_ROUND(11)
+            AES_DEC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
+              AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    template<>
+    void AesImpl<32>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        AMS_UNUSED(src_size, dst_size);
+
+        /* Setup for asm */
+        AES_ENC_DEC_SETUP_VARS();
+
+        /* Use optimized assembly to do all rounds. */
+        __asm__ __volatile__ (
+            AES_DEC_ROUND(1)
+            AES_DEC_ROUND(2)
+            AES_DEC_ROUND(3)
+            AES_DEC_ROUND(4)
+            AES_DEC_ROUND(5)
+            AES_DEC_ROUND(6)
+            AES_DEC_ROUND(7)
+            AES_DEC_ROUND(8)
+            AES_DEC_ROUND(9)
+            AES_DEC_ROUND(10)
+            AES_DEC_ROUND(11)
+            AES_DEC_ROUND(12)
+            AES_DEC_ROUND(13)
+            AES_DEC_FINAL_ROUND()
+            : AES_ENC_DEC_OUTPUT_VARS()
+            : AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 12),
+              AES_DEC_INPUT_ROUND_KEY(RoundCount, 13),
+              AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
+        );
+
+        /* Store result. */
+        AES_ENC_DEC_STORE_RESULT();
+    }
+
+    #endif

    /* Explicitly instantiate the three supported key sizes. */
    template class AesImpl<16>;
--- a/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.x64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.x64.cpp
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+#include "crypto_aes_impl.arch.x64.hpp"
+
+namespace ams::crypto::impl {
+
+    namespace {
+
+        constexpr bool IsSupportedKeySize(size_t size) {
+            return size == 16 || size == 24 || size == 32;
+        }
+
+        constexpr int WordsPerBlock = AesImpl<16>::BlockSize / sizeof(u32);
+        static_assert(AesImpl<16>::BlockSize == AesImpl<24>::BlockSize);
+        static_assert(AesImpl<16>::BlockSize == AesImpl<32>::BlockSize);
+
+        bool GetAesNiAvailabilityImpl() {
+            /* Call cpu id. */
+            int a = 0, b = 0, c = 0, d = 0;
+            __asm__ __volatile__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(1) : "memory");
+
+            /* Check for AES-NI and SSE2. */
+            return (c & (1 << 25)) && (d & (1 << 26));
+        }
+
+        static_assert(util::IsLittleEndian());
+
+        constexpr const u8 RoundKeyRcon0[] = {
+            0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
+            0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
+        };
+
+        constexpr const u8 SubBytesTable[0x100] = {
+            0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+            0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+            0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+            0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+            0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+            0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+            0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+            0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+            0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+            0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+            0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+            0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+            0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+            0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+            0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+            0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
+        };
+
+        constexpr const u8 InvSubBytesTable[0x100] = {
+            0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
+            0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
+            0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
+            0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
+            0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
+            0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
+            0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
+            0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
+            0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
+            0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
+            0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
+            0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
+            0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
+            0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
+            0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
+            0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D,
+        };
+
+        constexpr bool IsSubBytesTableValid() {
+            for (size_t i = 0; i < 0x100; ++i) {
+                if (SubBytesTable[InvSubBytesTable[i]] != i) {
+                    return false;
+                }
+                if (InvSubBytesTable[SubBytesTable[i]] != i) {
+                    return false;
+                }
+            }
+
+            return true;
+        }
+
+        static_assert(IsSubBytesTableValid());
+
+        constexpr const u32 EncryptTable[0x100] = {
+            0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
+            0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
+            0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
+            0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
+            0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
+            0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
+            0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
+            0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
+            0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
+            0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
+            0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
+            0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
+            0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
+            0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
+            0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
+            0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
+            0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
+            0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
+            0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
+            0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
+            0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
+            0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
+            0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
+            0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
+            0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
+            0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
+            0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
+            0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
+            0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
+            0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
+            0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
+            0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C,
+        };
+
+        constexpr const u32 DecryptTable[0x100] = {
+            0x50A7F451, 0x5365417E, 0xC3A4171A, 0x965E273A, 0xCB6BAB3B, 0xF1459D1F, 0xAB58FAAC, 0x9303E34B,
+            0x55FA3020, 0xF66D76AD, 0x9176CC88, 0x254C02F5, 0xFCD7E54F, 0xD7CB2AC5, 0x80443526, 0x8FA362B5,
+            0x495AB1DE, 0x671BBA25, 0x980EEA45, 0xE1C0FE5D, 0x02752FC3, 0x12F04C81, 0xA397468D, 0xC6F9D36B,
+            0xE75F8F03, 0x959C9215, 0xEB7A6DBF, 0xDA595295, 0x2D83BED4, 0xD3217458, 0x2969E049, 0x44C8C98E,
+            0x6A89C275, 0x78798EF4, 0x6B3E5899, 0xDD71B927, 0xB64FE1BE, 0x17AD88F0, 0x66AC20C9, 0xB43ACE7D,
+            0x184ADF63, 0x82311AE5, 0x60335197, 0x457F5362, 0xE07764B1, 0x84AE6BBB, 0x1CA081FE, 0x942B08F9,
+            0x58684870, 0x19FD458F, 0x876CDE94, 0xB7F87B52, 0x23D373AB, 0xE2024B72, 0x578F1FE3, 0x2AAB5566,
+            0x0728EBB2, 0x03C2B52F, 0x9A7BC586, 0xA50837D3, 0xF2872830, 0xB2A5BF23, 0xBA6A0302, 0x5C8216ED,
+            0x2B1CCF8A, 0x92B479A7, 0xF0F207F3, 0xA1E2694E, 0xCDF4DA65, 0xD5BE0506, 0x1F6234D1, 0x8AFEA6C4,
+            0x9D532E34, 0xA055F3A2, 0x32E18A05, 0x75EBF6A4, 0x39EC830B, 0xAAEF6040, 0x069F715E, 0x51106EBD,
+            0xF98A213E, 0x3D06DD96, 0xAE053EDD, 0x46BDE64D, 0xB58D5491, 0x055DC471, 0x6FD40604, 0xFF155060,
+            0x24FB9819, 0x97E9BDD6, 0xCC434089, 0x779ED967, 0xBD42E8B0, 0x888B8907, 0x385B19E7, 0xDBEEC879,
+            0x470A7CA1, 0xE90F427C, 0xC91E84F8, 0x00000000, 0x83868009, 0x48ED2B32, 0xAC70111E, 0x4E725A6C,
+            0xFBFF0EFD, 0x5638850F, 0x1ED5AE3D, 0x27392D36, 0x64D90F0A, 0x21A65C68, 0xD1545B9B, 0x3A2E3624,
+            0xB1670A0C, 0x0FE75793, 0xD296EEB4, 0x9E919B1B, 0x4FC5C080, 0xA220DC61, 0x694B775A, 0x161A121C,
+            0x0ABA93E2, 0xE52AA0C0, 0x43E0223C, 0x1D171B12, 0x0B0D090E, 0xADC78BF2, 0xB9A8B62D, 0xC8A91E14,
+            0x8519F157, 0x4C0775AF, 0xBBDD99EE, 0xFD607FA3, 0x9F2601F7, 0xBCF5725C, 0xC53B6644, 0x347EFB5B,
+            0x7629438B, 0xDCC623CB, 0x68FCEDB6, 0x63F1E4B8, 0xCADC31D7, 0x10856342, 0x40229713, 0x2011C684,
+            0x7D244A85, 0xF83DBBD2, 0x1132F9AE, 0x6DA129C7, 0x4B2F9E1D, 0xF330B2DC, 0xEC52860D, 0xD0E3C177,
+            0x6C16B32B, 0x99B970A9, 0xFA489411, 0x2264E947, 0xC48CFCA8, 0x1A3FF0A0, 0xD82C7D56, 0xEF903322,
+            0xC74E4987, 0xC1D138D9, 0xFEA2CA8C, 0x360BD498, 0xCF81F5A6, 0x28DE7AA5, 0x268EB7DA, 0xA4BFAD3F,
+            0xE49D3A2C, 0x0D927850, 0x9BCC5F6A, 0x62467E54, 0xC2138DF6, 0xE8B8D890, 0x5EF7392E, 0xF5AFC382,
+            0xBE805D9F, 0x7C93D069, 0xA92DD56F, 0xB31225CF, 0x3B99ACC8, 0xA77D1810, 0x6E639CE8, 0x7BBB3BDB,
+            0x097826CD, 0xF418596E, 0x01B79AEC, 0xA89A4F83, 0x656E95E6, 0x7EE6FFAA, 0x08CFBC21, 0xE6E815EF,
+            0xD99BE7BA, 0xCE366F4A, 0xD4099FEA, 0xD67CB029, 0xAFB2A431, 0x31233F2A, 0x3094A5C6, 0xC066A235,
+            0x37BC4E74, 0xA6CA82FC, 0xB0D090E0, 0x15D8A733, 0x4A9804F1, 0xF7DAEC41, 0x0E50CD7F, 0x2FF69117,
+            0x8DD64D76, 0x4DB0EF43, 0x544DAACC, 0xDF0496E4, 0xE3B5D19E, 0x1B886A4C, 0xB81F2CC1, 0x7F516546,
+            0x04EA5E9D, 0x5D358C01, 0x737487FA, 0x2E410BFB, 0x5A1D67B3, 0x52D2DB92, 0x335610E9, 0x1347D66D,
+            0x8C61D79A, 0x7A0CA137, 0x8E14F859, 0x893C13EB, 0xEE27A9CE, 0x35C961B7, 0xEDE51CE1, 0x3CB1477A,
+            0x59DFD29C, 0x3F73F255, 0x79CE1418, 0xBF37C773, 0xEACDF753, 0x5BAAFD5F, 0x146F3DDF, 0x86DB4478,
+            0x81F3AFCA, 0x3EC468B9, 0x2C342438, 0x5F40A3C2, 0x72C31D16, 0x0C25E2BC, 0x8B493C28, 0x41950DFF,
+            0x7101A839, 0xDEB30C08, 0x9CE4B4D8, 0x90C15664, 0x6184CB7B, 0x70B632D5, 0x745C6C48, 0x4257B8D0,
+        };
+
+        constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
+        constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
+        constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
+        constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
+
+        constexpr auto AesMixShift = 3 * BITSIZEOF(u8);
+
+        constexpr void InverseMixColumns(u32 *dst, const u32 *src) {
+            for (auto i = 0; i < WordsPerBlock; ++i) {
+                const u32 v0 = src[i];
+                const u32 v1 = (((v0 & 0x7F7F7F7Fu) << 1) ^ (((v0 & 0x80808080) >> 7) * 0x1B));
+                const u32 v2 = (((v1 & 0x7F7F7F7Fu) << 1) ^ (((v1 & 0x80808080) >> 7) * 0x1B));
+                const u32 v3 = (((v2 & 0x7F7F7F7Fu) << 1) ^ (((v2 & 0x80808080) >> 7) * 0x1B));
+
+                u32 v = v0 ^ v3;
+                v ^= util::RotateLeft(v, AesMixShift) ^ v2;
+                v ^= util::RotateLeft(v, AesMixShift) ^ v1;
+                v ^= util::RotateLeft(v, AesMixShift) ^ v0;
+
+                dst[i] = v;
+            }
+        }
+
+        constexpr u32 SubBytesAndRotate(u32 v) {
+            return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
+        }
+
+        constexpr u32 SubBytes(u32 v) {
+            return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
+        }
+
+        constexpr u32 ShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
+            return (util::RotateLeft(static_cast<u32>(EncryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(EncryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(EncryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(EncryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
+        }
+
+        constexpr u32 ShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
+            return (static_cast<u32>(SubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
+                   (static_cast<u32>(SubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
+        }
+
+        constexpr u32 InvShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
+            return (util::RotateLeft(static_cast<u32>(DecryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(DecryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(DecryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
+                   (util::RotateLeft(static_cast<u32>(DecryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
+        }
+
+        constexpr u32 InvShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
+            return (static_cast<u32>(InvSubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
+                   (static_cast<u32>(InvSubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
+                   (static_cast<u32>(InvSubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
+                   (static_cast<u32>(InvSubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
+        }
+
+    }
+
+    const bool g_is_aes_ni_available = GetAesNiAvailabilityImpl();
+
+    template<size_t KeySize>
+    AesImpl<KeySize>::~AesImpl() {
+        ClearMemory(this, sizeof(*this));
+    }
+
+    template<size_t KeySize>
+    void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
+        /* Check pre-conditions. */
+        static_assert(IsSupportedKeySize(KeySize));
+        AMS_ASSERT(key != nullptr);
+        AMS_ASSERT(key_size == KeySize);
+        AMS_UNUSED(key_size);
+
+        /* Set up key. */
+        u32 *dst = m_round_keys;
+        std::memcpy(dst, key, KeySize);
+
+        /* Perform key scheduling. */
+        constexpr auto InitialKeyWords = KeySize / sizeof(u32);
+        u32 tmp = dst[InitialKeyWords - 1];
+
+        for (auto i = InitialKeyWords; i < (RoundCount  + 1) * 4; ++i) {
+            const auto idx_in_key = i % InitialKeyWords;
+            if (idx_in_key == 0) {
+                /* At start of key word, we need to handle sub/rotate/rcon. */
+                tmp = SubBytesAndRotate(tmp);
+                tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
+            } else if ((InitialKeyWords > 6) && idx_in_key == 4) {
+                /* Halfway into a 256-bit key word, we need to do an additional subbytes. */
+                tmp = SubBytes(tmp);
+            }
+
+            /* Set the key word. */
+            tmp ^= dst[i - InitialKeyWords];
+            dst[i] = tmp;
+        }
+
+        /* If decrypting, perform inverse mix columns on all round keys. */
+        if (!is_encrypt) {
+            if (IsAesNiAvailable()) {
+                auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
+
+                for (auto i = 1; i < RoundCount; ++i) {
+                    auto * const key128 = reinterpret_cast<__m128i *>(key8);
+                    _mm_storeu_si128(key128, _mm_aesimc_si128(_mm_loadu_si128(key128)));
+                    key8 += BlockSize;
+                }
+            } else {
+                for (auto i = 1; i < RoundCount; ++i) {
+                    InverseMixColumns(m_round_keys + WordsPerBlock * i, m_round_keys + WordsPerBlock * i);
+                }
+            }
+        }
+    }
+
+    template<size_t KeySize>
+    void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        static_assert(IsSupportedKeySize(KeySize));
+        AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
+        AMS_UNUSED(dst_size, src_size);
+
+        /* Perform block encryption. */
+        if (IsAesNiAvailable()) {
+            const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys);
+
+            /* Load the block. */
+            auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+
+            /* Add the first round key. */
+            block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+            key8 += BlockSize;
+
+            /* Perform aes round on remaining round keys. */
+            for (auto i = 1; i < RoundCount; ++i) {
+                block = _mm_aesenc_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+                key8 += BlockSize;
+            }
+
+            /* Do final update. */
+            block = _mm_aesenclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+
+            /* Store the output. */
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
+        } else {
+            static_assert(WordsPerBlock == 4);
+
+            /* Without AES-NI, we'll operate on words. */
+            const u32 *key32 = m_round_keys;
+            const u32 *src32 = static_cast<const u32 *>(src);
+                  u32 *dst32 = static_cast<      u32 *>(dst);
+
+            /* Add the first round key. */
+            u32 v0 = src32[0] ^ key32[0];
+            u32 v1 = src32[1] ^ key32[1];
+            u32 v2 = src32[2] ^ key32[2];
+            u32 v3 = src32[3] ^ key32[3];
+            key32 += 4;
+
+            /* Perform each round. */
+            auto round = RoundCount;
+            while (--round > 0) {
+                /* Perform aes round. */
+                const u32 e0 = ShiftSubMix(v0, v1, v2, v3);
+                const u32 e1 = ShiftSubMix(v1, v2, v3, v0);
+                const u32 e2 = ShiftSubMix(v2, v3, v0, v1);
+                const u32 e3 = ShiftSubMix(v3, v0, v1, v2);
+
+                /* Add the round key. */
+                v0 = e0 ^ key32[0];
+                v1 = e1 ^ key32[1];
+                v2 = e2 ^ key32[2];
+                v3 = e3 ^ key32[3];
+                key32 += 4;
+            }
+
+            /* Perform the final round. */
+            dst32[0] = key32[0] ^ ShiftSub(v0, v1, v2, v3);
+            dst32[1] = key32[1] ^ ShiftSub(v1, v2, v3, v0);
+            dst32[2] = key32[2] ^ ShiftSub(v2, v3, v0, v1);
+            dst32[3] = key32[3] ^ ShiftSub(v3, v0, v1, v2);
+        }
+    }
+
+    template<size_t KeySize>
+    void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
+        static_assert(IsSupportedKeySize(KeySize));
+        AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
+        AMS_UNUSED(dst_size, src_size);
+
+        /* Perform block decryption. */
+        if (IsAesNiAvailable()) {
+            const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount * BlockSize);
+
+            /* Load the block. */
+            auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+
+            /* Add the final round key. */
+            block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+            key8 -= BlockSize;
+
+            /* Perform aes invround on remaining round keys. */
+            for (auto i = RoundCount; i > 1; --i) {
+                block = _mm_aesdec_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+                key8 -= BlockSize;
+            }
+
+            /* Do final update. */
+            block = _mm_aesdeclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
+
+            /* Store the output. */
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
+        } else {
+            static_assert(WordsPerBlock == 4);
+
+            /* Without AES-NI, we'll operate on words. */
+            const u32 *key32 = m_round_keys + WordsPerBlock * RoundCount;
+            const u32 *src32 = static_cast<const u32 *>(src);
+                  u32 *dst32 = static_cast<      u32 *>(dst);
+
+            /* Add the final round key. */
+            u32 v0 = src32[0] ^ key32[0];
+            u32 v1 = src32[1] ^ key32[1];
+            u32 v2 = src32[2] ^ key32[2];
+            u32 v3 = src32[3] ^ key32[3];
+            key32 -= 4;
+
+            /* Perform each round. */
+            auto round = RoundCount;
+            while (--round > 0) {
+                /* Perform aes inv round. */
+                const u32 e0 = InvShiftSubMix(v0, v3, v2, v1);
+                const u32 e1 = InvShiftSubMix(v1, v0, v3, v2);
+                const u32 e2 = InvShiftSubMix(v2, v1, v0, v3);
+                const u32 e3 = InvShiftSubMix(v3, v2, v1, v0);
+
+                /* Add the round key. */
+                v0 = e0 ^ key32[0];
+                v1 = e1 ^ key32[1];
+                v2 = e2 ^ key32[2];
+                v3 = e3 ^ key32[3];
+                key32 -= 4;
+            }
+
+            /* Perform the final round. */
+            dst32[0] = key32[0] ^ InvShiftSub(v0, v3, v2, v1);
+            dst32[1] = key32[1] ^ InvShiftSub(v1, v0, v3, v2);
+            dst32[2] = key32[2] ^ InvShiftSub(v2, v1, v0, v3);
+            dst32[3] = key32[3] ^ InvShiftSub(v3, v2, v1, v0);
+        }
+    }
+
+    /* Explicitly instantiate the three supported key sizes. */
+    template class AesImpl<16>;
+    template class AesImpl<24>;
+    template class AesImpl<32>;
+
+}
--- a/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.x64.hpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.x64.hpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+#include <vapours.hpp>
+#include <x86intrin.h>
+
+namespace ams::crypto::impl {
+
+    extern const bool g_is_aes_ni_available;
+
+    ALWAYS_INLINE bool IsAesNiAvailable() {
+        return g_is_aes_ni_available;
+    }
+
+}
--- a/libraries/libvapours/source/crypto/impl/crypto_bignum_operations.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations.cpp
@@ -45,7 +45,7 @@ namespace ams::crypto::impl {
            return static_cast<BigNum::Word>(half) << BITSIZEOF(BigNum::HalfWord);
        }

-        constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
+        [[maybe_unused]] constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
            static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
            return static_cast<BigNum::Word>(half);
        }
@@ -422,4 +422,69 @@ namespace ams::crypto::impl {
        return true;
    }

+    #if !defined(ATMOSPHERE_ARCH_ARM64)
+    BigNum::Word BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
+        Word carry = 0;
+
+        for (size_t i = 0; i < num_words; ++i) {
+            Word v;
+            if ((v = lhs[i] + carry) < carry) {
+                v = rhs[i];
+            } else if ((v += rhs[i]) < rhs[i]) {
+                carry = 1;
+            } else {
+                carry = 0;
+            }
+
+            dst[i] = v;
+        }
+
+        return carry;
+    }
+
+    BigNum::Word BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
+        Word borrow = 0;
+
+        for (size_t i = 0; i < num_words; ++i) {
+            Word v;
+            if ((v = lhs[i] - borrow) > (BigNum::MaxWord - borrow)) {
+                v = BigNum::MaxWord - rhs[i];
+            } else if ((v -= rhs[i]) > (BigNum::MaxWord - rhs[i])) {
+                borrow = 1;
+            } else {
+                borrow = 0;
+            }
+
+            dst[i] = v;
+        }
+
+        return borrow;
+    }
+
+    BigNum::Word BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) {
+        /* If multiplying by zero, nothing to do. */
+        if (mult == 0) {
+            return 0;
+        }
+
+        Word carry = 0, work[2];
+        for (size_t i = 0; i < num_words; i++) {
+            /* Multiply, calculate carry for next. */
+            MultWord(work, mult, w[i]);
+            if ((dst[i] += carry) < carry) {
+                carry = 1;
+            } else {
+                carry = 0;
+            }
+
+            if ((dst[i] += work[0]) < work[0]) {
+                carry++;
+            }
+            carry += work[1];
+        }
+
+        return carry;
+    }
+    #endif
+
 }
--- a/libraries/libvapours/source/crypto/impl/crypto_bignum_operations_asm.arch.arm64.s
+++ b/libraries/libvapours/source/crypto/impl/crypto_bignum_operations_asm.arch.arm64.s
@@ -15,9 +15,14 @@
 */

 /* ams::crypto::impl::BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
+#if !defined(ATMOSPHERE_OS_MACOS)
 .section    .text._ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, "ax", %progbits
 .global     _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
 .type       _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, %function
+#else
+.text
+.global _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
+#endif
 .balign 0x10
 _ZN3ams6crypto4impl6BigNum3AddEPjPKjPKjm:
    /* Check if we have anything to do at all. */
@@ -106,9 +111,14 @@ _ZN3ams6crypto4impl6BigNum3AddEPjPKjPKjm:
    ret

 /* ams::crypto::impl::BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
+#if !defined(ATMOSPHERE_OS_MACOS)
 .section    .text._ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, "ax", %progbits
 .global     _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
 .type       _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, %function
+#else
+.text
+.global _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
+#endif
 .balign 0x10
 _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
    /* Check if we have anything to do at all. */
@@ -198,9 +208,14 @@ _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
    ret

 /* ams::crypto::impl::BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) */
+#if !defined(ATMOSPHERE_OS_MACOS)
 .section    .text._ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, "ax", %progbits
 .global     _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
 .type       _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, %function
+#else
+.text
+.global     _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
+#endif
 .balign 0x10
 _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj:
    /* Check if we have anything to do at all. */
--- a/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp
@@ -114,11 +114,11 @@ namespace ams::crypto::impl {
            while (num_blocks >= 3) {
                /* Read blocks in. Keep them in registers for XOR later. */
                const uint8x16_t block0 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor128::BlockSize;
                const uint8x16_t block1 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor128::BlockSize;
                const uint8x16_t block2 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor128::BlockSize;

                /* We'll be encrypting the three CTRs. */
                uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -178,11 +178,11 @@ namespace ams::crypto::impl {

                /* Store to output. */
                vst1q_u8(dst, tmp0);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor128::BlockSize;
                vst1q_u8(dst, tmp1);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor128::BlockSize;
                vst1q_u8(dst, tmp2);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor128::BlockSize;

                num_blocks -= 3;
            }
@@ -191,7 +191,7 @@ namespace ams::crypto::impl {
        while (num_blocks >= 1) {
            /* Read block in, keep in register for XOR. */
            const uint8x16_t block0 = vld1q_u8(src);
-            src += AES_BLOCK_SIZE;
+            src += AesEncryptor128::BlockSize;

            /* We'll be encrypting the CTR. */
            uint8x16_t tmp0 = ctr0;
@@ -232,7 +232,7 @@ namespace ams::crypto::impl {

            /* Store to output. */
            vst1q_u8(dst, tmp0);
-            dst += AES_BLOCK_SIZE;
+            dst += AesEncryptor128::BlockSize;

            num_blocks--;
        }
@@ -270,11 +270,11 @@ namespace ams::crypto::impl {
            while (num_blocks >= 3) {
                /* Read blocks in. Keep them in registers for XOR later. */
                const uint8x16_t block0 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor192::BlockSize;
                const uint8x16_t block1 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor192::BlockSize;
                const uint8x16_t block2 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor192::BlockSize;

                /* We'll be encrypting the three CTRs. */
                uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -338,11 +338,11 @@ namespace ams::crypto::impl {

                /* Store to output. */
                vst1q_u8(dst, tmp0);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor192::BlockSize;
                vst1q_u8(dst, tmp1);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor192::BlockSize;
                vst1q_u8(dst, tmp2);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor192::BlockSize;

                num_blocks -= 3;
            }
@@ -351,7 +351,7 @@ namespace ams::crypto::impl {
        while (num_blocks >= 1) {
            /* Read block in, keep in register for XOR. */
            const uint8x16_t block0 = vld1q_u8(src);
-            src += AES_BLOCK_SIZE;
+            src += AesEncryptor192::BlockSize;

            /* We'll be encrypting the CTR. */
            uint8x16_t tmp0 = ctr0;
@@ -396,7 +396,7 @@ namespace ams::crypto::impl {

            /* Store to output. */
            vst1q_u8(dst, tmp0);
-            dst += AES_BLOCK_SIZE;
+            dst += AesEncryptor192::BlockSize;

            num_blocks--;
        }
@@ -436,11 +436,11 @@ namespace ams::crypto::impl {
            while (num_blocks >= 3) {
                /* Read blocks in. Keep them in registers for XOR later. */
                const uint8x16_t block0 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor256::BlockSize;
                const uint8x16_t block1 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor256::BlockSize;
                const uint8x16_t block2 = vld1q_u8(src);
-                src += AES_BLOCK_SIZE;
+                src += AesEncryptor256::BlockSize;

                /* We'll be encrypting the three CTRs. */
                uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -509,11 +509,11 @@ namespace ams::crypto::impl {

                /* Store to output. */
                vst1q_u8(dst, tmp0);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor256::BlockSize;
                vst1q_u8(dst, tmp1);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor256::BlockSize;
                vst1q_u8(dst, tmp2);
-                dst += AES_BLOCK_SIZE;
+                dst += AesEncryptor256::BlockSize;

                num_blocks -= 3;
            }
@@ -522,7 +522,7 @@ namespace ams::crypto::impl {
        while (num_blocks >= 1) {
            /* Read block in, keep in register for XOR. */
            const uint8x16_t block0 = vld1q_u8(src);
-            src += AES_BLOCK_SIZE;
+            src += AesEncryptor256::BlockSize;

            /* We'll be encrypting the CTR. */
            uint8x16_t tmp0 = ctr0;
@@ -571,7 +571,7 @@ namespace ams::crypto::impl {

            /* Store to output. */
            vst1q_u8(dst, tmp0);
-            dst += AES_BLOCK_SIZE;
+            dst += AesEncryptor256::BlockSize;

            num_blocks--;
        }
--- a/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.x64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.x64.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+#include "crypto_aes_impl.arch.x64.hpp"
+
+namespace ams::crypto::impl {
+
+    template<> void CtrModeImpl<AesEncryptor128>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
+        /* Check pre-conditions. */
+        AMS_ASSERT(src != nullptr);
+        AMS_ASSERT(dst != nullptr);
+
+        /* If we have aes-ni, use an optimized impl. */
+        if (IsAesNiAvailable()) {
+            /* Load all keys into sse2 registers. */
+            const u8 *raw_round_keys = m_block_cipher->GetRoundKey();
+            const __m128i round_keys[AesEncryptor128::RoundKeySize / BlockSize] = {
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  0)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  1)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  2)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  3)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  4)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  5)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  6)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  7)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  8)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize *  9)),
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 10)),
+            };
+            static_assert(AesEncryptor128::RoundKeySize / BlockSize == 11);
+
+            /* Declare constant for counter math. */
+            const __m128i One = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+
+            /* Process eight blocks at a time, while we can. */
+            constexpr const auto UnrolledBlockCount = 8;
+            constexpr const auto CounterThreshold   = static_cast<u8>(0x100 - UnrolledBlockCount);
+
+            /* Load the counter. */
+            auto counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
+
+            size_t cur_blocks;
+            for (cur_blocks = 0; cur_blocks + UnrolledBlockCount <= num_blocks; cur_blocks += UnrolledBlockCount) {
+                __m128i b0;
+                __m128i b1;
+                __m128i b2;
+                __m128i b3;
+                __m128i b4;
+                __m128i b5;
+                __m128i b6;
+                __m128i b7;
+
+                __m128i key = round_keys[0];
+
+                /* Get the last byte of the block. */
+                static_assert(util::IsLittleEndian());
+                const u8 counter_val = _mm_extract_epi16(counter, 7) >> BITSIZEOF(u8);
+
+                /* Do initial encryption of each block. */
+                if (CounterThreshold <= counter_val) {
+                    /* We'll overwrap, so take slow path for counter. */
+                    _mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
+
+                    b0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b4 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b5 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b6 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+                    b7 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
+                    this->IncrementCounter();
+
+                    counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
+                } else {
+                    /* We can take the fast path for the counter. */
+                    b0 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b1 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b2 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b3 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b4 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b5 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b6 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                    b7 = _mm_xor_si128(counter, key);
+                    counter = _mm_add_epi64(counter, One);
+                }
+
+                /* Do encryption for all rounds. */
+                key = round_keys[1];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[2];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[3];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[4];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[5];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[6];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[7];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[8];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[9];
+                b0 = _mm_aesenc_si128(b0, key);
+                b1 = _mm_aesenc_si128(b1, key);
+                b2 = _mm_aesenc_si128(b2, key);
+                b3 = _mm_aesenc_si128(b3, key);
+                b4 = _mm_aesenc_si128(b4, key);
+                b5 = _mm_aesenc_si128(b5, key);
+                b6 = _mm_aesenc_si128(b6, key);
+                b7 = _mm_aesenc_si128(b7, key);
+
+                key = round_keys[10];
+                b0 = _mm_aesenclast_si128(b0, key);
+                b1 = _mm_aesenclast_si128(b1, key);
+                b2 = _mm_aesenclast_si128(b2, key);
+                b3 = _mm_aesenclast_si128(b3, key);
+                b4 = _mm_aesenclast_si128(b4, key);
+                b5 = _mm_aesenclast_si128(b5, key);
+                b6 = _mm_aesenclast_si128(b6, key);
+                b7 = _mm_aesenclast_si128(b7, key);
+
+                /* Write the blocks. */
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 0), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 0)), b0));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 1), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 1)), b1));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 2), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 2)), b2));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 3), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 3)), b3));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 4), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 4)), b4));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 5), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 5)), b5));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 6), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 6)), b6));
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 7), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 7)), b7));
+
+                src += BlockSize * UnrolledBlockCount;
+                dst += BlockSize * UnrolledBlockCount;
+            }
+
+            /* Store the updated counter. */
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
+
+            /* Process blocks one at a time. */
+            for (/* ... */; cur_blocks < num_blocks; ++cur_blocks) {
+                /* Load current counter. */
+                __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
+
+                /* Do aes rounds. */
+                b = _mm_xor_si128(b, round_keys[0]);
+                b = _mm_aesenc_si128(b, round_keys[1]);
+                b = _mm_aesenc_si128(b, round_keys[2]);
+                b = _mm_aesenc_si128(b, round_keys[3]);
+                b = _mm_aesenc_si128(b, round_keys[4]);
+                b = _mm_aesenc_si128(b, round_keys[5]);
+                b = _mm_aesenc_si128(b, round_keys[6]);
+                b = _mm_aesenc_si128(b, round_keys[7]);
+                b = _mm_aesenc_si128(b, round_keys[8]);
+                b = _mm_aesenc_si128(b, round_keys[9]);
+                b = _mm_aesenclast_si128(b, round_keys[10]);
+
+                /* Write the block. */
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)), b));
+
+                /* Advance. */
+                src += BlockSize;
+                dst += BlockSize;
+                this->IncrementCounter();
+            }
+        } else {
+            /* Fall back to the default implementation. */
+            while (num_blocks--) {
+                this->ProcessBlock(dst, src, BlockSize);
+                dst += BlockSize;
+                src += BlockSize;
+            }
+        }
+    }
+
+}
--- a/libraries/libvapours/source/crypto/impl/crypto_sha1_impl.arch.arm64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_sha1_impl.arch.arm64.cpp
@@ -15,31 +15,233 @@
 */
 #include <vapours.hpp>

+#if defined(ATMOSPHERE_IS_STRATOSPHERE)
+#include <arm_neon.h>
+
 namespace ams::crypto::impl {

-#ifdef ATMOSPHERE_IS_STRATOSPHERE
+    namespace {
+
+        constexpr const u32 RoundConstants[4] = {
+            0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+        };
+
+        /* Define for loading work var from message. */
+        #define SHA1_LOAD_W_FROM_MESSAGE(which)                      \
+        w[which] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data))); \
+        data += 0x10
+
+        #define SHA1_CALCULATE_W_FROM_PREVIOUS(i) \
+        w[i] = vsha1su1q_u32(vsha1su0q_u32(w[i-4], w[i-3], w[i-2]), w[i-1])
+
+        /* Define for doing four rounds of SHA1. */
+        #define SHA1_DO_ROUND(r, insn, constant)                                   \
+        do {                                                                       \
+            const u32 a = vgetq_lane_u32(cur_abcd, 0);                             \
+            cur_abcd = v##insn##q_u32(cur_abcd, cur_e, vaddq_u32(w[r], constant)); \
+            cur_e = vsha1h_u32(a);                                                 \
+        } while (0)
+
+
+    }

    void Sha1Impl::Initialize() {
-        static_assert(sizeof(m_state) == sizeof(::Sha1Context));
-        ::sha1ContextCreate(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)));
+        /* Reset buffered bytes/bits. */
+        m_buffered_bytes = 0;
+        m_bits_consumed  = 0;
+
+        /* Set intermediate hash. */
+        m_intermediate_hash[0] = 0x67452301;
+        m_intermediate_hash[1] = 0xEFCDAB89;
+        m_intermediate_hash[2] = 0x98BADCFE;
+        m_intermediate_hash[3] = 0x10325476;
+        m_intermediate_hash[4] = 0xC3D2E1F0;
+
+        /* Set state. */
+        m_state = State_Initialized;
    }

    void Sha1Impl::Update(const void *data, size_t size) {
-        static_assert(sizeof(m_state) == sizeof(::Sha1Context));
-        ::sha1ContextUpdate(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)), data, size);
+        /* Verify we're in a state to update. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Advance our input bit count. */
+        m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
+
+        /* Process anything we have buffered. */
+        const u8 *data8 = static_cast<const u8 *>(data);
+        size_t remaining = size;
+
+        if (m_buffered_bytes > 0) {
+            const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
+            std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
+
+            data8            += copy_size;
+            remaining        -= copy_size;
+            m_buffered_bytes += copy_size;
+
+            /* Process a block, if we filled one. */
+            if (m_buffered_bytes == BlockSize) {
+                this->ProcessBlock(m_buffer);
+                m_buffered_bytes = 0;
+            }
+        }
+
+        /* Process blocks, if we have any. */
+        if (remaining >= BlockSize) {
+            const size_t blocks = remaining / BlockSize;
+
+            this->ProcessBlocks(data8, blocks);
+            data8     += BlockSize * blocks;
+            remaining -= BlockSize * blocks;
+        }
+
+        /* Copy any leftover data to our buffer. */
+        if (remaining > 0) {
+            m_buffered_bytes = remaining;
+            std::memcpy(m_buffer, data8, remaining);
+        }
    }

    void Sha1Impl::GetHash(void *dst, size_t size) {
-        static_assert(sizeof(m_state) == sizeof(::Sha1Context));
+        /* Verify we're in a state to get hash. */
+        AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
        AMS_ASSERT(size >= HashSize);
        AMS_UNUSED(size);
-        ::sha1ContextGetHash(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)), dst);
+
+        /* If we need to, process the last block. */
+        if (m_state == State_Initialized) {
+            this->ProcessLastBlock();
+            m_state = State_Done;
+        }
+
+        /* Copy the output hash. */
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(HashSize % sizeof(u32) == 0);
+
+            u32 *dst_32 = static_cast<u32 *>(dst);
+            for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
+                dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
+            }
+        } else {
+            std::memcpy(dst, m_intermediate_hash, HashSize);
+        }
    }

-#else
+    ALWAYS_INLINE void Sha1Impl::ProcessBlock(const void *data) {
+        return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
+    }

-    /* TODO: Non-EL0 implementation. */
+    void Sha1Impl::ProcessBlocks(const u8 *data, size_t block_count) {
+        /* Setup round constants. */
+        const uint32x4_t k0 = vdupq_n_u32(RoundConstants[0]);
+        const uint32x4_t k1 = vdupq_n_u32(RoundConstants[1]);
+        const uint32x4_t k2 = vdupq_n_u32(RoundConstants[2]);
+        const uint32x4_t k3 = vdupq_n_u32(RoundConstants[3]);

-#endif
+        /* Load hash variables with intermediate state. */
+        uint32x4_t cur_abcd = vld1q_u32(m_intermediate_hash + 0);
+        u32 cur_e = m_intermediate_hash[4];

-}
+        /* Actually do hash processing blocks. */
+        do {
+            /* Save current state. */
+            const uint32x4_t prev_abcd = cur_abcd;
+            const u32 prev_e = cur_e;
+
+            uint32x4_t w[20];
+
+            /* Setup w[0-3] with message. */
+            SHA1_LOAD_W_FROM_MESSAGE(0);
+            SHA1_LOAD_W_FROM_MESSAGE(1);
+            SHA1_LOAD_W_FROM_MESSAGE(2);
+            SHA1_LOAD_W_FROM_MESSAGE(3);
+
+            /* Calculate w[4-19], w[i] = sha1su1(sha1su0(w[i-4], w[i-3], w[i-2]), w[i-1]); */
+            SHA1_CALCULATE_W_FROM_PREVIOUS(4);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(5);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(6);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(7);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(8);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(9);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(10);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(11);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(12);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(13);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(14);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(15);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(16);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(17);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(18);
+            SHA1_CALCULATE_W_FROM_PREVIOUS(19);
+
+            /* Do round calculations 0-20. Uses sha1c, k0. */
+            SHA1_DO_ROUND(0, sha1c, k0);
+            SHA1_DO_ROUND(1, sha1c, k0);
+            SHA1_DO_ROUND(2, sha1c, k0);
+            SHA1_DO_ROUND(3, sha1c, k0);
+            SHA1_DO_ROUND(4, sha1c, k0);
+
+            /* Do round calculations 20-40. Uses sha1p, k1. */
+            SHA1_DO_ROUND(5, sha1p, k1);
+            SHA1_DO_ROUND(6, sha1p, k1);
+            SHA1_DO_ROUND(7, sha1p, k1);
+            SHA1_DO_ROUND(8, sha1p, k1);
+            SHA1_DO_ROUND(9, sha1p, k1);
+
+            /* Do round calculations 40-60. Uses sha1m, k2. */
+            SHA1_DO_ROUND(10, sha1m, k2);
+            SHA1_DO_ROUND(11, sha1m, k2);
+            SHA1_DO_ROUND(12, sha1m, k2);
+            SHA1_DO_ROUND(13, sha1m, k2);
+            SHA1_DO_ROUND(14, sha1m, k2);
+
+            /* Do round calculations 60-80. Uses sha1p, k3. */
+            SHA1_DO_ROUND(15, sha1p, k3);
+            SHA1_DO_ROUND(16, sha1p, k3);
+            SHA1_DO_ROUND(17, sha1p, k3);
+            SHA1_DO_ROUND(18, sha1p, k3);
+            SHA1_DO_ROUND(19, sha1p, k3);
+
+            /* Add to previous. */
+            cur_abcd = vaddq_u32(cur_abcd, prev_abcd);
+            cur_e = cur_e + prev_e;
+        } while (--block_count != 0);
+
+        /* Save result to intermediate hash. */
+        vst1q_u32(m_intermediate_hash, cur_abcd);
+        m_intermediate_hash[4] = cur_e;
+    }
+
+    void Sha1Impl::ProcessLastBlock() {
+        /* Setup the final block. */
+        constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
+
+        /* Increment our bits consumed. */
+        m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
+
+        /* Add 0x80 terminator. */
+        m_buffer[m_buffered_bytes++] = 0x80;
+
+        /* If we can process the size field directly, do so, otherwise set up to process it. */
+        if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
+            /* Clear up to size field. */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
+        } else {
+            /* Consume full block */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
+            this->ProcessBlock(m_buffer);
+
+            /* Clear up to size field. */
+            std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
+        }
+
+        /* Store the size field. */
+        util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
+
+        /* Process the final block. */
+        this->ProcessBlock(m_buffer);
+    }
+
+}
+#endif
--- a/libraries/libvapours/source/crypto/impl/crypto_sha1_impl.arch.generic.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_sha1_impl.arch.generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto::impl {
+
+    namespace {
+
+        constexpr const u32 RoundConstants[4] = {
+            0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+        };
+
+        constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
+            return (x & y) ^ ((~x) & z);
+        }
+
+        constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
+            return (x & y) ^ (x & z) ^ (y & z);
+        }
+
+        constexpr ALWAYS_INLINE u32 Parity(u32 x, u32 y, u32 z) {
+            return x ^ y ^ z;
+        }
+
+    }
+
+    void Sha1Impl::Initialize() {
+        /* Reset buffered bytes/bits. */
+        m_buffered_bytes = 0;
+        m_bits_consumed  = 0;
+
+        /* Set intermediate hash. */
+        m_intermediate_hash[0] = 0x67452301;
+        m_intermediate_hash[1] = 0xEFCDAB89;
+        m_intermediate_hash[2] = 0x98BADCFE;
+        m_intermediate_hash[3] = 0x10325476;
+        m_intermediate_hash[4] = 0xC3D2E1F0;
+
+        /* Set state. */
+        m_state = State_Initialized;
+    }
+
+    void Sha1Impl::Update(const void *data, size_t size) {
+        /* Verify we're in a state to update. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Advance our input bit count. */
+        m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
+
+        /* Process anything we have buffered. */
+        const u8 *data8 = static_cast<const u8 *>(data);
+        size_t remaining = size;
+
+        if (m_buffered_bytes > 0) {
+            const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
+            std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
+
+            data8            += copy_size;
+            remaining        -= copy_size;
+            m_buffered_bytes += copy_size;
+
+            /* Process a block, if we filled one. */
+            if (m_buffered_bytes == BlockSize) {
+                this->ProcessBlock(m_buffer);
+                m_buffered_bytes = 0;
+            }
+        }
+
+        /* Process blocks, while we have any. */
+        while (remaining >= BlockSize) {
+            this->ProcessBlock(data8);
+            data8     += BlockSize;
+            remaining -= BlockSize;
+        }
+
+        /* Copy any leftover data to our buffer. */
+        if (remaining > 0) {
+            m_buffered_bytes = remaining;
+            std::memcpy(m_buffer, data8, remaining);
+        }
+    }
+
+    void Sha1Impl::GetHash(void *dst, size_t size) {
+        /* Verify we're in a state to get hash. */
+        AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
+        AMS_ASSERT(size >= HashSize);
+        AMS_UNUSED(size);
+
+        /* If we need to, process the last block. */
+        if (m_state == State_Initialized) {
+            this->ProcessLastBlock();
+            m_state = State_Done;
+        }
+
+        /* Copy the output hash. */
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(HashSize % sizeof(u32) == 0);
+
+            u32 *dst_32 = static_cast<u32 *>(dst);
+            for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
+                dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
+            }
+        } else {
+            std::memcpy(dst, m_intermediate_hash, HashSize);
+        }
+    }
+
+    void Sha1Impl::ProcessBlock(const void *data) {
+        /* Load work variables. */
+        u32 a = m_intermediate_hash[0];
+        u32 b = m_intermediate_hash[1];
+        u32 c = m_intermediate_hash[2];
+        u32 d = m_intermediate_hash[3];
+        u32 e = m_intermediate_hash[4];
+        u32 tmp;
+        size_t i;
+
+        /* Copy the input. */
+        u32 w[80];
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(BlockSize % sizeof(u32) == 0);
+
+            const u32 *src_32 = static_cast<const u32 *>(data);
+            for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
+                w[i] = util::LoadBigEndian<u32>(src_32 + i);
+            }
+        } else {
+            std::memcpy(w, data, BlockSize);
+        }
+
+        /* Initialize the rest of w. */
+        for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
+            const u32 *prev = w + (i - BlockSize / sizeof(u32));
+            w[i] = util::RotateLeft<u32>(prev[0] ^ prev[2] ^ prev[8] ^ prev[13], 1);
+        }
+
+        /* Perform rounds. */
+        for (i = 0; i < 20; ++i) {
+            tmp = util::RotateLeft<u32>(a, 5) + Choose(b, c, d) + e + w[i] + RoundConstants[0];
+            e = d;
+            d = c;
+            c = util::RotateLeft<u32>(b, 30);
+            b = a;
+            a = tmp;
+        }
+
+        for (/* ... */; i < 40; ++i) {
+            tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[1];
+            e = d;
+            d = c;
+            c = util::RotateLeft<u32>(b, 30);
+            b = a;
+            a = tmp;
+        }
+
+        for (/* ... */; i < 60; ++i) {
+            tmp = util::RotateLeft<u32>(a, 5) + Majority(b, c, d) + e + w[i] + RoundConstants[2];
+            e = d;
+            d = c;
+            c = util::RotateLeft<u32>(b, 30);
+            b = a;
+            a = tmp;
+        }
+
+        for (/* ... */; i < 80; ++i) {
+            tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[3];
+            e = d;
+            d = c;
+            c = util::RotateLeft<u32>(b, 30);
+            b = a;
+            a = tmp;
+        }
+
+        /* Update intermediate hash. */
+        m_intermediate_hash[0] += a;
+        m_intermediate_hash[1] += b;
+        m_intermediate_hash[2] += c;
+        m_intermediate_hash[3] += d;
+        m_intermediate_hash[4] += e;
+    }
+
+    void Sha1Impl::ProcessLastBlock() {
+        /* Setup the final block. */
+        constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
+
+        /* Increment our bits consumed. */
+        m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
+
+        /* Add 0x80 terminator. */
+        m_buffer[m_buffered_bytes++] = 0x80;
+
+        /* If we can process the size field directly, do so, otherwise set up to process it. */
+        if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
+            /* Clear up to size field. */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
+        } else {
+            /* Consume full block */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
+            this->ProcessBlock(m_buffer);
+
+            /* Clear up to size field. */
+            std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
+        }
+
+        /* Store the size field. */
+        util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
+
+        /* Process the final block. */
+        this->ProcessBlock(m_buffer);
+    }
+
+}
--- a/libraries/libvapours/source/crypto/impl/crypto_sha256_impl.arch.arm64.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_sha256_impl.arch.arm64.cpp
@@ -15,53 +15,313 @@
 */
 #include <vapours.hpp>

+#if defined(ATMOSPHERE_IS_STRATOSPHERE)
+#include <arm_neon.h>
+
 namespace ams::crypto::impl {

-#ifdef ATMOSPHERE_IS_STRATOSPHERE
+    namespace {
+
+        alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
+            0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+            0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+            0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+            0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+            0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+            0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+            0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+            0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+            0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+            0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+            0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+            0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+            0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+            0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+            0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+            0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+        };
+
+    }

    void Sha256Impl::Initialize() {
-        static_assert(sizeof(m_state) == sizeof(::Sha256Context));
-        ::sha256ContextCreate(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)));
+        /* Reset buffered bytes/bits. */
+        m_buffered_bytes = 0;
+        m_bits_consumed  = 0;
+
+        /* Set intermediate hash. */
+        m_intermediate_hash[0] = 0x6A09E667;
+        m_intermediate_hash[1] = 0xBB67AE85;
+        m_intermediate_hash[2] = 0x3C6EF372;
+        m_intermediate_hash[3] = 0xA54FF53A;
+        m_intermediate_hash[4] = 0x510E527F;
+        m_intermediate_hash[5] = 0x9B05688C;
+        m_intermediate_hash[6] = 0x1F83D9AB;
+        m_intermediate_hash[7] = 0x5BE0CD19;
+
+        /* Set state. */
+        m_state = State_Initialized;
    }

    void Sha256Impl::Update(const void *data, size_t size) {
-        static_assert(sizeof(m_state) == sizeof(::Sha256Context));
-        ::sha256ContextUpdate(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)), data, size);
+        /* Verify we're in a state to update. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Advance our input bit count. */
+        m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
+
+        /* Process anything we have buffered. */
+        const u8 *data8 = static_cast<const u8 *>(data);
+        size_t remaining = size;
+
+        if (m_buffered_bytes > 0) {
+            const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
+            std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
+
+            data8            += copy_size;
+            remaining        -= copy_size;
+            m_buffered_bytes += copy_size;
+
+            /* Process a block, if we filled one. */
+            if (m_buffered_bytes == BlockSize) {
+                this->ProcessBlock(m_buffer);
+                m_buffered_bytes = 0;
+            }
+        }
+
+        /* Process blocks, if we have any. */
+        if (remaining >= BlockSize) {
+            const size_t blocks = remaining / BlockSize;
+
+            this->ProcessBlocks(data8, blocks);
+            data8     += BlockSize * blocks;
+            remaining -= BlockSize * blocks;
+        }
+
+        /* Copy any leftover data to our buffer. */
+        if (remaining > 0) {
+            m_buffered_bytes = remaining;
+            std::memcpy(m_buffer, data8, remaining);
+        }
    }

    void Sha256Impl::GetHash(void *dst, size_t size) {
-        static_assert(sizeof(m_state) == sizeof(::Sha256Context));
+        /* Verify we're in a state to get hash. */
+        AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
        AMS_ASSERT(size >= HashSize);
        AMS_UNUSED(size);

-        ::sha256ContextGetHash(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)), dst);
+        /* If we need to, process the last block. */
+        if (m_state == State_Initialized) {
+            this->ProcessLastBlock();
+            m_state = State_Done;
+        }
+
+        /* Copy the output hash. */
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(HashSize % sizeof(u32) == 0);
+
+            u32 *dst_32 = static_cast<u32 *>(dst);
+            for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
+                dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
+            }
+        } else {
+            std::memcpy(dst, m_intermediate_hash, HashSize);
+        }
    }

    void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
-        static_assert(sizeof(m_state) == sizeof(::Sha256Context));
-
        /* Copy state in from the context. */
-        std::memcpy(m_state.intermediate_hash, context->intermediate_hash, sizeof(m_state.intermediate_hash));
-        m_state.bits_consumed = context->bits_consumed;
+        std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
+        m_bits_consumed = context->bits_consumed;

-        /* Clear the rest of state. */
-        std::memset(m_state.buffer, 0, sizeof(m_state.buffer));
-        m_state.num_buffered = 0;
-        m_state.finalized = false;
+        /* Reset other fields. */
+        m_buffered_bytes = 0;
+        m_state = State_Initialized;
    }

    size_t Sha256Impl::GetContext(Sha256Context *context) const {
-        static_assert(sizeof(m_state) == sizeof(::Sha256Context));
-        std::memcpy(context->intermediate_hash, m_state.intermediate_hash, sizeof(context->intermediate_hash));
-        context->bits_consumed = m_state.bits_consumed;
+        /* Check our state. */
+        AMS_ASSERT(m_state == State_Initialized);

-        return m_state.num_buffered;
+        /* Copy out the context. */
+        std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
+        context->bits_consumed = m_bits_consumed;
+
+        return m_buffered_bytes;
    }

-#else
+    ALWAYS_INLINE void Sha256Impl::ProcessBlock(const void *data) {
+        return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
+    }

-    /* TODO: Non-EL0 implementation. */
+    void Sha256Impl::ProcessBlocks(const u8 *data, size_t block_count) {
+        /* Load previous hash with intermediate state, current hash with zeroes. */
+        uint32x4_t prev_hash0 = vld1q_u32(m_intermediate_hash + 0);
+        uint32x4_t prev_hash1 = vld1q_u32(m_intermediate_hash + 4);
+        uint32x4_t cur_hash0  = vdupq_n_u32(0);
+        uint32x4_t cur_hash1  = vdupq_n_u32(0);

-#endif
+        /* Process blocks. */
+        do {
+            uint32x4_t round_constant0, round_constant1;
+            uint32x4_t data0, data1, data2, data3;
+            uint32x4_t tmp0, tmp1, tmp2, tmp3;
+            uint32x4_t tmp_hash;

-}
+            /* Use optimized ASM implementation to process the block. */
+            __asm__ __volatile__ (
+                "ldp       %q[data0], %q[data1], [%[data]], #0x20\n"
+                "ldp       %q[data2], %q[data3], [%[data]], #0x20\n"
+                "add       %[cur_hash0].4s, %[cur_hash0].4s, %[prev_hash0].4s\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0x00]\n"
+                "add       %[cur_hash1].4s, %[cur_hash1].4s, %[prev_hash1].4s\n"
+                "rev32     %[data0].16b, %[data0].16b\n"
+                "rev32     %[data1].16b, %[data1].16b\n"
+                "rev32     %[data2].16b, %[data2].16b\n"
+                "rev32     %[data3].16b, %[data3].16b\n"
+                "add       %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
+                "add       %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0x20]\n"
+                "sha256su0 %[data0].4s, %[data1].4s\n"
+                "mov       %[prev_hash0].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
+                "mov       %[prev_hash1].16b, %[cur_hash1].16b\n"
+                "sha256h2  %q[cur_hash1], %q[prev_hash0], %[tmp0].4s\n"
+                "sha256su0 %[data1].4s, %[data2].4s\n"
+                "sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
+                "add       %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
+                "sha256su0 %[data2].4s, %[data3].4s\n"
+                "sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
+                "add       %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0x40]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
+                "sha256su0 %[data3].4s, %[data0].4s\n"
+                "sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
+                "add       %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
+                "sha256su0 %[data0].4s, %[data1].4s\n"
+                "sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
+                "add       %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0x60]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
+                "sha256su0 %[data1].4s, %[data2].4s\n"
+                "sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
+                "add       %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
+                "sha256su0 %[data2].4s, %[data3].4s\n"
+                "sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
+                "add       %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0x80]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
+                "sha256su0 %[data3].4s, %[data0].4s\n"
+                "sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
+                "add       %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
+                "sha256su0 %[data0].4s, %[data1].4s\n"
+                "sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
+                "add       %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0xA0]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
+                "sha256su0 %[data1].4s, %[data2].4s\n"
+                "sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
+                "add       %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
+                "sha256su0 %[data2].4s, %[data3].4s\n"
+                "sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
+                "add       %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0xC0]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
+                "sha256su0 %[data3].4s, %[data0].4s\n"
+                "sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
+                "add       %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
+                "sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
+                "add       %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "ldp       %q[round_constant0], %q[round_constant1], [%[round_constants], 0xE0]\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
+                "add       %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
+                "add       %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
+                "mov       %[tmp_hash].16b, %[cur_hash0].16b\n"
+                "sha256h   %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
+                "sha256h2  %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
+                : [data0]"=w"(data0), [data1]"=w"(data1), [data2]"=w"(data2), [data3]"=w"(data3),
+                  [tmp0]"=w"(tmp0), [tmp1]"=w"(tmp1), [tmp2]"=w"(tmp2), [tmp3]"=w"(tmp3),
+                  [round_constant0]"=w"(round_constant0), [round_constant1]"=w"(round_constant1),
+                  [cur_hash0]"+w"(cur_hash0), [cur_hash1]"+w"(cur_hash1),
+                  [prev_hash0]"+w"(prev_hash0), [prev_hash1]"+w"(prev_hash1),
+                  [tmp_hash]"=w"(tmp_hash), [data]"+r"(data)
+                : [round_constants]"r"(RoundConstants)
+                :
+            );
+        } while (--block_count != 0);
+
+        /* Add hashes together, and store. */
+        cur_hash0 = vaddq_u32(prev_hash0, cur_hash0);
+        cur_hash1 = vaddq_u32(prev_hash1, cur_hash1);
+        vst1q_u32(m_intermediate_hash + 0, cur_hash0);
+        vst1q_u32(m_intermediate_hash + 4, cur_hash1);
+    }
+
+    void Sha256Impl::ProcessLastBlock() {
+        /* Setup the final block. */
+        constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
+
+        /* Increment our bits consumed. */
+        m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
+
+        /* Add 0x80 terminator. */
+        m_buffer[m_buffered_bytes++] = 0x80;
+
+        /* If we can process the size field directly, do so, otherwise set up to process it. */
+        if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
+            /* Clear up to size field. */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
+        } else {
+            /* Consume full block */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
+            this->ProcessBlock(m_buffer);
+
+            /* Clear up to size field. */
+            std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
+        }
+
+        /* Store the size field. */
+        util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
+
+        /* Process the final block. */
+        this->ProcessBlock(m_buffer);
+    }
+
+}
+#endif
--- a/libraries/libvapours/source/crypto/impl/crypto_sha256_impl.arch.generic.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_sha256_impl.arch.generic.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto::impl {
+
+    namespace {
+
+        alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
+            0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+            0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+            0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+            0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+            0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+            0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+            0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+            0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+            0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+            0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+            0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+            0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+            0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+            0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+            0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+            0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+        };
+
+        constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
+            return (x & y) ^ ((~x) & z);
+        }
+
+        constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
+            return (x & y) ^ (x & z) ^ (y & z);
+        }
+
+        constexpr ALWAYS_INLINE u32 LargeSigma0(u32 x) {
+            return util::RotateRight<u32>(x, 2) ^ util::RotateRight<u32>(x, 13) ^ util::RotateRight<u32>(x, 22);
+        }
+
+        constexpr ALWAYS_INLINE u32 LargeSigma1(u32 x) {
+            return util::RotateRight<u32>(x, 6) ^ util::RotateRight<u32>(x, 11) ^ util::RotateRight<u32>(x, 25);
+        }
+
+        constexpr ALWAYS_INLINE u32 SmallSigma0(u32 x) {
+            return util::RotateRight<u32>(x, 7) ^ util::RotateRight<u32>(x, 18) ^ (x >> 3);
+        }
+
+        constexpr ALWAYS_INLINE u32 SmallSigma1(u32 x) {
+            return util::RotateRight<u32>(x, 17) ^ util::RotateRight<u32>(x, 19) ^ (x >> 10);
+        }
+
+    }
+
+    void Sha256Impl::Initialize() {
+        /* Reset buffered bytes/bits. */
+        m_buffered_bytes = 0;
+        m_bits_consumed  = 0;
+
+        /* Set intermediate hash. */
+        m_intermediate_hash[0] = 0x6A09E667;
+        m_intermediate_hash[1] = 0xBB67AE85;
+        m_intermediate_hash[2] = 0x3C6EF372;
+        m_intermediate_hash[3] = 0xA54FF53A;
+        m_intermediate_hash[4] = 0x510E527F;
+        m_intermediate_hash[5] = 0x9B05688C;
+        m_intermediate_hash[6] = 0x1F83D9AB;
+        m_intermediate_hash[7] = 0x5BE0CD19;
+
+        /* Set state. */
+        m_state = State_Initialized;
+    }
+
+    void Sha256Impl::Update(const void *data, size_t size) {
+        /* Verify we're in a state to update. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Advance our input bit count. */
+        m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
+
+        /* Process anything we have buffered. */
+        const u8 *data8 = static_cast<const u8 *>(data);
+        size_t remaining = size;
+
+        if (m_buffered_bytes > 0) {
+            const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
+            std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
+
+            data8            += copy_size;
+            remaining        -= copy_size;
+            m_buffered_bytes += copy_size;
+
+            /* Process a block, if we filled one. */
+            if (m_buffered_bytes == BlockSize) {
+                this->ProcessBlock(m_buffer);
+                m_buffered_bytes = 0;
+            }
+        }
+
+        /* Process blocks, if we have any. */
+        while (remaining >= BlockSize) {
+            this->ProcessBlock(data8);
+            data8     += BlockSize;
+            remaining -= BlockSize;
+        }
+
+        /* Copy any leftover data to our buffer. */
+        if (remaining > 0) {
+            m_buffered_bytes = remaining;
+            std::memcpy(m_buffer, data8, remaining);
+        }
+    }
+
+    void Sha256Impl::GetHash(void *dst, size_t size) {
+        /* Verify we're in a state to get hash. */
+        AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
+        AMS_ASSERT(size >= HashSize);
+        AMS_UNUSED(size);
+
+        /* If we need to, process the last block. */
+        if (m_state == State_Initialized) {
+            this->ProcessLastBlock();
+            m_state = State_Done;
+        }
+
+        /* Copy the output hash. */
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(HashSize % sizeof(u32) == 0);
+
+            u32 *dst_32 = static_cast<u32 *>(dst);
+            for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
+                dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
+            }
+        } else {
+            std::memcpy(dst, m_intermediate_hash, HashSize);
+        }
+    }
+
+    void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
+        /* Copy state in from the context. */
+        std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
+        m_bits_consumed = context->bits_consumed;
+
+        /* Reset other fields. */
+        m_buffered_bytes = 0;
+        m_state = State_Initialized;
+    }
+
+    size_t Sha256Impl::GetContext(Sha256Context *context) const {
+        /* Check our state. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Copy out the context. */
+        std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
+        context->bits_consumed = m_bits_consumed;
+
+        return m_buffered_bytes;
+    }
+
+    void Sha256Impl::ProcessBlock(const void *data) {
+        /* Load work variables. */
+        u32 a = m_intermediate_hash[0];
+        u32 b = m_intermediate_hash[1];
+        u32 c = m_intermediate_hash[2];
+        u32 d = m_intermediate_hash[3];
+        u32 e = m_intermediate_hash[4];
+        u32 f = m_intermediate_hash[5];
+        u32 g = m_intermediate_hash[6];
+        u32 h = m_intermediate_hash[7];
+        u32 tmp[2];
+        size_t i;
+
+        /* Copy the input. */
+        u32 w[64];
+        if constexpr (util::IsLittleEndian()) {
+            static_assert(BlockSize % sizeof(u32) == 0);
+
+            const u32 *src_32 = static_cast<const u32 *>(data);
+            for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
+                w[i] = util::LoadBigEndian<u32>(src_32 + i);
+            }
+        } else {
+            std::memcpy(w, data, BlockSize);
+        }
+
+        /* Initialize the rest of w. */
+        for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
+            const u32 *prev = w + (i - BlockSize / sizeof(u32));
+            w[i] = prev[0] + SmallSigma0(prev[1]) + prev[9] + SmallSigma1(prev[14]);
+        }
+
+        /* Perform rounds. */
+        for (i = 0; i < 64; ++i) {
+            tmp[0] = h + LargeSigma1(e) + Choose(e, f, g) + RoundConstants[i] + w[i];
+            tmp[1] = LargeSigma0(a) + Majority(a, b, c);
+
+            h = g;
+            g = f;
+            f = e;
+            e = d + tmp[0];
+            d = c;
+            c = b;
+            b = a;
+            a = tmp[0] + tmp[1];
+        }
+
+        /* Update intermediate hash. */
+        m_intermediate_hash[0] += a;
+        m_intermediate_hash[1] += b;
+        m_intermediate_hash[2] += c;
+        m_intermediate_hash[3] += d;
+        m_intermediate_hash[4] += e;
+        m_intermediate_hash[5] += f;
+        m_intermediate_hash[6] += g;
+        m_intermediate_hash[7] += h;
+    }
+
+    void Sha256Impl::ProcessLastBlock() {
+        /* Setup the final block. */
+        constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
+
+        /* Increment our bits consumed. */
+        m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
+
+        /* Add 0x80 terminator. */
+        m_buffer[m_buffered_bytes++] = 0x80;
+
+        /* If we can process the size field directly, do so, otherwise set up to process it. */
+        if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
+            /* Clear up to size field. */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
+        } else {
+            /* Consume full block */
+            std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
+            this->ProcessBlock(m_buffer);
+
+            /* Clear up to size field. */
+            std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
+        }
+
+        /* Store the size field. */
+        util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
+
+        /* Process the final block. */
+        this->ProcessBlock(m_buffer);
+    }
+
+}
--- a/libraries/libvapours/source/crypto/impl/crypto_sha3_impl.cpp
+++ b/libraries/libvapours/source/crypto/impl/crypto_sha3_impl.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <vapours.hpp>
+
+namespace ams::crypto::impl {
+
+    namespace {
+
+        constexpr auto NumRounds = 24;
+
+        constexpr const u64 IotaRoundConstant[NumRounds] = {
+            UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
+            UINT64_C(0x800000000000808A), UINT64_C(0x8000000080008000),
+            UINT64_C(0x000000000000808B), UINT64_C(0x0000000080000001),
+            UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
+            UINT64_C(0x000000000000008A), UINT64_C(0x0000000000000088),
+            UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000A),
+            UINT64_C(0x000000008000808B), UINT64_C(0x800000000000008B),
+            UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
+            UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
+            UINT64_C(0x000000000000800A), UINT64_C(0x800000008000000A),
+            UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
+            UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
+        };
+
+        constexpr const int RhoShiftBit[NumRounds] = {
+             1,  3,  6, 10, 15, 21, 28, 36,
+            45, 55,  2, 14, 27, 41, 56,  8,
+            25, 43, 62, 18, 39, 61, 20, 44
+        };
+
+        constexpr const int RhoNextIndex[NumRounds] = {
+            10,  7, 11, 17, 18,  3,  5, 16,
+             8, 21, 24,  4, 15, 23, 19, 13,
+            12,  2, 20, 14, 22,  9,  6,  1
+        };
+
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::Initialize() {
+        /* Clear internal state. */
+        std::memset(m_internal_state, 0, sizeof(m_internal_state));
+
+        /* Reset buffered bytes. */
+        m_buffered_bytes = 0;
+
+        /* Set state. */
+        m_state = State_Initialized;
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::Update(const void *data, size_t size) {
+        /* Verify we're in a state to update. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Process we have anything buffered. */
+        const u8 *data8 = static_cast<const u8 *>(data);
+        size_t remaining = size;
+        if (m_buffered_bytes > 0) {
+            /* Determine how much we can copy. */
+            const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
+
+            /* Mix the bytes into our state. */
+            u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state) + m_buffered_bytes;
+            for (size_t i = 0; i < copy_size; ++i) {
+                dst8[i] ^= data8[i];
+            }
+
+            /* Advance. */
+            data8            += copy_size;
+            remaining        -= copy_size;
+            m_buffered_bytes += copy_size;
+
+            /* Process a block, if we filled one. */
+            if (m_buffered_bytes == BlockSize) {
+                this->ProcessBlock();
+                m_buffered_bytes = 0;
+            }
+        }
+
+        /* Process blocks, if we have any. */
+        while (remaining >= BlockSize) {
+            /* Mix the bytes into our state. */
+            u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
+            for (size_t i = 0; i < BlockSize; ++i) {
+                dst8[i] ^= data8[i];
+            }
+
+            this->ProcessBlock();
+
+            data8     += BlockSize;
+            remaining -= BlockSize;
+        }
+
+        /* Copy any leftover data to our buffer. */
+        if (remaining > 0) {
+            u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
+            for (size_t i = 0; i < remaining; ++i) {
+                dst8[i] ^= data8[i];
+            }
+
+            m_buffered_bytes = remaining;
+        }
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::GetHash(void *dst, size_t size) {
+        /* Verify we're in a state to get hash. */
+        AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
+        AMS_ASSERT(size >= HashSize);
+        AMS_UNUSED(size);
+
+        /* If we need to, process the last block. */
+        if (m_state == State_Initialized) {
+            this->ProcessLastBlock();
+            m_state = State_Done;
+        }
+
+        /* Copy the output hash. */
+        std::memcpy(dst, m_internal_state, HashSize);
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::InitializeWithContext(const Sha3Context *context) {
+        /* Check the context is for the right hash size. */
+        AMS_ASSERT(context->hash_size == HashSize);
+
+        /* Set buffered bytes. */
+        m_buffered_bytes = context->buffered_bytes;
+
+        /* Copy state in from the context. */
+        std::memcpy(m_internal_state, context->internal_state, sizeof(m_internal_state));
+
+        /* Reset other fields. */
+        m_state = State_Initialized;
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::GetContext(Sha3Context *context) const {
+        /* Check our state. */
+        AMS_ASSERT(m_state == State_Initialized);
+
+        /* Set the output hash size. */
+        context->hash_size = HashSize;
+
+        /* Set buffered bytes. */
+        context->buffered_bytes = m_buffered_bytes;
+
+        /* Copy out the context. */
+        std::memcpy(context->internal_state, m_internal_state, sizeof(context->internal_state));
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::ProcessBlock() {
+        /* Ensure correct endianness. */
+        if constexpr (util::IsBigEndian()) {
+            for (size_t i = 0; i < util::size(m_internal_state); ++i) {
+                m_internal_state[i] = util::LoadLittleEndian<u64>(m_internal_state + i);
+            }
+        }
+
+        /* Perform all rounds. */
+        uint64_t tmp, C[5];
+        for (auto round = 0; round < NumRounds; ++round) {
+            /* Handle theta. */
+            for (size_t i = 0; i < 5; ++i) {
+                C[i] = m_internal_state[i] ^ m_internal_state[i + 5] ^ m_internal_state[i + 10] ^ m_internal_state[i + 15] ^ m_internal_state[i + 20];
+            }
+
+            for (size_t i = 0; i < 5; ++i) {
+                tmp = C[(i + 4) % 5] ^ util::RotateLeft<u64>(C[(i + 1) % 5], 1);
+                for (size_t j = 0; j < 5; ++j) {
+                    m_internal_state[5 * j + i] ^= tmp;
+                }
+            }
+
+            /* Handle rho/pi. */
+            tmp = m_internal_state[1];
+            for (size_t i = 0; i < NumRounds; ++i) {
+                const auto rho_next_idx = RhoNextIndex[i];
+                C[0] = m_internal_state[rho_next_idx];
+                m_internal_state[rho_next_idx] = util::RotateLeft<u64>(tmp, RhoShiftBit[i]);
+                tmp = C[0];
+            }
+
+            /* Handle chi. */
+            for (size_t i = 0; i < 5; ++i) {
+                for (size_t j = 0; j < 5; ++j) {
+                    C[j] = m_internal_state[5 * i + j];
+                }
+                for (size_t j = 0; j < 5; ++j) {
+                    m_internal_state[5 * i + j] ^= (~C[(j + 1) % 5]) & C[(j + 2) % 5];
+                }
+            }
+
+            /* Handle iota. */
+            m_internal_state[0] ^= IotaRoundConstant[round];
+        }
+        /* Ensure correct endianness. */
+        if constexpr (util::IsBigEndian()) {
+            for (size_t i = 0; i < util::size(m_internal_state); ++i) {
+                util::StoreLittleEndian<u64>(m_internal_state + i, m_internal_state[i]);
+            }
+        }
+    }
+
+    template<size_t HashSize>
+    void Sha3Impl<HashSize>::ProcessLastBlock() {
+        /* Mix final bits (011) into our state. */
+        reinterpret_cast<u8 *>(m_internal_state)[m_buffered_bytes] ^= 0b110;
+
+        /* Mix in the high bit of the last word in our block. */
+        constexpr u64 FinalMask = UINT64_C(0x8000000000000000);
+        m_internal_state[(BlockSize / sizeof(u64)) - 1] ^= FinalMask;
+
+        /* Process the last block. */
+        this->ProcessBlock();
+    }
+
+    /* Explicitly instantiate the supported hash sizes. */
+    template class Sha3Impl<224 / BITSIZEOF(u8)>;
+    template class Sha3Impl<256 / BITSIZEOF(u8)>;
+    template class Sha3Impl<384 / BITSIZEOF(u8)>;
+    template class Sha3Impl<512 / BITSIZEOF(u8)>;
+
+}