Revert "hoc-clk: add live vdd2, live boost clock and basic pwm dimming"

This reverts commit 15b7df8ef1.
This commit is contained in:
souldbminersmwc
2025-11-09 16:14:52 -05:00
parent 22ec140738
commit 21a3f953d7
3804 changed files with 435 additions and 570162 deletions

View File

@@ -1,501 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
#endif
namespace ams::crypto::impl {
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
namespace {
/* Helper macros to setup for inline AES asm */
#define AES_ENC_DEC_SETUP_VARS() \
const auto *ctx = reinterpret_cast<const RoundKeyHelper<KeySize> *>(m_round_keys); \
static_assert(sizeof(*ctx) == sizeof(m_round_keys)); \
\
uint8x16_t tmp = vld1q_u8((const uint8_t *)src); \
uint8x16_t tmp2
#define AES_ENC_DEC_OUTPUT_VARS() \
[tmp]"+w"(tmp), [tmp2]"=w"(tmp2)
#define AES_ENC_DEC_STORE_RESULT() \
vst1q_u8((uint8_t *)dst, tmp)
/* Helper macros to do AES encryption, via inline asm. */
#define AES_ENC_ROUND(n) \
"ldr %q[tmp2], %[round_key_" #n "]\n" \
"aese %[tmp].16b, %[tmp2].16b\n" \
"aesmc %[tmp].16b, %[tmp].16b\n"
#define AES_ENC_FINAL_ROUND() \
"ldr %q[tmp2], %[round_key_second_last]\n" \
"aese %[tmp].16b, %[tmp2].16b\n" \
"ldr %q[tmp2], %[round_key_last]\n" \
"eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
#define AES_ENC_INPUT_ROUND_KEY(num_rounds, n) \
[round_key_##n]"m"(ctx->round_keys[(n-1)])
#define AES_ENC_INPUT_LAST_ROUND_KEYS(num_rounds) \
[round_key_second_last]"m"(ctx->round_keys[(num_rounds - 1)]), \
[round_key_last]"m"(ctx->round_keys[(num_rounds)])
/* Helper macros to do AES decryption, via inline asm. */
#define AES_DEC_ROUND(n) \
"ldr %q[tmp2], %[round_key_" #n "]\n" \
"aesd %[tmp].16b, %[tmp2].16b\n" \
"aesimc %[tmp].16b, %[tmp].16b\n"
#define AES_DEC_FINAL_ROUND() \
"ldr %q[tmp2], %[round_key_second_last]\n" \
"aesd %[tmp].16b, %[tmp2].16b\n" \
"ldr %q[tmp2], %[round_key_last]\n" \
"eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
#define AES_DEC_INPUT_ROUND_KEY(num_rounds, n) \
[round_key_##n]"m"(ctx->round_keys[(num_rounds + 1 - n)])
#define AES_DEC_INPUT_LAST_ROUND_KEYS(num_rounds) \
[round_key_second_last]"m"(ctx->round_keys[1]), \
[round_key_last]"m"(ctx->round_keys[0])
constexpr const u8 RoundKeyRcon0[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
};
constexpr const u8 SubBytesTable[0x100] = {
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
};
constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
constexpr u32 SubBytesAndRotate(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
}
constexpr u32 SubBytes(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
}
template<size_t KeySize>
AesImpl<KeySize>::~AesImpl() {
ClearMemory(this, sizeof(*this));
}
template<size_t KeySize>
void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
/* Check pre-conditions. */
AMS_ASSERT(key_size == KeySize);
AMS_UNUSED(key_size);
/* Set up key. */
u32 *dst = m_round_keys;
std::memcpy(dst, key, KeySize);
/* Perform key scheduling. */
constexpr auto InitialKeyWords = KeySize / sizeof(u32);
u32 tmp = dst[InitialKeyWords - 1];
for (auto i = InitialKeyWords; i < (RoundCount + 1) * 4; ++i) {
const auto idx_in_key = i % InitialKeyWords;
if (idx_in_key == 0) {
/* At start of key word, we need to handle sub/rotate/rcon. */
tmp = SubBytesAndRotate(tmp);
tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
} else if ((InitialKeyWords > 6) && idx_in_key == 4) {
/* Halfway into a 256-bit key word, we need to do an additional subbytes. */
tmp = SubBytes(tmp);
}
/* Set the key word. */
tmp ^= dst[i - InitialKeyWords];
dst[i] = tmp;
}
/* If decrypting, perform inverse mix columns on all round keys. */
if (!is_encrypt) {
auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
for (auto i = 1; i < RoundCount; ++i) {
vst1q_u8(key8, vaesimcq_u8(vld1q_u8(key8)));
key8 += BlockSize;
}
}
}
template<size_t KeySize>
void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Get the key. */
const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys);
/* Read the block. */
uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
/* Encrypt block. */
for (auto round = 1; round < RoundCount; ++round) {
/* Do aes round. */
block = vaeseq_u8(block, vld1q_u8(key8));
key8 += BlockSize;
/* Do mix columns. */
block = vaesmcq_u8(block);
}
/* Do last aes round. */
block = vaeseq_u8(block, vld1q_u8(key8));
key8 += BlockSize;
/* Add the final round key. */
block = veorq_u8(block, vld1q_u8(key8));
/* Store the block. */
vst1q_u8(static_cast<u8 *>(dst), block);
}
template<size_t KeySize>
void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Get the key. */
const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount * BlockSize);
/* Read the block. */
uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
/* Encrypt block. */
for (auto round = RoundCount; round > 1; --round) {
/* Do aes round. */
block = vaesdq_u8(block, vld1q_u8(key8));
key8 -= BlockSize;
/* Do mix columns. */
block = vaesimcq_u8(block);
}
/* Do last aes round. */
block = vaesdq_u8(block, vld1q_u8(key8));
key8 -= BlockSize;
/* Add the first round key. */
block = veorq_u8(block, vld1q_u8(key8));
/* Store the block. */
vst1q_u8(static_cast<u8 *>(dst), block);
}
/* Specializations when building specifically for cortex-a57 (or for apple M* processors). */
#if defined(ATMOSPHERE_CPU_ARM_CORTEX_A57) || defined(ATMOSPHERE_OS_MACOS)
namespace {
template<size_t KeySize>
struct RoundKeyHelper {
u8 round_keys[AesImpl<KeySize>::RoundCount + 1][AesImpl<KeySize>::BlockSize];
};
}
template<>
void AesImpl<16>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<24>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_ROUND(10)
AES_ENC_ROUND(11)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<32>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_ROUND(10)
AES_ENC_ROUND(11)
AES_ENC_ROUND(12)
AES_ENC_ROUND(13)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 12),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 13),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<16>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<24>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_ROUND(10)
AES_DEC_ROUND(11)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<32>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_ROUND(10)
AES_DEC_ROUND(11)
AES_DEC_ROUND(12)
AES_DEC_ROUND(13)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 12),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 13),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
#endif
/* Explicitly instantiate the three supported key sizes. */
template class AesImpl<16>;
template class AesImpl<24>;
template class AesImpl<32>;
#else
/* NOTE: Exosphere defines this in libexosphere. */
/* TODO: Non-EL0 implementation. */
#endif
}

View File

@@ -1,435 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_aes_impl.arch.x64.hpp"
namespace ams::crypto::impl {
namespace {
constexpr bool IsSupportedKeySize(size_t size) {
return size == 16 || size == 24 || size == 32;
}
constexpr int WordsPerBlock = AesImpl<16>::BlockSize / sizeof(u32);
static_assert(AesImpl<16>::BlockSize == AesImpl<24>::BlockSize);
static_assert(AesImpl<16>::BlockSize == AesImpl<32>::BlockSize);
bool GetAesNiAvailabilityImpl() {
/* Call cpu id. */
int a = 0, b = 0, c = 0, d = 0;
__asm__ __volatile__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(1) : "memory");
/* Check for AES-NI and SSE2. */
return (c & (1 << 25)) && (d & (1 << 26));
}
static_assert(util::IsLittleEndian());
constexpr const u8 RoundKeyRcon0[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
};
constexpr const u8 SubBytesTable[0x100] = {
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
};
constexpr const u8 InvSubBytesTable[0x100] = {
0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D,
};
constexpr bool IsSubBytesTableValid() {
for (size_t i = 0; i < 0x100; ++i) {
if (SubBytesTable[InvSubBytesTable[i]] != i) {
return false;
}
if (InvSubBytesTable[SubBytesTable[i]] != i) {
return false;
}
}
return true;
}
static_assert(IsSubBytesTableValid());
constexpr const u32 EncryptTable[0x100] = {
0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C,
};
constexpr const u32 DecryptTable[0x100] = {
0x50A7F451, 0x5365417E, 0xC3A4171A, 0x965E273A, 0xCB6BAB3B, 0xF1459D1F, 0xAB58FAAC, 0x9303E34B,
0x55FA3020, 0xF66D76AD, 0x9176CC88, 0x254C02F5, 0xFCD7E54F, 0xD7CB2AC5, 0x80443526, 0x8FA362B5,
0x495AB1DE, 0x671BBA25, 0x980EEA45, 0xE1C0FE5D, 0x02752FC3, 0x12F04C81, 0xA397468D, 0xC6F9D36B,
0xE75F8F03, 0x959C9215, 0xEB7A6DBF, 0xDA595295, 0x2D83BED4, 0xD3217458, 0x2969E049, 0x44C8C98E,
0x6A89C275, 0x78798EF4, 0x6B3E5899, 0xDD71B927, 0xB64FE1BE, 0x17AD88F0, 0x66AC20C9, 0xB43ACE7D,
0x184ADF63, 0x82311AE5, 0x60335197, 0x457F5362, 0xE07764B1, 0x84AE6BBB, 0x1CA081FE, 0x942B08F9,
0x58684870, 0x19FD458F, 0x876CDE94, 0xB7F87B52, 0x23D373AB, 0xE2024B72, 0x578F1FE3, 0x2AAB5566,
0x0728EBB2, 0x03C2B52F, 0x9A7BC586, 0xA50837D3, 0xF2872830, 0xB2A5BF23, 0xBA6A0302, 0x5C8216ED,
0x2B1CCF8A, 0x92B479A7, 0xF0F207F3, 0xA1E2694E, 0xCDF4DA65, 0xD5BE0506, 0x1F6234D1, 0x8AFEA6C4,
0x9D532E34, 0xA055F3A2, 0x32E18A05, 0x75EBF6A4, 0x39EC830B, 0xAAEF6040, 0x069F715E, 0x51106EBD,
0xF98A213E, 0x3D06DD96, 0xAE053EDD, 0x46BDE64D, 0xB58D5491, 0x055DC471, 0x6FD40604, 0xFF155060,
0x24FB9819, 0x97E9BDD6, 0xCC434089, 0x779ED967, 0xBD42E8B0, 0x888B8907, 0x385B19E7, 0xDBEEC879,
0x470A7CA1, 0xE90F427C, 0xC91E84F8, 0x00000000, 0x83868009, 0x48ED2B32, 0xAC70111E, 0x4E725A6C,
0xFBFF0EFD, 0x5638850F, 0x1ED5AE3D, 0x27392D36, 0x64D90F0A, 0x21A65C68, 0xD1545B9B, 0x3A2E3624,
0xB1670A0C, 0x0FE75793, 0xD296EEB4, 0x9E919B1B, 0x4FC5C080, 0xA220DC61, 0x694B775A, 0x161A121C,
0x0ABA93E2, 0xE52AA0C0, 0x43E0223C, 0x1D171B12, 0x0B0D090E, 0xADC78BF2, 0xB9A8B62D, 0xC8A91E14,
0x8519F157, 0x4C0775AF, 0xBBDD99EE, 0xFD607FA3, 0x9F2601F7, 0xBCF5725C, 0xC53B6644, 0x347EFB5B,
0x7629438B, 0xDCC623CB, 0x68FCEDB6, 0x63F1E4B8, 0xCADC31D7, 0x10856342, 0x40229713, 0x2011C684,
0x7D244A85, 0xF83DBBD2, 0x1132F9AE, 0x6DA129C7, 0x4B2F9E1D, 0xF330B2DC, 0xEC52860D, 0xD0E3C177,
0x6C16B32B, 0x99B970A9, 0xFA489411, 0x2264E947, 0xC48CFCA8, 0x1A3FF0A0, 0xD82C7D56, 0xEF903322,
0xC74E4987, 0xC1D138D9, 0xFEA2CA8C, 0x360BD498, 0xCF81F5A6, 0x28DE7AA5, 0x268EB7DA, 0xA4BFAD3F,
0xE49D3A2C, 0x0D927850, 0x9BCC5F6A, 0x62467E54, 0xC2138DF6, 0xE8B8D890, 0x5EF7392E, 0xF5AFC382,
0xBE805D9F, 0x7C93D069, 0xA92DD56F, 0xB31225CF, 0x3B99ACC8, 0xA77D1810, 0x6E639CE8, 0x7BBB3BDB,
0x097826CD, 0xF418596E, 0x01B79AEC, 0xA89A4F83, 0x656E95E6, 0x7EE6FFAA, 0x08CFBC21, 0xE6E815EF,
0xD99BE7BA, 0xCE366F4A, 0xD4099FEA, 0xD67CB029, 0xAFB2A431, 0x31233F2A, 0x3094A5C6, 0xC066A235,
0x37BC4E74, 0xA6CA82FC, 0xB0D090E0, 0x15D8A733, 0x4A9804F1, 0xF7DAEC41, 0x0E50CD7F, 0x2FF69117,
0x8DD64D76, 0x4DB0EF43, 0x544DAACC, 0xDF0496E4, 0xE3B5D19E, 0x1B886A4C, 0xB81F2CC1, 0x7F516546,
0x04EA5E9D, 0x5D358C01, 0x737487FA, 0x2E410BFB, 0x5A1D67B3, 0x52D2DB92, 0x335610E9, 0x1347D66D,
0x8C61D79A, 0x7A0CA137, 0x8E14F859, 0x893C13EB, 0xEE27A9CE, 0x35C961B7, 0xEDE51CE1, 0x3CB1477A,
0x59DFD29C, 0x3F73F255, 0x79CE1418, 0xBF37C773, 0xEACDF753, 0x5BAAFD5F, 0x146F3DDF, 0x86DB4478,
0x81F3AFCA, 0x3EC468B9, 0x2C342438, 0x5F40A3C2, 0x72C31D16, 0x0C25E2BC, 0x8B493C28, 0x41950DFF,
0x7101A839, 0xDEB30C08, 0x9CE4B4D8, 0x90C15664, 0x6184CB7B, 0x70B632D5, 0x745C6C48, 0x4257B8D0,
};
constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
constexpr auto AesMixShift = 3 * BITSIZEOF(u8);
constexpr void InverseMixColumns(u32 *dst, const u32 *src) {
for (auto i = 0; i < WordsPerBlock; ++i) {
const u32 v0 = src[i];
const u32 v1 = (((v0 & 0x7F7F7F7Fu) << 1) ^ (((v0 & 0x80808080) >> 7) * 0x1B));
const u32 v2 = (((v1 & 0x7F7F7F7Fu) << 1) ^ (((v1 & 0x80808080) >> 7) * 0x1B));
const u32 v3 = (((v2 & 0x7F7F7F7Fu) << 1) ^ (((v2 & 0x80808080) >> 7) * 0x1B));
u32 v = v0 ^ v3;
v ^= util::RotateLeft(v, AesMixShift) ^ v2;
v ^= util::RotateLeft(v, AesMixShift) ^ v1;
v ^= util::RotateLeft(v, AesMixShift) ^ v0;
dst[i] = v;
}
}
constexpr u32 SubBytesAndRotate(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
}
constexpr u32 SubBytes(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
constexpr u32 ShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
return (util::RotateLeft(static_cast<u32>(EncryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
}
constexpr u32 ShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
return (static_cast<u32>(SubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
constexpr u32 InvShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
return (util::RotateLeft(static_cast<u32>(DecryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
}
constexpr u32 InvShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
return (static_cast<u32>(InvSubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(InvSubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(InvSubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(InvSubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
}
const bool g_is_aes_ni_available = GetAesNiAvailabilityImpl();
template<size_t KeySize>
AesImpl<KeySize>::~AesImpl() {
ClearMemory(this, sizeof(*this));
}
template<size_t KeySize>
void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
/* Check pre-conditions. */
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(key != nullptr);
AMS_ASSERT(key_size == KeySize);
AMS_UNUSED(key_size);
/* Set up key. */
u32 *dst = m_round_keys;
std::memcpy(dst, key, KeySize);
/* Perform key scheduling. */
constexpr auto InitialKeyWords = KeySize / sizeof(u32);
u32 tmp = dst[InitialKeyWords - 1];
for (auto i = InitialKeyWords; i < (RoundCount + 1) * 4; ++i) {
const auto idx_in_key = i % InitialKeyWords;
if (idx_in_key == 0) {
/* At start of key word, we need to handle sub/rotate/rcon. */
tmp = SubBytesAndRotate(tmp);
tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
} else if ((InitialKeyWords > 6) && idx_in_key == 4) {
/* Halfway into a 256-bit key word, we need to do an additional subbytes. */
tmp = SubBytes(tmp);
}
/* Set the key word. */
tmp ^= dst[i - InitialKeyWords];
dst[i] = tmp;
}
/* If decrypting, perform inverse mix columns on all round keys. */
if (!is_encrypt) {
if (IsAesNiAvailable()) {
auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
for (auto i = 1; i < RoundCount; ++i) {
auto * const key128 = reinterpret_cast<__m128i *>(key8);
_mm_storeu_si128(key128, _mm_aesimc_si128(_mm_loadu_si128(key128)));
key8 += BlockSize;
}
} else {
for (auto i = 1; i < RoundCount; ++i) {
InverseMixColumns(m_round_keys + WordsPerBlock * i, m_round_keys + WordsPerBlock * i);
}
}
}
}
template<size_t KeySize>
void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
AMS_UNUSED(dst_size, src_size);
/* Perform block encryption. */
if (IsAesNiAvailable()) {
const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys);
/* Load the block. */
auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
/* Add the first round key. */
block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 += BlockSize;
/* Perform aes round on remaining round keys. */
for (auto i = 1; i < RoundCount; ++i) {
block = _mm_aesenc_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 += BlockSize;
}
/* Do final update. */
block = _mm_aesenclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
/* Store the output. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
} else {
static_assert(WordsPerBlock == 4);
/* Without AES-NI, we'll operate on words. */
const u32 *key32 = m_round_keys;
const u32 *src32 = static_cast<const u32 *>(src);
u32 *dst32 = static_cast< u32 *>(dst);
/* Add the first round key. */
u32 v0 = src32[0] ^ key32[0];
u32 v1 = src32[1] ^ key32[1];
u32 v2 = src32[2] ^ key32[2];
u32 v3 = src32[3] ^ key32[3];
key32 += 4;
/* Perform each round. */
auto round = RoundCount;
while (--round > 0) {
/* Perform aes round. */
const u32 e0 = ShiftSubMix(v0, v1, v2, v3);
const u32 e1 = ShiftSubMix(v1, v2, v3, v0);
const u32 e2 = ShiftSubMix(v2, v3, v0, v1);
const u32 e3 = ShiftSubMix(v3, v0, v1, v2);
/* Add the round key. */
v0 = e0 ^ key32[0];
v1 = e1 ^ key32[1];
v2 = e2 ^ key32[2];
v3 = e3 ^ key32[3];
key32 += 4;
}
/* Perform the final round. */
dst32[0] = key32[0] ^ ShiftSub(v0, v1, v2, v3);
dst32[1] = key32[1] ^ ShiftSub(v1, v2, v3, v0);
dst32[2] = key32[2] ^ ShiftSub(v2, v3, v0, v1);
dst32[3] = key32[3] ^ ShiftSub(v3, v0, v1, v2);
}
}
template<size_t KeySize>
void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
AMS_UNUSED(dst_size, src_size);
/* Perform block decryption. */
if (IsAesNiAvailable()) {
const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount * BlockSize);
/* Load the block. */
auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
/* Add the final round key. */
block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 -= BlockSize;
/* Perform aes invround on remaining round keys. */
for (auto i = RoundCount; i > 1; --i) {
block = _mm_aesdec_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 -= BlockSize;
}
/* Do final update. */
block = _mm_aesdeclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
/* Store the output. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
} else {
static_assert(WordsPerBlock == 4);
/* Without AES-NI, we'll operate on words. */
const u32 *key32 = m_round_keys + WordsPerBlock * RoundCount;
const u32 *src32 = static_cast<const u32 *>(src);
u32 *dst32 = static_cast< u32 *>(dst);
/* Add the final round key. */
u32 v0 = src32[0] ^ key32[0];
u32 v1 = src32[1] ^ key32[1];
u32 v2 = src32[2] ^ key32[2];
u32 v3 = src32[3] ^ key32[3];
key32 -= 4;
/* Perform each round. */
auto round = RoundCount;
while (--round > 0) {
/* Perform aes inv round. */
const u32 e0 = InvShiftSubMix(v0, v3, v2, v1);
const u32 e1 = InvShiftSubMix(v1, v0, v3, v2);
const u32 e2 = InvShiftSubMix(v2, v1, v0, v3);
const u32 e3 = InvShiftSubMix(v3, v2, v1, v0);
/* Add the round key. */
v0 = e0 ^ key32[0];
v1 = e1 ^ key32[1];
v2 = e2 ^ key32[2];
v3 = e3 ^ key32[3];
key32 -= 4;
}
/* Perform the final round. */
dst32[0] = key32[0] ^ InvShiftSub(v0, v3, v2, v1);
dst32[1] = key32[1] ^ InvShiftSub(v1, v0, v3, v2);
dst32[2] = key32[2] ^ InvShiftSub(v2, v1, v0, v3);
dst32[3] = key32[3] ^ InvShiftSub(v3, v2, v1, v0);
}
}
/* Explicitly instantiate the three supported key sizes. */
template class AesImpl<16>;
template class AesImpl<24>;
template class AesImpl<32>;
}

View File

@@ -1,28 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vapours.hpp>
#include <x86intrin.h>
namespace ams::crypto::impl {
extern const bool g_is_aes_ni_available;
ALWAYS_INLINE bool IsAesNiAvailable() {
return g_is_aes_ni_available;
}
}

View File

@@ -1,148 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
void BigNum::ImportImpl(Word *out, size_t out_size, const u8 *src, size_t src_size) {
size_t octet_ofs = src_size;
size_t word_ofs = 0;
/* Parse octets into words. */
while (word_ofs < out_size && octet_ofs > 0) {
Word w = 0;
for (size_t shift = 0; octet_ofs > 0 && shift < BITSIZEOF(Word); shift += BITSIZEOF(u8)) {
w |= static_cast<Word>(src[--octet_ofs]) << shift;
}
out[word_ofs++] = w;
}
/* Zero-fill upper words. */
while (word_ofs < out_size) {
out[word_ofs++] = 0;
}
}
void BigNum::ExportImpl(u8 *out, size_t out_size, const Word *src, size_t src_size) {
size_t octet_ofs = out_size;
/* Parse words into octets. */
for (size_t word_ofs = 0; word_ofs < src_size && octet_ofs > 0; word_ofs++) {
const Word w = src[word_ofs];
for (size_t shift = 0; octet_ofs > 0 && shift < BITSIZEOF(Word); shift += BITSIZEOF(u8)) {
out[--octet_ofs] = static_cast<u8>(w >> shift);
}
}
/* Zero-clear remaining octets. */
while (octet_ofs > 0) {
out[--octet_ofs] = 0;
}
}
size_t BigNum::GetSize() const {
if (m_num_words == 0) {
return 0;
}
static_assert(sizeof(Word) == 4);
size_t size = m_num_words * sizeof(Word);
const Word last = m_words[m_num_words - 1];
AMS_ASSERT(last != 0);
if (last >= 0x01000000u) {
return size - 0;
} else if (last >= 0x00010000u) {
return size - 1;
} else if (last >= 0x00000100u) {
return size - 2;
} else {
return size - 3;
}
}
bool BigNum::Import(const void *src, size_t src_size) {
AMS_ASSERT((src != nullptr) || (src_size != 0));
/* Ignore leading zeroes. */
const u8 *data = static_cast<const u8 *>(src);
while (src_size > 0 && *data == 0) {
++data;
--src_size;
}
/* Ensure we have space for the number. */
AMS_ASSERT(src_size <= m_max_words * sizeof(Word));
if (AMS_UNLIKELY(!(src_size <= m_max_words * sizeof(Word)))) {
return false;
}
/* Import. */
m_num_words = util::AlignUp(src_size, sizeof(Word)) / sizeof(Word);
ImportImpl(m_words, m_max_words, data, src_size);
return true;
}
void BigNum::Export(void *dst, size_t dst_size) {
AMS_ASSERT(dst_size >= this->GetSize());
ExportImpl(static_cast<u8 *>(dst), dst_size, m_words, m_num_words);
}
bool BigNum::ExpMod(void *dst, const void *src, size_t size, const BigNum &exp, u32 *work_buf, size_t work_buf_size) const {
/* Can't exponentiate with or about zero. */
if (this->IsZero() || exp.IsZero()) {
return false;
}
AMS_ASSERT(size == this->GetSize());
/* Create an allocator. */
WordAllocator allocator(work_buf, work_buf_size / sizeof(Word));
ON_SCOPE_EXIT { ClearMemory(work_buf, allocator.GetMaxUsedSize()); };
/* Create a BigNum for the signature. */
BigNum signature;
auto signature_words = allocator.Allocate(size / sizeof(Word));
if (!signature_words.IsValid()) {
return false;
}
/* Import data for the signature. */
signature.ReserveStatic(signature_words.GetBuffer(), signature_words.GetCount());
if (!signature.Import(src, size)) {
return false;
}
/* Perform the exponentiation. */
if (!ExpMod(signature.m_words, signature.m_words, exp.m_words, exp.m_num_words, m_words, m_num_words, std::addressof(allocator))) {
return false;
}
/* We succeeded, so export. */
signature.UpdateCount();
signature.Export(dst, size);
return true;
}
void BigNum::ClearToZero() {
std::memset(m_words, 0, m_num_words * sizeof(Word));
}
void BigNum::UpdateCount() {
m_num_words = CountWords(m_words, m_max_words);
}
}

View File

@@ -1,490 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
constexpr ALWAYS_INLINE BigNum::Word GetTop2Bits(BigNum::Word w) {
return (w >> (BigNum::BitsPerWord - 2)) & 0x3u;
}
constexpr ALWAYS_INLINE void MultWord(BigNum::Word *dst, BigNum::Word lhs, BigNum::Word rhs) {
static_assert(sizeof(BigNum::DoubleWord) == sizeof(BigNum::Word) * 2);
BigNum::DoubleWord result = static_cast<BigNum::DoubleWord>(lhs) * static_cast<BigNum::DoubleWord>(rhs);
dst[0] = static_cast<BigNum::Word>(result & ~BigNum::Word());
dst[1] = static_cast<BigNum::Word>(result >> BITSIZEOF(BigNum::Word));
}
constexpr ALWAYS_INLINE BigNum::HalfWord GetUpperHalf(BigNum::Word word) {
static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
return static_cast<BigNum::HalfWord>((word >> BITSIZEOF(BigNum::HalfWord)) & ~BigNum::HalfWord());
}
constexpr ALWAYS_INLINE BigNum::HalfWord GetLowerHalf(BigNum::Word word) {
static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
return static_cast<BigNum::HalfWord>(word & ~BigNum::HalfWord());
}
constexpr ALWAYS_INLINE BigNum::Word ToUpperHalf(BigNum::HalfWord half) {
static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
return static_cast<BigNum::Word>(half) << BITSIZEOF(BigNum::HalfWord);
}
[[maybe_unused]] constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
return static_cast<BigNum::Word>(half);
}
constexpr ALWAYS_INLINE BigNum::Word DivWord(const BigNum::Word *w, BigNum::Word div) {
using Word = BigNum::Word;
using HalfWord = BigNum::HalfWord;
Word work[2] = { w[0], w[1] };
HalfWord r_hi = 0, r_lo = 0;
HalfWord d_hi = GetUpperHalf(div);
HalfWord d_lo = GetLowerHalf(div);
if (d_hi == BigNum::MaxHalfWord) {
r_hi = GetUpperHalf(work[1]);
} else {
r_hi = GetLowerHalf(work[1] / (d_hi + 1));
}
{
const Word hh = static_cast<Word>(r_hi) * static_cast<Word>(d_hi);
const Word hl = static_cast<Word>(r_hi) * static_cast<Word>(d_lo);
const Word uhl = ToUpperHalf(static_cast<HalfWord>(hl));
if ((work[0] -= uhl) > (BigNum::MaxWord - uhl)) {
work[1]--;
}
work[1] -= GetUpperHalf(hl);
work[1] -= hh;
const Word udl = ToUpperHalf(d_lo);
while (work[1] > d_hi || (work[1] == d_hi && work[0] >= udl)) {
if ((work[0] -= udl) > (BigNum::MaxWord - udl)) {
work[1]--;
}
work[1] -= d_hi;
r_hi++;
}
}
if (d_hi == BigNum::MaxHalfWord) {
r_lo = GetLowerHalf(work[1]);
} else {
r_lo = GetLowerHalf((ToUpperHalf(static_cast<HalfWord>(work[1])) + GetUpperHalf(work[0])) / (d_hi + 1));
}
{
const Word ll = static_cast<Word>(r_lo) * static_cast<Word>(d_lo);
const Word lh = static_cast<Word>(r_lo) * static_cast<Word>(d_hi);
if ((work[0] -= ll) > (BigNum::MaxWord - ll)) {
work[1]--;
}
const Word ulh = ToUpperHalf(static_cast<HalfWord>(lh));
if ((work[0] -= ulh) > (BigNum::MaxWord - ulh)) {
work[1]--;
}
work[1] -= GetUpperHalf(lh);
while ((work[1] > 0) || (work[1] == 0 && work[0] >= div)) {
if ((work[0] -= div) > (BigNum::MaxWord - div)) {
work[1]--;
}
r_lo++;
}
}
return ToUpperHalf(r_hi) + r_lo;
}
}
bool BigNum::IsZero(const Word *w, size_t num_words) {
for (size_t i = 0; i < num_words; i++) {
if (w[i]) {
return false;
}
}
return true;
}
int BigNum::Compare(const Word *lhs, const Word *rhs, size_t num_words) {
for (s32 i = static_cast<s32>(num_words) - 1; i >= 0; i--) {
if (lhs[i] > rhs[i]) {
return 1;
} else if (lhs[i] < rhs[i]) {
return -1;
}
}
return 0;
}
size_t BigNum::CountWords(const Word *w, size_t num_words) {
s32 i = static_cast<s32>(num_words) - 1;
while (i >= 0 && !w[i]) {
i--;
}
return i + 1;
}
size_t BigNum::CountSignificantBits(Word w) {
size_t i;
for (i = 0; i < BitsPerWord && w != 0; i++) {
w >>= 1;
}
return i;
}
void BigNum::ClearToZero(Word *w, size_t num_words) {
for (size_t i = 0; i < num_words; i++) {
w[i] = 0;
}
}
void BigNum::SetToWord(Word *w, size_t num_words, Word v) {
ClearToZero(w, num_words);
w[0] = v;
}
void BigNum::Copy(Word *dst, const Word *src, size_t num_words) {
for (size_t i = 0; i < num_words; i++) {
dst[i] = src[i];
}
}
BigNum::Word BigNum::LeftShift(Word *dst, const Word *w, size_t num_words, const size_t shift) {
if (shift >= BitsPerWord) {
return 0;
}
const size_t invshift = BitsPerWord - shift;
Word carry = 0;
for (size_t i = 0; i < num_words; i++) {
const Word cur = w[i];
dst[i] = (cur << shift) | carry;
carry = shift ? (cur >> invshift) : 0;
}
return carry;
}
BigNum::Word BigNum::RightShift(Word *dst, const Word *w, size_t num_words, const size_t shift) {
if (shift >= BitsPerWord) {
return 0;
}
const size_t invshift = BitsPerWord - shift;
Word carry = 0;
for (s32 i = static_cast<s32>(num_words) - 1; i >= 0; i--) {
const Word cur = w[i];
dst[i] = (cur >> shift) | carry;
carry = shift ? (cur << invshift) : 0;
}
return carry;
}
BigNum::Word BigNum::MultSub(Word *dst, const Word *w, const Word *v, size_t num_words, Word mult) {
/* If multiplying by zero, nothing to do. */
if (mult == 0) {
return 0;
}
Word borrow = 0, work[2];
for (size_t i = 0; i < num_words; i++) {
/* Multiply, calculate borrow for next. */
MultWord(work, mult, v[i]);
if ((dst[i] = (w[i] - borrow)) > (MaxWord - borrow)) {
borrow = 1;
} else {
borrow = 0;
}
if ((dst[i] -= work[0]) > (MaxWord - work[0])) {
borrow++;
}
borrow += work[1];
}
return borrow;
}
bool BigNum::ExpMod(Word *dst, const Word *src, const Word *exp, size_t exp_words, const Word *mod, size_t mod_words, WordAllocator *allocator) {
/* Nintendo uses an algorithm that relies on powers of exp. */
bool needs_exp[4] = {};
if (exp_words > 1) {
needs_exp[2] = true;
needs_exp[3] = true;
} else {
Word exp_w = exp[0];
for (size_t i = 0; i < BitsPerWord / 2; i++) {
/* Nintendo at each step determines needed exponent from a pair of two bits. */
needs_exp[exp_w & 0x3u] = true;
exp_w >>= 2;
}
if (needs_exp[3]) {
needs_exp[2] = true;
}
}
/* Allocate space for powers 1, 2, 3. */
auto power_1 = allocator->Allocate(mod_words);
auto power_2 = allocator->Allocate(mod_words);
auto power_3 = allocator->Allocate(mod_words);
if (!(power_1.IsValid() && power_2.IsValid() && power_3.IsValid())) {
return false;
}
decltype(power_1)* powers[3] = { std::addressof(power_1), std::addressof(power_2), std::addressof(power_3) };
/* Set the powers of src. */
Copy(power_1.GetBuffer(), src, mod_words);
if (needs_exp[2]) {
if (!MultMod(power_2.GetBuffer(), power_1.GetBuffer(), src, mod, mod_words, allocator)) {
return false;
}
}
if (needs_exp[3]) {
if (!MultMod(power_3.GetBuffer(), power_2.GetBuffer(), src, mod, mod_words, allocator)) {
return false;
}
}
/* Allocate space to work. */
auto work = allocator->Allocate(mod_words);
if (!work.IsValid()) {
return false;
}
SetToWord(work.GetBuffer(), work.GetCount(), 1);
/* Ensure we're working with the correct exponent word count. */
exp_words = CountWords(exp, exp_words);
for (s32 i = static_cast<s32>(exp_words - 1); i >= 0; i--) {
Word cur_word = exp[i];
size_t cur_bits = BitsPerWord;
/* Remove leading zeroes in first word. */
if (i == static_cast<s32>(exp_words - 1)) {
while (!GetTop2Bits(cur_word)) {
cur_word <<= 2;
cur_bits -= 2;
}
}
/* Compute current modular multiplicative step. */
for (size_t j = 0; j < cur_bits; j += 2, cur_word <<= 2) {
/* Exponentiate current work to the 4th power. */
if (!MultMod(work.GetBuffer(), work.GetBuffer(), work.GetBuffer(), mod, mod_words, allocator)) {
return false;
}
if (!MultMod(work.GetBuffer(), work.GetBuffer(), work.GetBuffer(), mod, mod_words, allocator)) {
return false;
}
if (const Word top = GetTop2Bits(cur_word)) {
if (!MultMod(work.GetBuffer(), work.GetBuffer(), powers[top - 1]->GetBuffer(), mod, mod_words, allocator)) {
return false;
}
}
}
}
/* Copy work to output. */
Copy(dst, work.GetBuffer(), mod_words);
return true;
}
bool BigNum::MultMod(Word *dst, const Word *src, const Word *mult, const Word *mod, size_t num_words, WordAllocator *allocator) {
/* Allocate work. */
auto work = allocator->Allocate(2 * num_words);
if (!work.IsValid()) {
return false;
}
/* Multiply. */
if (!Mult(work.GetBuffer(), src, mult, num_words, allocator)) {
return false;
}
/* Mod. */
if (!Mod(dst, work.GetBuffer(), 2 * num_words, mod, num_words, allocator)) {
return false;
}
return true;
}
bool BigNum::Mod(Word *dst, const Word *src, size_t src_words, const Word *mod, size_t mod_words, WordAllocator *allocator) {
/* Allocate work. */
auto work = allocator->Allocate(src_words);
if (!work.IsValid()) {
return false;
}
if (!DivMod(work.GetBuffer(), dst, src, src_words, mod, mod_words, allocator)) {
return false;
}
return true;
}
bool BigNum::DivMod(Word *quot, Word *rem, const Word *top, size_t top_words, const Word *bot, size_t bot_words, WordAllocator *allocator) {
/* Allocate work. */
auto top_work = allocator->Allocate(top_words + 1);
auto bot_work = allocator->Allocate(bot_words);
if (!(top_work.IsValid() && bot_work.IsValid())) {
return false;
}
/* Prevent division by zero. */
size_t bot_work_words = CountWords(bot, bot_words);
if (bot_work_words == 0) {
return false;
}
ClearToZero(quot, top_words);
ClearToZero(top_work.GetBuffer(), bot_work_words);
/* Align to edges. */
const size_t shift = BitsPerWord - CountSignificantBits(bot[bot_work_words - 1]);
top_work.GetBuffer()[top_words] = LeftShift(top_work.GetBuffer(), top, top_words, shift);
LeftShift(bot_work.GetBuffer(), bot, bot_work_words, shift);
const Word tb = bot_work.GetBuffer()[bot_work_words - 1];
/* Repeatedly div + sub. */
for (s32 i = (top_words - bot_work_words); i >= 0; i--) {
Word cur_word;
if (tb == MaxWord) {
cur_word = top_work.GetBuffer()[i + bot_work_words];
} else {
cur_word = DivWord(top_work.GetBuffer() + i + bot_work_words - 1, tb + 1);
}
top_work.GetBuffer()[i + bot_work_words] -= MultSub(top_work.GetBuffer() + i, top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words, cur_word);
while (top_work.GetBuffer()[i + bot_work_words] || Compare(top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words) >= 0) {
cur_word++;
top_work.GetBuffer()[i + bot_work_words] -= Sub(top_work.GetBuffer() + i, top_work.GetBuffer() + i, bot_work.GetBuffer(), bot_work_words);
}
quot[i] = cur_word;
}
/* Calculate remainder. */
ClearToZero(rem, bot_words);
RightShift(rem, top_work.GetBuffer(), bot_work_words, shift);
return true;
}
bool BigNum::Mult(Word *dst, const Word *lhs, const Word *rhs, size_t num_words, WordAllocator *allocator) {
/* Allocate work. */
auto work = allocator->Allocate(2 * num_words);
if (!work.IsValid()) {
return false;
}
ClearToZero(work.GetBuffer(), work.GetCount());
/* Repeatedly add and multiply. */
const size_t lhs_words = CountWords(lhs, num_words);
const size_t rhs_words = CountWords(rhs, num_words);
for (size_t i = 0; i < lhs_words; i++) {
work.GetBuffer()[i + rhs_words] += MultAdd(work.GetBuffer() + i, rhs, rhs_words, lhs[i]);
}
/* Copy to output. */
Copy(dst, work.GetBuffer(), work.GetCount());
return true;
}
#if !defined(ATMOSPHERE_ARCH_ARM64)
BigNum::Word BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
Word carry = 0;
for (size_t i = 0; i < num_words; ++i) {
Word v;
if ((v = lhs[i] + carry) < carry) {
v = rhs[i];
} else if ((v += rhs[i]) < rhs[i]) {
carry = 1;
} else {
carry = 0;
}
dst[i] = v;
}
return carry;
}
BigNum::Word BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
Word borrow = 0;
for (size_t i = 0; i < num_words; ++i) {
Word v;
if ((v = lhs[i] - borrow) > (BigNum::MaxWord - borrow)) {
v = BigNum::MaxWord - rhs[i];
} else if ((v -= rhs[i]) > (BigNum::MaxWord - rhs[i])) {
borrow = 1;
} else {
borrow = 0;
}
dst[i] = v;
}
return borrow;
}
BigNum::Word BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) {
/* If multiplying by zero, nothing to do. */
if (mult == 0) {
return 0;
}
Word carry = 0, work[2];
for (size_t i = 0; i < num_words; i++) {
/* Multiply, calculate carry for next. */
MultWord(work, mult, w[i]);
if ((dst[i] += carry) < carry) {
carry = 1;
} else {
carry = 0;
}
if ((dst[i] += work[0]) < work[0]) {
carry++;
}
carry += work[1];
}
return carry;
}
#endif
}

View File

@@ -1,294 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* ams::crypto::impl::BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
.type _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum3AddEPjPKjPKjm:
/* Check if we have anything to do at all. */
msr nzcv, xzr
cbz x3, 7f
/* Save registers. */
stp x16, x17, [sp, #-16]!
stp xzr, x19, [sp, #-16]!
stp x20, x21, [sp, #-16]!
/* Check if we have less than 16 words to process. */
lsr x20, x3, #4
cbz x20, 2f
sub x3, x3, x20, lsl #4
1: /* Process 16 words at a time. */
/* NOTE: Nintendo uses X18 here, we will use X21 for EL1+ compat. */
ldp x4, x5, [x1], #16
ldp x12, x13, [x2], #16
ldp x6, x7, [x1], #16
ldp x14, x15, [x2], #16
ldp x8, x9, [x1], #16
ldp x16, x17, [x2], #16
ldp x10, x11, [x1], #16
ldp x21, x19, [x2], #16
adcs x4, x4, x12
adcs x5, x5, x13
stp x4, x5, [x0], #16
adcs x6, x6, x14
adcs x7, x7, x15
stp x6, x7, [x0], #16
adcs x8, x8, x16
adcs x9, x9, x17
stp x8, x9, [x0], #16
adcs x10, x10, x21
adcs x11, x11, x19
stp x10, x11, [x0], #16
sub x20, x20, #1
cbnz x20, 1b
2: /* We have less than 16 words to process. */
lsr x15, x3, #2
cbz x15, 4f
sub x3, x3, x15, lsl #2
3: /* Process 4 words at a time. */
ldp x4, x5, [x1], #16
ldp x8, x9, [x2], #16
sub x15, x15, #1
adcs x4, x4, x8
adcs x5, x5, x9
stp x4, x5, [x0], #16
cbnz x15, 3b
4: /* We have less than 4 words to process. */
cbz x3, 6f
5: /* Process 1 word at a time. */
ldr w4, [x1], #4
ldr w8, [x2], #4
adcs w4, w4, w8
str w4, [x0], #4
sub x3, x3, #1
cbnz x3, 5b
6: /* Restore registers we used while adding. */
ldp x20, x21, [sp], #16
ldp xzr, x19, [sp], #16
ldp x16, x17, [sp], #16
7: /* We're done. */
adc x0, xzr, xzr
ret
/* ams::crypto::impl::BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
.type _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
/* Check if we have anything to do at all. */
mov x4, #0x20000000
msr nzcv, x4
cbz x3, 7f
/* Save registers. */
stp x16, x17, [sp, #-16]!
stp xzr, x19, [sp, #-16]!
stp x20, x21, [sp, #-16]!
/* Check if we have less than 16 words to process. */
lsr x20, x3, #4
cbz x20, 2f
sub x3, x3, x20, lsl #4
1: /* Process 16 words at a time. */
/* NOTE: Nintendo uses X18 here, we will use X21 for EL1+ compat. */
ldp x4, x5, [x1], #16
ldp x12, x13, [x2], #16
ldp x6, x7, [x1], #16
ldp x14, x15, [x2], #16
ldp x8, x9, [x1], #16
ldp x16, x17, [x2], #16
ldp x10, x11, [x1], #16
ldp x21, x19, [x2], #16
sbcs x4, x4, x12
sbcs x5, x5, x13
stp x4, x5, [x0], #16
sbcs x6, x6, x14
sbcs x7, x7, x15
stp x6, x7, [x0], #16
sbcs x8, x8, x16
sbcs x9, x9, x17
stp x8, x9, [x0], #16
sbcs x10, x10, x21
sbcs x11, x11, x19
stp x10, x11, [x0], #16
sub x20, x20, #1
cbnz x20, 1b
2: /* We have less than 16 words to process. */
lsr x15, x3, #2
cbz x15, 4f
sub x3, x3, x15, lsl #2
3: /* Process 4 words at a time. */
ldp x4, x5, [x1], #16
ldp x8, x9, [x2], #16
sub x15, x15, #1
sbcs x4, x4, x8
sbcs x5, x5, x9
stp x4, x5, [x0], #16
cbnz x15, 3b
4: /* We have less than 4 words to process. */
cbz x3, 6f
5: /* Process 1 word at a time. */
ldr w4, [x1], #4
ldr w8, [x2], #4
sbcs w4, w4, w8
str w4, [x0], #4
sub x3, x3, #1
cbnz x3, 5b
6: /* Restore registers we used while adding. */
ldp x20, x21, [sp], #16
ldp xzr, x19, [sp], #16
ldp x16, x17, [sp], #16
7: /* We're done. */
cinc x0, xzr, cc
ret
/* ams::crypto::impl::BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
.type _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj:
/* Check if we have anything to do at all. */
mov x15, xzr
cbz x2, 5f
/* Check if we have less than four words to process. */
lsr x6, x2, #2
cbz x6, 2f
/* We have more than four words to process. */
sub x2, x2, x6, lsl #2
stp x16, x17, [sp, #-16]!
1: /* Loop processing four words at a time. */
ldp w4, w5, [x1], #8
ldp w16, w7, [x1], #8
ldp w8, w9, [x0]
ldp w10, w11, [x0, #8]
umaddl x4, w3, w4, x8
umaddl x5, w3, w5, x9
umaddl x16, w3, w16, x10
umaddl x7, w3, w7, x11
add x12, x4, x15, lsr #32
add x13, x5, x12, lsr #32
stp w12, w13, [x0], #8
add x14, x16, x13, lsr #32
add x15, x7, x14, lsr #32
stp w14, w15, [x0], #8
sub x6, x6, #1
cbnz x6, 1b
ldp x16, x17, [sp], #16
2: /* We have less than four words. Check if we have less than two. */
lsr x6, x2, #1
cbz x6, 4f
/* We have more than two words to process. */
sub x2, x2, x6, lsl #1
3: /* Loop processing two words at a time. */
ldp w4, w5, [x1], #8
ldp w8, w9, [x0]
umaddl x4, w3, w4, x8
umaddl x5, w3, w5, x9
sub x6, x6, #1
add x14, x4, x15, lsr #32
add x15, x5, x14, lsr #32
stp w14, w15, [x0], #8
cbnz x6, 3b
4: /* We have less than two words to process. */
cbz x2, 5f
/* We have one word to process. */
ldr w4, [x1], #4
ldr w8, [x0]
umaddl x4, w3, w4, x8
add x15, x4, x15, lsr #32
str w15, [x0], #4
5: /* We're done. */
lsr x0, x15, #32
ret

View File

@@ -1,61 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_update_impl.hpp"
namespace ams::crypto::impl {
void CbcMacImpl::UpdateGeneric(const void *data, size_t size) {
/* Check pre-conditions. */
AMS_ASSERT(m_state == State_Initialized);
/* Update. */
UpdateImpl<void>(this, data, size);
}
void CbcMacImpl::ProcessBlocksGeneric(const void *data, size_t num_blocks) {
/* If we have a block remaining, process it. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
/* Process blocks. */
const u8 *data8 = static_cast<const u8 *>(data);
u8 block[BlockSize];
while ((--num_blocks) > 0) {
for (size_t i = 0; i < BlockSize; ++i) {
block[i] = data8[i] ^ m_mac[i];
}
m_cipher_function(m_mac, block, m_cipher_context);
data8 += BlockSize;
}
/* Process the last block. */
std::memcpy(m_buffer, data8, BlockSize);
m_buffered_bytes = BlockSize;
}
template<>
void CbcMacImpl::Update<AesEncryptor128>(const void *data, size_t size) {
this->UpdateGeneric(data, size);
}
}

View File

@@ -1,81 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
void CbcMacImpl::ProcessBlock(const void *data) {
/* Procses the block. */
const u8 *data8 = static_cast<const u8 *>(data);
u8 block[BlockSize];
for (size_t i = 0; i < BlockSize; ++i) {
block[i] = data8[i] ^ m_mac[i];
}
m_cipher_function(m_mac, block, m_cipher_context);
}
void CbcMacImpl::ProcessPartialData(const void *data, size_t size) {
/* Copy in the data. */
std::memcpy(m_buffer + m_buffered_bytes, data, size);
m_buffered_bytes += size;
}
void CbcMacImpl::ProcessRemainingData(const void *data, size_t size) {
/* If we have a block remaining, process it. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
/* Copy the remaining data. */
std::memcpy(m_buffer, data, size);
m_buffered_bytes = size;
}
void CbcMacImpl::GetMac(void *mac, size_t mac_size) {
/* Check pre-conditions. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(mac_size >= BlockSize);
AMS_UNUSED(mac_size);
/* Ensure we're done. */
if (m_state == State_Initialized) {
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
m_state = State_Done;
}
/* Copy out the mac. */
std::memcpy(mac, m_mac, sizeof(m_mac));
}
void CbcMacImpl::MaskBufferedData(const void *data, size_t size) {
/* Check pre-conditions. */
AMS_ASSERT(m_buffered_bytes == BlockSize);
AMS_ASSERT(size == BlockSize);
AMS_UNUSED(size);
/* Mask the data. */
for (size_t i = 0; i < BlockSize; ++i) {
m_buffer[i] ^= static_cast<const u8 *>(data)[i];
}
}
}

View File

@@ -1,591 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#ifdef ATMOSPHERE_IS_STRATOSPHERE
#include <arm_neon.h>
namespace ams::crypto::impl {
/* Variable management macros. */
#define DECLARE_ROUND_KEY_VAR(n) \
const uint8x16_t round_key_##n = vld1q_u8(keys + (BlockSize * n))
#define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \
[tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2)
#define AES_ENC_DEC_OUTPUT_THREE_CTRS() \
[ctr0]"+w"(ctr0), [ctr1]"+w"(ctr1), [ctr2]"+w"(ctr2)
#define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \
[tmp0]"+w"(tmp0)
#define AES_ENC_DEC_OUTPUT_ONE_CTR() \
[ctr0]"+w"(ctr0)
#define CTR_INCREMENT_OUTPUT_HIGH_LOW() \
[high]"=&r"(high), [low]"=&r"(low)
#define CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() \
[high_tmp]"=&r"(high_tmp), [low_tmp]"=&r"(low_tmp)
#define CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() \
[hl_tmp]"=&r"(hl_tmp)
#define AES_ENC_DEC_INPUT_ROUND_KEY(n) \
[round_key_##n]"w"(round_key_##n)
/* AES Encryption macros. */
#define AES_ENC_ROUND(n, i) \
"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \
"aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n"
#define AES_ENC_SECOND_LAST_ROUND(n, i) \
"aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
#define AES_ENC_LAST_ROUND(n, i) \
"eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n"
namespace {
ALWAYS_INLINE uint8x16_t IncrementCounterOptimized(const uint8x16_t ctr) {
uint8x16_t inc;
uint64_t high, low;
/* Use ASM. TODO: Better than using intrinsics? */
__asm__ __volatile__ (
"mov %[high], %[ctr].d[0]\n"
"mov %[low], %[ctr].d[1]\n"
"rev %[high], %[high]\n"
"rev %[low], %[low]\n"
"adds %[low], %[low], 1\n"
"cinc %[high], %[high], cs\n"
"rev %[high], %[high]\n"
"rev %[low], %[low]\n"
"mov %[inc].d[0], %[high]\n"
"mov %[inc].d[1], %[low]\n"
: [inc]"=w"(inc),
CTR_INCREMENT_OUTPUT_HIGH_LOW()
: [ctr]"w"(ctr)
: "cc"
);
return inc;
}
}
template<>
void CtrModeImpl<AesEncryptor128>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
const u8 *keys = m_block_cipher->GetRoundKey();
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
uint8x16_t ctr0 = vld1q_u8(m_counter);
uint64_t high, low;
/* Process three blocks at a time, when possible. */
if (num_blocks >= 3) {
/* Increment CTR twice. */
uint8x16_t ctr1 = IncrementCounterOptimized(ctr0);
uint8x16_t ctr2 = IncrementCounterOptimized(ctr1);
uint64_t high_tmp, low_tmp;
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor128::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AesEncryptor128::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AesEncryptor128::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n"
AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n"
AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n"
AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n"
AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n"
AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n"
AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n"
AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n"
AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n"
AES_ENC_ROUND(7, 1)
AES_ENC_ROUND(7, 2)
AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2)
AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2)
: AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
AES_ENC_DEC_OUTPUT_THREE_CTRS(),
CTR_INCREMENT_OUTPUT_HIGH_LOW(),
CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
tmp1 = veorq_u8(block1, tmp1);
tmp2 = veorq_u8(block2, tmp2);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor128::BlockSize;
vst1q_u8(dst, tmp1);
dst += AesEncryptor128::BlockSize;
vst1q_u8(dst, tmp2);
dst += AesEncryptor128::BlockSize;
num_blocks -= 3;
}
}
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor128::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n"
AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n"
AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n"
AES_ENC_SECOND_LAST_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n"
AES_ENC_LAST_ROUND(10, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
AES_ENC_DEC_OUTPUT_ONE_CTR(),
CTR_INCREMENT_OUTPUT_HIGH_LOW()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor128::BlockSize;
num_blocks--;
}
vst1q_u8(m_counter, ctr0);
}
template<>
void CtrModeImpl<AesEncryptor192>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
const u8 *keys = m_block_cipher->GetRoundKey();
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
DECLARE_ROUND_KEY_VAR(11);
DECLARE_ROUND_KEY_VAR(12);
uint8x16_t ctr0 = vld1q_u8(m_counter);
uint64_t high, low;
/* Process three blocks at a time, when possible. */
if (num_blocks >= 3) {
/* Increment CTR twice. */
uint8x16_t ctr1 = IncrementCounterOptimized(ctr0);
uint8x16_t ctr2 = IncrementCounterOptimized(ctr1);
uint64_t high_tmp, low_tmp;
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor192::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AesEncryptor192::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AesEncryptor192::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n"
AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n"
AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n"
AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n"
AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n"
AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n"
AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n"
AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n"
AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n"
AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n"
AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n"
AES_ENC_ROUND(7, 1)
AES_ENC_ROUND(7, 2)
AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2)
AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2)
AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2)
AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2)
: AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
AES_ENC_DEC_OUTPUT_THREE_CTRS(),
CTR_INCREMENT_OUTPUT_HIGH_LOW(),
CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
tmp1 = veorq_u8(block1, tmp1);
tmp2 = veorq_u8(block2, tmp2);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor192::BlockSize;
vst1q_u8(dst, tmp1);
dst += AesEncryptor192::BlockSize;
vst1q_u8(dst, tmp2);
dst += AesEncryptor192::BlockSize;
num_blocks -= 3;
}
}
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor192::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n"
AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n"
AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n"
AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n"
AES_ENC_ROUND(10, 0)
AES_ENC_SECOND_LAST_ROUND(11, 0)
AES_ENC_LAST_ROUND(12, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
AES_ENC_DEC_OUTPUT_ONE_CTR(),
CTR_INCREMENT_OUTPUT_HIGH_LOW()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor192::BlockSize;
num_blocks--;
}
vst1q_u8(m_counter, ctr0);
}
template<>
void CtrModeImpl<AesEncryptor256>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
/* Preload all round keys + iv into neon registers. */
const u8 *keys = m_block_cipher->GetRoundKey();
DECLARE_ROUND_KEY_VAR(0);
DECLARE_ROUND_KEY_VAR(1);
DECLARE_ROUND_KEY_VAR(2);
DECLARE_ROUND_KEY_VAR(3);
DECLARE_ROUND_KEY_VAR(4);
DECLARE_ROUND_KEY_VAR(5);
DECLARE_ROUND_KEY_VAR(6);
DECLARE_ROUND_KEY_VAR(7);
DECLARE_ROUND_KEY_VAR(8);
DECLARE_ROUND_KEY_VAR(9);
DECLARE_ROUND_KEY_VAR(10);
DECLARE_ROUND_KEY_VAR(11);
DECLARE_ROUND_KEY_VAR(12);
DECLARE_ROUND_KEY_VAR(13);
DECLARE_ROUND_KEY_VAR(14);
uint8x16_t ctr0 = vld1q_u8(m_counter);
uint64_t high, low;
/* Process three blocks at a time, when possible. */
if (num_blocks >= 3) {
/* Increment CTR twice. */
uint8x16_t ctr1 = IncrementCounterOptimized(ctr0);
uint8x16_t ctr2 = IncrementCounterOptimized(ctr1);
uint64_t hl_tmp;
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor256::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AesEncryptor256::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AesEncryptor256::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
/* Note: ASM here only uses one temporary u64 instead of two, due to 30 operand limit. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n"
AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n"
AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n"
AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(2, 0) "rev %[hl_tmp], %[high]\n"
AES_ENC_ROUND(2, 1) "mov %[ctr0].d[0], %[hl_tmp]\n"
AES_ENC_ROUND(2, 2) "rev %[hl_tmp], %[low]\n"
AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[hl_tmp]\n"
AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(4, 0) "rev %[hl_tmp], %[high]\n"
AES_ENC_ROUND(4, 1) "mov %[ctr1].d[0], %[hl_tmp]\n"
AES_ENC_ROUND(4, 2) "rev %[hl_tmp], %[low]\n"
AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[hl_tmp]\n"
AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[hl_tmp], %[high]\n"
AES_ENC_ROUND(6, 1) "mov %[ctr2].d[0], %[hl_tmp]\n"
AES_ENC_ROUND(6, 2) "rev %[hl_tmp], %[low]\n"
AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[hl_tmp]\n"
AES_ENC_ROUND(7, 1)
AES_ENC_ROUND(7, 2)
AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2)
AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2)
AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2)
AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2)
AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2)
AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2)
AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2)
: AES_ENC_DEC_OUTPUT_THREE_BLOCKS(),
AES_ENC_DEC_OUTPUT_THREE_CTRS(),
CTR_INCREMENT_OUTPUT_HIGH_LOW(),
CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12),
AES_ENC_DEC_INPUT_ROUND_KEY(13),
AES_ENC_DEC_INPUT_ROUND_KEY(14)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
tmp1 = veorq_u8(block1, tmp1);
tmp2 = veorq_u8(block2, tmp2);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor256::BlockSize;
vst1q_u8(dst, tmp1);
dst += AesEncryptor256::BlockSize;
vst1q_u8(dst, tmp2);
dst += AesEncryptor256::BlockSize;
num_blocks -= 3;
}
}
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AesEncryptor256::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
/* Actually do encryption, use optimized asm. */
/* Interleave CTR calculations with AES ones, to mask latencies. */
__asm__ __volatile__ (
AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n"
AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n"
AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n"
AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n"
AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n"
AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n"
AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n"
AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n"
AES_ENC_ROUND(10, 0)
AES_ENC_ROUND(11, 0)
AES_ENC_ROUND(12, 0)
AES_ENC_SECOND_LAST_ROUND(13, 0)
AES_ENC_LAST_ROUND(14, 0)
: AES_ENC_DEC_OUTPUT_ONE_BLOCK(),
AES_ENC_DEC_OUTPUT_ONE_CTR(),
CTR_INCREMENT_OUTPUT_HIGH_LOW()
: AES_ENC_DEC_INPUT_ROUND_KEY(0),
AES_ENC_DEC_INPUT_ROUND_KEY(1),
AES_ENC_DEC_INPUT_ROUND_KEY(2),
AES_ENC_DEC_INPUT_ROUND_KEY(3),
AES_ENC_DEC_INPUT_ROUND_KEY(4),
AES_ENC_DEC_INPUT_ROUND_KEY(5),
AES_ENC_DEC_INPUT_ROUND_KEY(6),
AES_ENC_DEC_INPUT_ROUND_KEY(7),
AES_ENC_DEC_INPUT_ROUND_KEY(8),
AES_ENC_DEC_INPUT_ROUND_KEY(9),
AES_ENC_DEC_INPUT_ROUND_KEY(10),
AES_ENC_DEC_INPUT_ROUND_KEY(11),
AES_ENC_DEC_INPUT_ROUND_KEY(12),
AES_ENC_DEC_INPUT_ROUND_KEY(13),
AES_ENC_DEC_INPUT_ROUND_KEY(14)
: "cc"
);
/* XOR blocks. */
tmp0 = veorq_u8(block0, tmp0);
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AesEncryptor256::BlockSize;
num_blocks--;
}
vst1q_u8(m_counter, ctr0);
}
}
#else
/* TODO: Non-EL0 implementation. */
namespace ams::crypto::impl {
}
#endif

View File

@@ -1,269 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_aes_impl.arch.x64.hpp"
namespace ams::crypto::impl {
template<> void CtrModeImpl<AesEncryptor128>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
/* Check pre-conditions. */
AMS_ASSERT(src != nullptr);
AMS_ASSERT(dst != nullptr);
/* If we have aes-ni, use an optimized impl. */
if (IsAesNiAvailable()) {
/* Load all keys into sse2 registers. */
const u8 *raw_round_keys = m_block_cipher->GetRoundKey();
const __m128i round_keys[AesEncryptor128::RoundKeySize / BlockSize] = {
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 0)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 1)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 2)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 3)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 4)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 5)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 6)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 7)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 8)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 9)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 10)),
};
static_assert(AesEncryptor128::RoundKeySize / BlockSize == 11);
/* Declare constant for counter math. */
const __m128i One = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
/* Process eight blocks at a time, while we can. */
constexpr const auto UnrolledBlockCount = 8;
constexpr const auto CounterThreshold = static_cast<u8>(0x100 - UnrolledBlockCount);
/* Load the counter. */
auto counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
size_t cur_blocks;
for (cur_blocks = 0; cur_blocks + UnrolledBlockCount <= num_blocks; cur_blocks += UnrolledBlockCount) {
__m128i b0;
__m128i b1;
__m128i b2;
__m128i b3;
__m128i b4;
__m128i b5;
__m128i b6;
__m128i b7;
__m128i key = round_keys[0];
/* Get the last byte of the block. */
static_assert(util::IsLittleEndian());
const u8 counter_val = _mm_extract_epi16(counter, 7) >> BITSIZEOF(u8);
/* Do initial encryption of each block. */
if (CounterThreshold <= counter_val) {
/* We'll overwrap, so take slow path for counter. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
b0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b4 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b5 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b6 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b7 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
} else {
/* We can take the fast path for the counter. */
b0 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b1 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b2 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b3 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b4 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b5 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b6 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b7 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
}
/* Do encryption for all rounds. */
key = round_keys[1];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[2];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[3];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[4];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[5];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[6];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[7];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[8];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[9];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[10];
b0 = _mm_aesenclast_si128(b0, key);
b1 = _mm_aesenclast_si128(b1, key);
b2 = _mm_aesenclast_si128(b2, key);
b3 = _mm_aesenclast_si128(b3, key);
b4 = _mm_aesenclast_si128(b4, key);
b5 = _mm_aesenclast_si128(b5, key);
b6 = _mm_aesenclast_si128(b6, key);
b7 = _mm_aesenclast_si128(b7, key);
/* Write the blocks. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 0), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 0)), b0));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 1), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 1)), b1));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 2), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 2)), b2));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 3), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 3)), b3));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 4), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 4)), b4));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 5), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 5)), b5));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 6), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 6)), b6));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 7), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 7)), b7));
src += BlockSize * UnrolledBlockCount;
dst += BlockSize * UnrolledBlockCount;
}
/* Store the updated counter. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
/* Process blocks one at a time. */
for (/* ... */; cur_blocks < num_blocks; ++cur_blocks) {
/* Load current counter. */
__m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
/* Do aes rounds. */
b = _mm_xor_si128(b, round_keys[0]);
b = _mm_aesenc_si128(b, round_keys[1]);
b = _mm_aesenc_si128(b, round_keys[2]);
b = _mm_aesenc_si128(b, round_keys[3]);
b = _mm_aesenc_si128(b, round_keys[4]);
b = _mm_aesenc_si128(b, round_keys[5]);
b = _mm_aesenc_si128(b, round_keys[6]);
b = _mm_aesenc_si128(b, round_keys[7]);
b = _mm_aesenc_si128(b, round_keys[8]);
b = _mm_aesenc_si128(b, round_keys[9]);
b = _mm_aesenclast_si128(b, round_keys[10]);
/* Write the block. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)), b));
/* Advance. */
src += BlockSize;
dst += BlockSize;
this->IncrementCounter();
}
} else {
/* Fall back to the default implementation. */
while (num_blocks--) {
this->ProcessBlock(dst, src, BlockSize);
dst += BlockSize;
src += BlockSize;
}
}
}
}

View File

@@ -1,306 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
/* TODO: EL0 implementation. */
namespace ams::crypto::impl {
}
#else
/* EL1+ implementation. */
namespace ams::crypto::impl {
namespace {
constexpr u64 GetMultiplyFactor(u8 value) {
constexpr size_t Shift = BITSIZEOF(u8) - 1;
constexpr u8 Mask = (1u << Shift);
return (value & Mask) >> Shift;
}
/* TODO: Big endian support, eventually? */
constexpr void GaloisShiftLeft(u64 *block) {
/* Shift the block left by one. */
block[1] <<= 1;
block[1] |= (block[0] & (static_cast<u64>(1) << (BITSIZEOF(u64) - 1))) >> (BITSIZEOF(u64) - 1);
block[0] <<= 1;
}
constexpr u8 GaloisShiftRight(u64 *block) {
/* Determine the mask to return. */
constexpr u8 GaloisFieldMask = 0xE1;
const u8 mask = (block[0] & 1) * GaloisFieldMask;
/* Shift the block right by one. */
block[0] >>= 1;
block[0] |= (block[1] & 1) << (BITSIZEOF(u64) - 1);
block[1] >>= 1;
/* Return the mask. */
return mask;
}
/* Multiply two 128-bit numbers X, Y in the GF(128) Galois Field. */
void GaloisFieldMult(void *dst, const void *x, const void *y) {
/* Our block size is 16 bytes (for a 128-bit integer). */
constexpr size_t BlockSize = 16;
constexpr size_t FieldSize = 128;
/* Declare work blocks for us to store temporary values. */
u8 x_block[BlockSize];
u8 y_block[BlockSize];
u8 out[BlockSize];
/* Declare 64-bit pointers for our convenience. */
u64 *x_64 = static_cast<u64 *>(static_cast<void *>(x_block));
u64 *y_64 = static_cast<u64 *>(static_cast<void *>(y_block));
u64 *out_64 = static_cast<u64 *>(static_cast<void *>(out));
/* Initialize our work blocks. */
for (size_t i = 0; i < BlockSize; ++i) {
x_block[i] = static_cast<const u8 *>(x)[BlockSize - 1 - i];
y_block[i] = static_cast<const u8 *>(y)[BlockSize - 1 - i];
out[i] = 0;
}
/* Perform multiplication on each bit in y. */
for (size_t i = 0; i < FieldSize; ++i) {
/* Get the multiply factor for this bit. */
const auto y_mult = GetMultiplyFactor(y_block[BlockSize - 1]);
/* Multiply x by the factor. */
out_64[0] ^= x_64[0] * y_mult;
out_64[1] ^= x_64[1] * y_mult;
/* Shift left y by one. */
GaloisShiftLeft(y_64);
/* Shift right x by one, and mask appropriately. */
const u8 x_mask = GaloisShiftRight(x_64);
x_block[BlockSize - 1] ^= x_mask;
}
/* Copy out our result. */
for (size_t i = 0; i < BlockSize; ++i) {
static_cast<u8 *>(dst)[i] = out[BlockSize - 1 - i];
}
}
}
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::Initialize(const BlockCipher *block_cipher) {
/* Set member variables. */
m_block_cipher = block_cipher;
m_cipher_func = std::addressof(GcmModeImpl<BlockCipher>::ProcessBlock);
/* Pre-calculate values to speed up galois field multiplications later. */
this->InitializeHashKey();
/* Note that we're initialized. */
m_state = State_Initialized;
}
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::Reset(const void *iv, size_t iv_size) {
/* Validate pre-conditions. */
AMS_ASSERT(m_state >= State_Initialized);
/* Reset blocks. */
m_block_x.block_128.Clear();
m_block_tmp.block_128.Clear();
/* Clear sizes. */
m_aad_size = 0;
m_msg_size = 0;
m_aad_remaining = 0;
m_msg_remaining = 0;
/* Update our state. */
m_state = State_ProcessingAad;
/* Set our iv. */
if (iv_size == 12) {
/* If our iv is the correct size, simply copy in the iv, and set the magic bit. */
std::memcpy(std::addressof(m_block_ek0), iv, iv_size);
util::StoreBigEndian(m_block_ek0.block_32 + 3, static_cast<u32>(1));
} else {
/* Clear our ek0 block. */
m_block_ek0.block_128.Clear();
/* Update using the iv as aad. */
this->UpdateAad(iv, iv_size);
/* Treat the iv as fake msg for the mac that will become our iv. */
m_msg_size = m_aad_size;
m_aad_size = 0;
/* Compute a non-final mac. */
this->ComputeMac(false);
/* Set our ek0 block to our calculated mac block. */
m_block_ek0 = m_block_x;
/* Clear our calculated mac block. */
m_block_x.block_128.Clear();
/* Reset our state. */
m_msg_size = 0;
m_aad_size = 0;
m_msg_remaining = 0;
m_aad_remaining = 0;
}
/* Set the working block to the iv. */
m_block_ek = m_block_ek0;
}
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::UpdateAad(const void *aad, size_t aad_size) {
/* Validate pre-conditions. */
AMS_ASSERT(m_state == State_ProcessingAad);
AMS_ASSERT(m_msg_size == 0);
/* Update our aad size. */
m_aad_size += aad_size;
/* Define a working tracker variable. */
const u8 *cur_aad = static_cast<const u8 *>(aad);
/* Process any leftover aad data from a previous invocation. */
if (m_aad_remaining > 0) {
while (aad_size > 0) {
/* Copy in a byte of the aad to our partial block. */
m_block_x.block_8[m_aad_remaining] ^= *(cur_aad++);
/* Note that we consumed a byte. */
--aad_size;
/* Increment our partial block size. */
m_aad_remaining = (m_aad_remaining + 1) % BlockSize;
/* If we have a complete block, process it and move onward. */
GaloisFieldMult(std::addressof(m_block_x), std::addressof(m_block_x), std::addressof(m_h_mult_blocks[0]));
}
}
/* Process as many blocks as we can. */
while (aad_size >= BlockSize) {
/* Xor the current aad into our work block. */
for (size_t i = 0; i < BlockSize; ++i) {
m_block_x.block_8[i] ^= *(cur_aad++);
}
/* Multiply the blocks in our galois field. */
GaloisFieldMult(std::addressof(m_block_x), std::addressof(m_block_x), std::addressof(m_h_mult_blocks[0]));
/* Note that we've processed a block. */
aad_size -= BlockSize;
}
/* Update our state with whatever aad is left over. */
if (aad_size > 0) {
/* Note how much left over data we have. */
m_aad_remaining = static_cast<u32>(aad_size);
/* Xor the data in. */
for (size_t i = 0; i < aad_size; ++i) {
m_block_x.block_8[i] ^= *(cur_aad++);
}
}
}
/* TODO: template<class BlockCipher> size_t GcmModeImpl<BlockCipher>::UpdateEncrypt(void *dst, size_t dst_size, const void *src, size_t src_size); */
/* TODO: template<class BlockCipher> size_t GcmModeImpl<BlockCipher>::UpdateDecrypt(void *dst, size_t dst_size, const void *src, size_t src_size); */
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::GetMac(void *dst, size_t dst_size) {
/* Validate pre-conditions. */
AMS_ASSERT(State_ProcessingAad <= m_state && m_state <= State_Done);
AMS_ASSERT(dst != nullptr);
AMS_ASSERT(dst_size >= MacSize);
AMS_ASSERT(m_aad_remaining == 0);
AMS_ASSERT(m_msg_remaining == 0);
AMS_UNUSED(dst_size);
/* If we haven't already done so, compute the final mac. */
if (m_state != State_Done) {
this->ComputeMac(true);
m_state = State_Done;
}
static_assert(sizeof(m_block_x) == MacSize);
std::memcpy(dst, std::addressof(m_block_x), MacSize);
}
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::InitializeHashKey() {
/* We want to encrypt an empty block to use for intermediate calculations. */
/* NOTE: Non-EL1 implementations will do multiple encryptions ahead of time, */
/* to speed up galois field arithmetic. */
constexpr const Block EmptyBlock = {};
this->ProcessBlock(std::addressof(m_h_mult_blocks[0]), std::addressof(EmptyBlock), m_block_cipher);
}
template<class BlockCipher>
void GcmModeImpl<BlockCipher>::ComputeMac(bool encrypt) {
/* If we have leftover data, process it. */
if (m_aad_remaining > 0 || m_msg_remaining > 0) {
GaloisFieldMult(std::addressof(m_block_x), std::addressof(m_block_x), std::addressof(m_h_mult_blocks[0]));
}
/* Setup the last block. */
Block last_block = Block{ .block_128 = { m_msg_size, m_aad_size } };
/* Multiply the last block by 8 to account for bit vs byte sizes. */
static_assert(AMS_OFFSETOF(Block128, hi) == 0);
GaloisShiftLeft(std::addressof(last_block.block_128.hi));
GaloisShiftLeft(std::addressof(last_block.block_128.hi));
GaloisShiftLeft(std::addressof(last_block.block_128.hi));
/* Xor the data in. */
for (size_t i = 0; i < BlockSize; ++i) {
m_block_x.block_8[BlockSize - 1 - i] ^= last_block.block_8[i];
}
/* Perform the final multiplication. */
GaloisFieldMult(std::addressof(m_block_x), std::addressof(m_block_x), std::addressof(m_h_mult_blocks[0]));
/* If we need to do an encryption, do so. */
if (encrypt) {
/* Encrypt the iv. */
u8 enc_result[BlockSize];
this->ProcessBlock(enc_result, std::addressof(m_block_ek0), m_block_cipher);
/* Xor the iv in. */
for (size_t i = 0; i < BlockSize; ++i) {
m_block_x.block_8[i] ^= enc_result[i];
}
}
}
/* Explicitly instantiate the valid template classes. */
template class GcmModeImpl<AesEncryptor128>;
}
#endif

View File

@@ -1,255 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
struct Md5Constants {
static constexpr const u32 A = 0x67452301;
static constexpr const u32 B = 0xEFCDAB89;
static constexpr const u32 C = 0x98BADCFE;
static constexpr const u32 D = 0x10325476;
static constexpr const u32 T[] = {
0xD76AA478, 0xE8C7B756, 0x242070DB, 0xC1BDCEEE,
0xF57C0FAF, 0x4787C62A, 0xA8304613, 0xFD469501,
0x698098D8, 0x8B44F7AF, 0xFFFF5BB1, 0x895CD7BE,
0x6B901122, 0xFD987193, 0xA679438E, 0x49B40821,
0xF61E2562, 0xC040B340, 0x265E5A51, 0xE9B6C7AA,
0xD62F105D, 0x02441453, 0xD8A1E681, 0xE7D3FBC8,
0x21E1CDE6, 0xC33707D6, 0xF4D50D87, 0x455A14ED,
0xA9E3E905, 0xFCEFA3F8, 0x676F02D9, 0x8D2A4C8A,
0xFFFA3942, 0x8771F681, 0x6D9D6122, 0xFDE5380C,
0xA4BEEA44, 0x4BDECFA9, 0xF6BB4B60, 0xBEBFBC70,
0x289B7EC6, 0xEAA127FA, 0xD4EF3085, 0x04881D05,
0xD9D4D039, 0xE6DB99E5, 0x1FA27CF8, 0xC4AC5665,
0xF4292244, 0x432AFF97, 0xAB9423A7, 0xFC93A039,
0x655B59C3, 0x8F0CCC92, 0xFFEFF47D, 0x85845DD1,
0x6FA87E4F, 0xFE2CE6E0, 0xA3014314, 0x4E0811A1,
0xF7537E82, 0xBD3AF235, 0x2AD7D2BB, 0xEB86D391,
};
static constexpr u32 K[] = {
0x1, 0x6, 0xB, 0x0,
0x5, 0xA, 0xF, 0x4,
0x9, 0xE, 0x3, 0x8,
0xD, 0x2, 0x7, 0xC,
0x5, 0x8, 0xB, 0xE,
0x1, 0x4, 0x7, 0xA,
0xD, 0x0, 0x3, 0x6,
0x9, 0xC, 0xF, 0x2,
0x0, 0x7, 0xE, 0x5,
0xC, 0x3, 0xA, 0x1,
0x8, 0xF, 0x6, 0xD,
0x4, 0xB, 0x2, 0x9,
};
static constexpr u8 Padding[] = {
0x80
};
};
constexpr ALWAYS_INLINE u32 F(u32 x, u32 y, u32 z) { return (x & y) | ((~x) & z); }
constexpr ALWAYS_INLINE u32 G(u32 x, u32 y, u32 z) { return (x & z) | (y & (~z)); }
constexpr ALWAYS_INLINE u32 H(u32 x, u32 y, u32 z) { return x ^ y ^ z; }
constexpr ALWAYS_INLINE u32 I(u32 x, u32 y, u32 z) { return y ^ (x | (~z)); }
constexpr ALWAYS_INLINE u32 CalculateRound1(u32 a, u32 b, u32 c, u32 d, u32 x, u32 s, u32 t) { return b + util::RotateLeft<u32>(a + F(b, c, d) + x + t, s); }
constexpr ALWAYS_INLINE u32 CalculateRound2(u32 a, u32 b, u32 c, u32 d, u32 x, u32 s, u32 t) { return b + util::RotateLeft<u32>(a + G(b, c, d) + x + t, s); }
constexpr ALWAYS_INLINE u32 CalculateRound3(u32 a, u32 b, u32 c, u32 d, u32 x, u32 s, u32 t) { return b + util::RotateLeft<u32>(a + H(b, c, d) + x + t, s); }
constexpr ALWAYS_INLINE u32 CalculateRound4(u32 a, u32 b, u32 c, u32 d, u32 x, u32 s, u32 t) { return b + util::RotateLeft<u32>(a + I(b, c, d) + x + t, s); }
void Encode(u32 *dst, const u32 *src, size_t size) {
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < size; i += sizeof(u32)) {
util::StoreLittleEndian(dst + i, src[i]);
}
} else {
std::memcpy(dst, src, size);
}
}
void Decode(u32 *dst, const u32 *src, size_t size) {
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < size; i += sizeof(u32)) {
dst[i] = util::LoadLittleEndian(src + i);
}
} else {
std::memcpy(dst, src, size);
}
}
}
void Md5Impl::Initialize() {
/* Set constants. */
m_x.p.a = Md5Constants::A;
m_x.p.b = Md5Constants::B;
m_x.p.c = Md5Constants::C;
m_x.p.d = Md5Constants::D;
/* Set size. */
m_size = 0;
/* Set initialized. */
m_state = State_Initialized;
}
void Md5Impl::Update(const void *data, size_t size) {
/* Check pre-conditions. */
AMS_ASSERT(m_state == State_Initialized);
/* Determine how much we can process. */
const size_t work_idx = m_size % BlockSize;
const size_t work_remaining = BlockSize - work_idx;
/* Increment our size. */
m_size += size;
/* Copy in the data to our buffer, if we don't have a full block. */
if (work_remaining > size) {
if (size > 0) {
std::memcpy(m_y + work_idx, data, size);
}
return;
}
/* Copy what we can to complete our block. */
std::memcpy(m_y + work_idx, data, work_remaining);
/* Process the block. */
this->ProcessBlock();
/* Adjust size to account for what we've processed. */
size -= work_remaining;
/* Process as many full blocks as we can. */
const u8 *cur_block = static_cast<const u8 *>(data) + work_remaining;
for (size_t i = 0; i < size / BlockSize; ++i) {
std::memcpy(m_y, cur_block, BlockSize);
cur_block += BlockSize;
this->ProcessBlock();
}
/* Copy in any leftover data. */
if (const auto left = size % BlockSize; left > 0) {
std::memcpy(m_y, cur_block, left);
}
}
void Md5Impl::GetHash(void *dst, size_t size) {
/* Check pre-conditions. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, finish processing. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Encode the result. */
Encode(static_cast<u32 *>(dst), m_x.state, HashSize);
}
void Md5Impl::ProcessBlock() {
/* Declare tracking pointers for rounds. */
u32 x[BlockSize / sizeof(u32)];
const u32 *p_t = Md5Constants::T;
const u32 *p_k = Md5Constants::K;
const u32 *p_x = x;
/* Extract current state. */
u32 a = m_x.p.a;
u32 b = m_x.p.b;
u32 c = m_x.p.c;
u32 d = m_x.p.d;
/* Decode the block into native endian. */
Decode(x, reinterpret_cast<const u32 *>(m_y), BlockSize);
/* Perform round 1. */
for (size_t i = 0; i < 4; ++i) {
a = CalculateRound1(a, b, c, d, *p_x++, 7, *p_t++);
d = CalculateRound1(d, a, b, c, *p_x++, 12, *p_t++);
c = CalculateRound1(c, d, a, b, *p_x++, 17, *p_t++);
b = CalculateRound1(b, c, d, a, *p_x++, 22, *p_t++);
}
/* Perform round 2. */
for (size_t i = 0; i < 4; ++i) {
a = CalculateRound2(a, b, c, d, x[*p_k++], 5, *p_t++);
d = CalculateRound2(d, a, b, c, x[*p_k++], 9, *p_t++);
c = CalculateRound2(c, d, a, b, x[*p_k++], 14, *p_t++);
b = CalculateRound2(b, c, d, a, x[*p_k++], 20, *p_t++);
}
/* Perform round 3. */
for (size_t i = 0; i < 4; ++i) {
a = CalculateRound3(a, b, c, d, x[*p_k++], 4, *p_t++);
d = CalculateRound3(d, a, b, c, x[*p_k++], 11, *p_t++);
c = CalculateRound3(c, d, a, b, x[*p_k++], 16, *p_t++);
b = CalculateRound3(b, c, d, a, x[*p_k++], 23, *p_t++);
}
/* Perform round 4. */
for (size_t i = 0; i < 4; ++i) {
a = CalculateRound4(a, b, c, d, x[*p_k++], 6, *p_t++);
d = CalculateRound4(d, a, b, c, x[*p_k++], 10, *p_t++);
c = CalculateRound4(c, d, a, b, x[*p_k++], 15, *p_t++);
b = CalculateRound4(b, c, d, a, x[*p_k++], 21, *p_t++);
}
/* Mix the result back into our state. */
m_x.p.a += a;
m_x.p.b += b;
m_x.p.c += c;
m_x.p.d += d;
}
void Md5Impl::ProcessLastBlock() {
/* Get bit count. */
const u64 bit_count = m_size * BITSIZEOF(u8);
/* Add padding byte unconditionally. */
this->Update(Md5Constants::Padding, sizeof(Md5Constants::Padding));
/* Determine remaining. */
size_t work_idx = m_size % BlockSize;
size_t work_remaining = BlockSize - work_idx;
/* We want to process 8000.....{bit count}. */
if (work_remaining < sizeof(u64)) {
std::memset(m_y + work_idx, 0, work_remaining);
this->ProcessBlock();
work_idx = 0;
work_remaining = BlockSize;
}
if (work_remaining > sizeof(u64)) {
std::memset(m_y + work_idx, 0, work_remaining - sizeof(u64));
}
util::StoreLittleEndian<u64>(reinterpret_cast<u64 *>(m_y + BlockSize - sizeof(u64)), bit_count);
this->ProcessBlock();
}
}

View File

@@ -1,247 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
namespace ams::crypto::impl {
namespace {
constexpr const u32 RoundConstants[4] = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
/* Define for loading work var from message. */
#define SHA1_LOAD_W_FROM_MESSAGE(which) \
w[which] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data))); \
data += 0x10
#define SHA1_CALCULATE_W_FROM_PREVIOUS(i) \
w[i] = vsha1su1q_u32(vsha1su0q_u32(w[i-4], w[i-3], w[i-2]), w[i-1])
/* Define for doing four rounds of SHA1. */
#define SHA1_DO_ROUND(r, insn, constant) \
do { \
const u32 a = vgetq_lane_u32(cur_abcd, 0); \
cur_abcd = v##insn##q_u32(cur_abcd, cur_e, vaddq_u32(w[r], constant)); \
cur_e = vsha1h_u32(a); \
} while (0)
}
void Sha1Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x67452301;
m_intermediate_hash[1] = 0xEFCDAB89;
m_intermediate_hash[2] = 0x98BADCFE;
m_intermediate_hash[3] = 0x10325476;
m_intermediate_hash[4] = 0xC3D2E1F0;
/* Set state. */
m_state = State_Initialized;
}
void Sha1Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
if (remaining >= BlockSize) {
const size_t blocks = remaining / BlockSize;
this->ProcessBlocks(data8, blocks);
data8 += BlockSize * blocks;
remaining -= BlockSize * blocks;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha1Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
ALWAYS_INLINE void Sha1Impl::ProcessBlock(const void *data) {
return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
}
void Sha1Impl::ProcessBlocks(const u8 *data, size_t block_count) {
/* Setup round constants. */
const uint32x4_t k0 = vdupq_n_u32(RoundConstants[0]);
const uint32x4_t k1 = vdupq_n_u32(RoundConstants[1]);
const uint32x4_t k2 = vdupq_n_u32(RoundConstants[2]);
const uint32x4_t k3 = vdupq_n_u32(RoundConstants[3]);
/* Load hash variables with intermediate state. */
uint32x4_t cur_abcd = vld1q_u32(m_intermediate_hash + 0);
u32 cur_e = m_intermediate_hash[4];
/* Actually do hash processing blocks. */
do {
/* Save current state. */
const uint32x4_t prev_abcd = cur_abcd;
const u32 prev_e = cur_e;
uint32x4_t w[20];
/* Setup w[0-3] with message. */
SHA1_LOAD_W_FROM_MESSAGE(0);
SHA1_LOAD_W_FROM_MESSAGE(1);
SHA1_LOAD_W_FROM_MESSAGE(2);
SHA1_LOAD_W_FROM_MESSAGE(3);
/* Calculate w[4-19], w[i] = sha1su1(sha1su0(w[i-4], w[i-3], w[i-2]), w[i-1]); */
SHA1_CALCULATE_W_FROM_PREVIOUS(4);
SHA1_CALCULATE_W_FROM_PREVIOUS(5);
SHA1_CALCULATE_W_FROM_PREVIOUS(6);
SHA1_CALCULATE_W_FROM_PREVIOUS(7);
SHA1_CALCULATE_W_FROM_PREVIOUS(8);
SHA1_CALCULATE_W_FROM_PREVIOUS(9);
SHA1_CALCULATE_W_FROM_PREVIOUS(10);
SHA1_CALCULATE_W_FROM_PREVIOUS(11);
SHA1_CALCULATE_W_FROM_PREVIOUS(12);
SHA1_CALCULATE_W_FROM_PREVIOUS(13);
SHA1_CALCULATE_W_FROM_PREVIOUS(14);
SHA1_CALCULATE_W_FROM_PREVIOUS(15);
SHA1_CALCULATE_W_FROM_PREVIOUS(16);
SHA1_CALCULATE_W_FROM_PREVIOUS(17);
SHA1_CALCULATE_W_FROM_PREVIOUS(18);
SHA1_CALCULATE_W_FROM_PREVIOUS(19);
/* Do round calculations 0-20. Uses sha1c, k0. */
SHA1_DO_ROUND(0, sha1c, k0);
SHA1_DO_ROUND(1, sha1c, k0);
SHA1_DO_ROUND(2, sha1c, k0);
SHA1_DO_ROUND(3, sha1c, k0);
SHA1_DO_ROUND(4, sha1c, k0);
/* Do round calculations 20-40. Uses sha1p, k1. */
SHA1_DO_ROUND(5, sha1p, k1);
SHA1_DO_ROUND(6, sha1p, k1);
SHA1_DO_ROUND(7, sha1p, k1);
SHA1_DO_ROUND(8, sha1p, k1);
SHA1_DO_ROUND(9, sha1p, k1);
/* Do round calculations 40-60. Uses sha1m, k2. */
SHA1_DO_ROUND(10, sha1m, k2);
SHA1_DO_ROUND(11, sha1m, k2);
SHA1_DO_ROUND(12, sha1m, k2);
SHA1_DO_ROUND(13, sha1m, k2);
SHA1_DO_ROUND(14, sha1m, k2);
/* Do round calculations 60-80. Uses sha1p, k3. */
SHA1_DO_ROUND(15, sha1p, k3);
SHA1_DO_ROUND(16, sha1p, k3);
SHA1_DO_ROUND(17, sha1p, k3);
SHA1_DO_ROUND(18, sha1p, k3);
SHA1_DO_ROUND(19, sha1p, k3);
/* Add to previous. */
cur_abcd = vaddq_u32(cur_abcd, prev_abcd);
cur_e = cur_e + prev_e;
} while (--block_count != 0);
/* Save result to intermediate hash. */
vst1q_u32(m_intermediate_hash, cur_abcd);
m_intermediate_hash[4] = cur_e;
}
void Sha1Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}
#endif

View File

@@ -1,225 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
constexpr const u32 RoundConstants[4] = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
return (x & y) ^ ((~x) & z);
}
constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
return (x & y) ^ (x & z) ^ (y & z);
}
constexpr ALWAYS_INLINE u32 Parity(u32 x, u32 y, u32 z) {
return x ^ y ^ z;
}
}
void Sha1Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x67452301;
m_intermediate_hash[1] = 0xEFCDAB89;
m_intermediate_hash[2] = 0x98BADCFE;
m_intermediate_hash[3] = 0x10325476;
m_intermediate_hash[4] = 0xC3D2E1F0;
/* Set state. */
m_state = State_Initialized;
}
void Sha1Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, while we have any. */
while (remaining >= BlockSize) {
this->ProcessBlock(data8);
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha1Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha1Impl::ProcessBlock(const void *data) {
/* Load work variables. */
u32 a = m_intermediate_hash[0];
u32 b = m_intermediate_hash[1];
u32 c = m_intermediate_hash[2];
u32 d = m_intermediate_hash[3];
u32 e = m_intermediate_hash[4];
u32 tmp;
size_t i;
/* Copy the input. */
u32 w[80];
if constexpr (util::IsLittleEndian()) {
static_assert(BlockSize % sizeof(u32) == 0);
const u32 *src_32 = static_cast<const u32 *>(data);
for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
w[i] = util::LoadBigEndian<u32>(src_32 + i);
}
} else {
std::memcpy(w, data, BlockSize);
}
/* Initialize the rest of w. */
for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
const u32 *prev = w + (i - BlockSize / sizeof(u32));
w[i] = util::RotateLeft<u32>(prev[0] ^ prev[2] ^ prev[8] ^ prev[13], 1);
}
/* Perform rounds. */
for (i = 0; i < 20; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Choose(b, c, d) + e + w[i] + RoundConstants[0];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 40; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[1];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 60; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Majority(b, c, d) + e + w[i] + RoundConstants[2];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 80; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[3];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
/* Update intermediate hash. */
m_intermediate_hash[0] += a;
m_intermediate_hash[1] += b;
m_intermediate_hash[2] += c;
m_intermediate_hash[3] += d;
m_intermediate_hash[4] += e;
}
void Sha1Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}

View File

@@ -1,327 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
namespace ams::crypto::impl {
namespace {
alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
void Sha256Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x6A09E667;
m_intermediate_hash[1] = 0xBB67AE85;
m_intermediate_hash[2] = 0x3C6EF372;
m_intermediate_hash[3] = 0xA54FF53A;
m_intermediate_hash[4] = 0x510E527F;
m_intermediate_hash[5] = 0x9B05688C;
m_intermediate_hash[6] = 0x1F83D9AB;
m_intermediate_hash[7] = 0x5BE0CD19;
/* Set state. */
m_state = State_Initialized;
}
void Sha256Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
if (remaining >= BlockSize) {
const size_t blocks = remaining / BlockSize;
this->ProcessBlocks(data8, blocks);
data8 += BlockSize * blocks;
remaining -= BlockSize * blocks;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha256Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
/* Copy state in from the context. */
std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
m_bits_consumed = context->bits_consumed;
/* Reset other fields. */
m_buffered_bytes = 0;
m_state = State_Initialized;
}
size_t Sha256Impl::GetContext(Sha256Context *context) const {
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
/* Copy out the context. */
std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
context->bits_consumed = m_bits_consumed;
return m_buffered_bytes;
}
ALWAYS_INLINE void Sha256Impl::ProcessBlock(const void *data) {
return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
}
void Sha256Impl::ProcessBlocks(const u8 *data, size_t block_count) {
/* Load previous hash with intermediate state, current hash with zeroes. */
uint32x4_t prev_hash0 = vld1q_u32(m_intermediate_hash + 0);
uint32x4_t prev_hash1 = vld1q_u32(m_intermediate_hash + 4);
uint32x4_t cur_hash0 = vdupq_n_u32(0);
uint32x4_t cur_hash1 = vdupq_n_u32(0);
/* Process blocks. */
do {
uint32x4_t round_constant0, round_constant1;
uint32x4_t data0, data1, data2, data3;
uint32x4_t tmp0, tmp1, tmp2, tmp3;
uint32x4_t tmp_hash;
/* Use optimized ASM implementation to process the block. */
__asm__ __volatile__ (
"ldp %q[data0], %q[data1], [%[data]], #0x20\n"
"ldp %q[data2], %q[data3], [%[data]], #0x20\n"
"add %[cur_hash0].4s, %[cur_hash0].4s, %[prev_hash0].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x00]\n"
"add %[cur_hash1].4s, %[cur_hash1].4s, %[prev_hash1].4s\n"
"rev32 %[data0].16b, %[data0].16b\n"
"rev32 %[data1].16b, %[data1].16b\n"
"rev32 %[data2].16b, %[data2].16b\n"
"rev32 %[data3].16b, %[data3].16b\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x20]\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"mov %[prev_hash0].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"mov %[prev_hash1].16b, %[cur_hash1].16b\n"
"sha256h2 %q[cur_hash1], %q[prev_hash0], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x40]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x60]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x80]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xA0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xC0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xE0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
: [data0]"=w"(data0), [data1]"=w"(data1), [data2]"=w"(data2), [data3]"=w"(data3),
[tmp0]"=w"(tmp0), [tmp1]"=w"(tmp1), [tmp2]"=w"(tmp2), [tmp3]"=w"(tmp3),
[round_constant0]"=w"(round_constant0), [round_constant1]"=w"(round_constant1),
[cur_hash0]"+w"(cur_hash0), [cur_hash1]"+w"(cur_hash1),
[prev_hash0]"+w"(prev_hash0), [prev_hash1]"+w"(prev_hash1),
[tmp_hash]"=w"(tmp_hash), [data]"+r"(data)
: "m"(*(const u8 (*)[block_count*BlockSize])data), [round_constants]"r"(RoundConstants)
:
);
} while (--block_count != 0);
/* Add hashes together, and store. */
cur_hash0 = vaddq_u32(prev_hash0, cur_hash0);
cur_hash1 = vaddq_u32(prev_hash1, cur_hash1);
vst1q_u32(m_intermediate_hash + 0, cur_hash0);
vst1q_u32(m_intermediate_hash + 4, cur_hash1);
}
void Sha256Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}
#endif

View File

@@ -1,260 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
return (x & y) ^ ((~x) & z);
}
constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
return (x & y) ^ (x & z) ^ (y & z);
}
constexpr ALWAYS_INLINE u32 LargeSigma0(u32 x) {
return util::RotateRight<u32>(x, 2) ^ util::RotateRight<u32>(x, 13) ^ util::RotateRight<u32>(x, 22);
}
constexpr ALWAYS_INLINE u32 LargeSigma1(u32 x) {
return util::RotateRight<u32>(x, 6) ^ util::RotateRight<u32>(x, 11) ^ util::RotateRight<u32>(x, 25);
}
constexpr ALWAYS_INLINE u32 SmallSigma0(u32 x) {
return util::RotateRight<u32>(x, 7) ^ util::RotateRight<u32>(x, 18) ^ (x >> 3);
}
constexpr ALWAYS_INLINE u32 SmallSigma1(u32 x) {
return util::RotateRight<u32>(x, 17) ^ util::RotateRight<u32>(x, 19) ^ (x >> 10);
}
}
void Sha256Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x6A09E667;
m_intermediate_hash[1] = 0xBB67AE85;
m_intermediate_hash[2] = 0x3C6EF372;
m_intermediate_hash[3] = 0xA54FF53A;
m_intermediate_hash[4] = 0x510E527F;
m_intermediate_hash[5] = 0x9B05688C;
m_intermediate_hash[6] = 0x1F83D9AB;
m_intermediate_hash[7] = 0x5BE0CD19;
/* Set state. */
m_state = State_Initialized;
}
void Sha256Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
while (remaining >= BlockSize) {
this->ProcessBlock(data8);
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha256Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
/* Copy state in from the context. */
std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
m_bits_consumed = context->bits_consumed;
/* Reset other fields. */
m_buffered_bytes = 0;
m_state = State_Initialized;
}
size_t Sha256Impl::GetContext(Sha256Context *context) const {
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
/* Copy out the context. */
std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
context->bits_consumed = m_bits_consumed;
return m_buffered_bytes;
}
void Sha256Impl::ProcessBlock(const void *data) {
/* Load work variables. */
u32 a = m_intermediate_hash[0];
u32 b = m_intermediate_hash[1];
u32 c = m_intermediate_hash[2];
u32 d = m_intermediate_hash[3];
u32 e = m_intermediate_hash[4];
u32 f = m_intermediate_hash[5];
u32 g = m_intermediate_hash[6];
u32 h = m_intermediate_hash[7];
u32 tmp[2];
size_t i;
/* Copy the input. */
u32 w[64];
if constexpr (util::IsLittleEndian()) {
static_assert(BlockSize % sizeof(u32) == 0);
const u32 *src_32 = static_cast<const u32 *>(data);
for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
w[i] = util::LoadBigEndian<u32>(src_32 + i);
}
} else {
std::memcpy(w, data, BlockSize);
}
/* Initialize the rest of w. */
for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
const u32 *prev = w + (i - BlockSize / sizeof(u32));
w[i] = prev[0] + SmallSigma0(prev[1]) + prev[9] + SmallSigma1(prev[14]);
}
/* Perform rounds. */
for (i = 0; i < 64; ++i) {
tmp[0] = h + LargeSigma1(e) + Choose(e, f, g) + RoundConstants[i] + w[i];
tmp[1] = LargeSigma0(a) + Majority(a, b, c);
h = g;
g = f;
f = e;
e = d + tmp[0];
d = c;
c = b;
b = a;
a = tmp[0] + tmp[1];
}
/* Update intermediate hash. */
m_intermediate_hash[0] += a;
m_intermediate_hash[1] += b;
m_intermediate_hash[2] += c;
m_intermediate_hash[3] += d;
m_intermediate_hash[4] += e;
m_intermediate_hash[5] += f;
m_intermediate_hash[6] += g;
m_intermediate_hash[7] += h;
}
void Sha256Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}

View File

@@ -1,240 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
constexpr auto NumRounds = 24;
constexpr const u64 IotaRoundConstant[NumRounds] = {
UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
UINT64_C(0x800000000000808A), UINT64_C(0x8000000080008000),
UINT64_C(0x000000000000808B), UINT64_C(0x0000000080000001),
UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
UINT64_C(0x000000000000008A), UINT64_C(0x0000000000000088),
UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000A),
UINT64_C(0x000000008000808B), UINT64_C(0x800000000000008B),
UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
UINT64_C(0x000000000000800A), UINT64_C(0x800000008000000A),
UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
constexpr const int RhoShiftBit[NumRounds] = {
1, 3, 6, 10, 15, 21, 28, 36,
45, 55, 2, 14, 27, 41, 56, 8,
25, 43, 62, 18, 39, 61, 20, 44
};
constexpr const int RhoNextIndex[NumRounds] = {
10, 7, 11, 17, 18, 3, 5, 16,
8, 21, 24, 4, 15, 23, 19, 13,
12, 2, 20, 14, 22, 9, 6, 1
};
}
template<size_t HashSize>
void Sha3Impl<HashSize>::Initialize() {
/* Clear internal state. */
std::memset(m_internal_state, 0, sizeof(m_internal_state));
/* Reset buffered bytes. */
m_buffered_bytes = 0;
/* Set state. */
m_state = State_Initialized;
}
template<size_t HashSize>
void Sha3Impl<HashSize>::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Process we have anything buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
/* Determine how much we can copy. */
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
/* Mix the bytes into our state. */
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state) + m_buffered_bytes;
for (size_t i = 0; i < copy_size; ++i) {
dst8[i] ^= data8[i];
}
/* Advance. */
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock();
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
while (remaining >= BlockSize) {
/* Mix the bytes into our state. */
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
for (size_t i = 0; i < BlockSize; ++i) {
dst8[i] ^= data8[i];
}
this->ProcessBlock();
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
for (size_t i = 0; i < remaining; ++i) {
dst8[i] ^= data8[i];
}
m_buffered_bytes = remaining;
}
}
template<size_t HashSize>
void Sha3Impl<HashSize>::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
std::memcpy(dst, m_internal_state, HashSize);
}
template<size_t HashSize>
void Sha3Impl<HashSize>::InitializeWithContext(const Sha3Context *context) {
/* Check the context is for the right hash size. */
AMS_ASSERT(context->hash_size == HashSize);
/* Set buffered bytes. */
m_buffered_bytes = context->buffered_bytes;
/* Copy state in from the context. */
std::memcpy(m_internal_state, context->internal_state, sizeof(m_internal_state));
/* Reset other fields. */
m_state = State_Initialized;
}
template<size_t HashSize>
void Sha3Impl<HashSize>::GetContext(Sha3Context *context) const {
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
/* Set the output hash size. */
context->hash_size = HashSize;
/* Set buffered bytes. */
context->buffered_bytes = m_buffered_bytes;
/* Copy out the context. */
std::memcpy(context->internal_state, m_internal_state, sizeof(context->internal_state));
}
template<size_t HashSize>
void Sha3Impl<HashSize>::ProcessBlock() {
/* Ensure correct endianness. */
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < util::size(m_internal_state); ++i) {
m_internal_state[i] = util::LoadLittleEndian<u64>(m_internal_state + i);
}
}
/* Perform all rounds. */
uint64_t tmp, C[5];
for (auto round = 0; round < NumRounds; ++round) {
/* Handle theta. */
for (size_t i = 0; i < 5; ++i) {
C[i] = m_internal_state[i] ^ m_internal_state[i + 5] ^ m_internal_state[i + 10] ^ m_internal_state[i + 15] ^ m_internal_state[i + 20];
}
for (size_t i = 0; i < 5; ++i) {
tmp = C[(i + 4) % 5] ^ util::RotateLeft<u64>(C[(i + 1) % 5], 1);
for (size_t j = 0; j < 5; ++j) {
m_internal_state[5 * j + i] ^= tmp;
}
}
/* Handle rho/pi. */
tmp = m_internal_state[1];
for (size_t i = 0; i < NumRounds; ++i) {
const auto rho_next_idx = RhoNextIndex[i];
C[0] = m_internal_state[rho_next_idx];
m_internal_state[rho_next_idx] = util::RotateLeft<u64>(tmp, RhoShiftBit[i]);
tmp = C[0];
}
/* Handle chi. */
for (size_t i = 0; i < 5; ++i) {
for (size_t j = 0; j < 5; ++j) {
C[j] = m_internal_state[5 * i + j];
}
for (size_t j = 0; j < 5; ++j) {
m_internal_state[5 * i + j] ^= (~C[(j + 1) % 5]) & C[(j + 2) % 5];
}
}
/* Handle iota. */
m_internal_state[0] ^= IotaRoundConstant[round];
}
/* Ensure correct endianness. */
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < util::size(m_internal_state); ++i) {
util::StoreLittleEndian<u64>(m_internal_state + i, m_internal_state[i]);
}
}
}
template<size_t HashSize>
void Sha3Impl<HashSize>::ProcessLastBlock() {
/* Mix final bits (011) into our state. */
reinterpret_cast<u8 *>(m_internal_state)[m_buffered_bytes] ^= 0b110;
/* Mix in the high bit of the last word in our block. */
constexpr u64 FinalMask = UINT64_C(0x8000000000000000);
m_internal_state[(BlockSize / sizeof(u64)) - 1] ^= FinalMask;
/* Process the last block. */
this->ProcessBlock();
}
/* Explicitly instantiate the supported hash sizes. */
template class Sha3Impl<224 / BITSIZEOF(u8)>;
template class Sha3Impl<256 / BITSIZEOF(u8)>;
template class Sha3Impl<384 / BITSIZEOF(u8)>;
template class Sha3Impl<512 / BITSIZEOF(u8)>;
}

View File

@@ -1,97 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vapours.hpp>
namespace ams::crypto::impl {
template<typename Cipher, typename Self>
void UpdateImpl(Self *self, const void *src, size_t src_size) {
const size_t BlockSize = self->GetBlockSize();
const u8 *src_u8 = static_cast<const u8 *>(src);
size_t remaining = src_size;
if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) {
const size_t partial = std::min(BlockSize - buffered, remaining);
self->ProcessPartialData(src_u8, partial);
src_u8 += partial;
remaining -= partial;
}
if (remaining >= BlockSize) {
const size_t num_blocks = remaining / BlockSize;
self->template ProcessBlocks<Cipher>(src_u8, num_blocks);
const size_t processed = num_blocks * BlockSize;
src_u8 += processed;
remaining -= processed;
}
if (remaining > 0) {
self->ProcessRemainingData(src_u8, remaining);
}
}
template<typename Cipher, typename Self>
size_t UpdateImpl(Self *self, void *dst, size_t dst_size, const void *src, size_t src_size) {
AMS_UNUSED(dst_size);
const size_t BlockSize = self->GetBlockSize();
const u8 *src_u8 = static_cast<const u8 *>(src);
u8 *dst_u8 = static_cast<u8 *>(dst);
size_t remaining = src_size;
size_t total_processed = 0;
if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) {
const size_t partial = std::min(BlockSize - buffered, remaining);
const size_t processed = self->ProcessPartialData(dst_u8, src_u8, partial);
dst_u8 += processed;
total_processed += processed;
src_u8 += partial;
remaining -= partial;
}
if (remaining >= BlockSize) {
const size_t num_blocks = remaining / BlockSize;
const size_t input_size = num_blocks * BlockSize;
const size_t processed = self->template ProcessBlocks<Cipher>(dst_u8, src_u8, num_blocks);
dst_u8 += processed;
total_processed += processed;
src_u8 += input_size;
remaining -= input_size;
}
if (remaining > 0) {
const size_t processed = self->ProcessRemainingData(dst_u8, src_u8, remaining);
total_processed += processed;
}
return total_processed;
}
}

View File

@@ -1,57 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_update_impl.hpp"
namespace ams::crypto::impl {
size_t XtsModeImpl::UpdateGeneric(void *dst, size_t dst_size, const void *src, size_t src_size) {
AMS_ASSERT(m_state == State_Initialized || m_state == State_Processing);
return UpdateImpl<void>(this, dst, dst_size, src, src_size);
}
size_t XtsModeImpl::ProcessBlocksGeneric(u8 *dst, const u8 *src, size_t num_blocks) {
size_t processed = BlockSize * (num_blocks - 1);
if (m_state == State_Processing) {
this->ProcessBlock(dst, m_last_block);
dst += BlockSize;
processed += BlockSize;
}
while ((--num_blocks) > 0) {
this->ProcessBlock(dst, src);
dst += BlockSize;
src += BlockSize;
}
std::memcpy(m_last_block, src, BlockSize);
m_state = State_Processing;
return processed;
}
template<> size_t XtsModeImpl::Update<AesEncryptor128>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
template<> size_t XtsModeImpl::Update<AesEncryptor192>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
template<> size_t XtsModeImpl::Update<AesEncryptor256>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
template<> size_t XtsModeImpl::Update<AesDecryptor128>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
template<> size_t XtsModeImpl::Update<AesDecryptor192>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
template<> size_t XtsModeImpl::Update<AesDecryptor256>(void *dst, size_t dst_size, const void *src, size_t src_size) { return this->UpdateGeneric(dst, dst_size, src, src_size); }
}

View File

@@ -1,144 +0,0 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
/* TODO: Support non-Nintendo Endianness */
void MultiplyTweakGeneric(u64 *tweak) {
const u64 carry = tweak[1] & (static_cast<u64>(1) << (BITSIZEOF(u64) - 1));
tweak[1] = ((tweak[1] << 1) | (tweak[0] >> (BITSIZEOF(u64) - 1)));
tweak[0] = (tweak[0] << 1);
if (carry) {
tweak[0] ^= static_cast<u64>(0x87);
}
}
}
void XtsModeImpl::ProcessBlock(u8 *dst, const u8 *src) {
u8 tmp[BlockSize];
/* Xor. */
for (size_t i = 0; i < BlockSize; i++) {
tmp[i] = m_tweak[i] ^ src[i];
}
/* Crypt */
m_cipher_func(tmp, tmp, m_cipher_ctx);
/* Xor. */
for (size_t i = 0; i < BlockSize; i++) {
dst[i] = m_tweak[i] ^ tmp[i];
}
MultiplyTweakGeneric(reinterpret_cast<u64 *>(m_tweak));
}
size_t XtsModeImpl::FinalizeEncryption(void *dst, size_t dst_size) {
AMS_ASSERT(m_state == State_Processing);
AMS_UNUSED(dst_size);
u8 *dst_u8 = static_cast<u8 *>(dst);
size_t processed = 0;
if (m_num_buffered == 0) {
this->ProcessBlock(dst_u8, m_last_block);
processed = BlockSize;
} else {
this->ProcessBlock(m_last_block, m_last_block);
std::memcpy(m_buffer + m_num_buffered, m_last_block + m_num_buffered, BlockSize - m_num_buffered);
this->ProcessBlock(dst_u8, m_buffer);
std::memcpy(dst_u8 + BlockSize, m_last_block, m_num_buffered);
processed = BlockSize + m_num_buffered;
}
m_state = State_Done;
return processed;
}
size_t XtsModeImpl::FinalizeDecryption(void *dst, size_t dst_size) {
AMS_ASSERT(m_state == State_Processing);
AMS_UNUSED(dst_size);
u8 *dst_u8 = static_cast<u8 *>(dst);
size_t processed = 0;
if (m_num_buffered == 0) {
this->ProcessBlock(dst_u8, m_last_block);
processed = BlockSize;
} else {
u8 tmp_tweak[BlockSize];
std::memcpy(tmp_tweak, m_tweak, BlockSize);
MultiplyTweakGeneric(reinterpret_cast<u64 *>(m_tweak));
this->ProcessBlock(m_last_block, m_last_block);
std::memcpy(m_buffer + m_num_buffered, m_last_block + m_num_buffered, BlockSize - m_num_buffered);
std::memcpy(m_tweak, tmp_tweak, BlockSize);
this->ProcessBlock(dst_u8, m_buffer);
std::memcpy(dst_u8 + BlockSize, m_last_block, m_num_buffered);
processed = BlockSize + m_num_buffered;
}
m_state = State_Done;
return processed;
}
size_t XtsModeImpl::ProcessPartialData(u8 *dst, const u8 *src, size_t size) {
size_t processed = 0;
std::memcpy(m_buffer + m_num_buffered, src, size);
m_num_buffered += size;
if (m_num_buffered == BlockSize) {
if (m_state == State_Processing) {
this->ProcessBlock(dst, m_last_block);
processed += BlockSize;
}
std::memcpy(m_last_block, m_buffer, BlockSize);
m_num_buffered = 0;
m_state = State_Processing;
}
return processed;
}
size_t XtsModeImpl::ProcessRemainingData(u8 *dst, const u8 *src, size_t size) {
AMS_UNUSED(dst);
std::memcpy(m_buffer, src, size);
m_num_buffered = size;
return 0;
}
}