ams: support building unit test programs on windows/linux/macos

This commit is contained in:
Michael Scire
2022-03-06 12:08:20 -08:00
committed by SciresM
parent 9a38be201a
commit 64a97576d0
756 changed files with 33359 additions and 9372 deletions

View File

@@ -14,15 +14,111 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
#endif
namespace ams::crypto::impl {
#ifdef ATMOSPHERE_IS_STRATOSPHERE
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
namespace {
constexpr bool IsSupportedKeySize(size_t size) {
return size == 16 || size == 24 || size == 32;
/* Helper macros to setup for inline AES asm */
#define AES_ENC_DEC_SETUP_VARS() \
const auto *ctx = reinterpret_cast<const RoundKeyHelper<KeySize> *>(m_round_keys); \
static_assert(sizeof(*ctx) == sizeof(m_round_keys)); \
\
uint8x16_t tmp = vld1q_u8((const uint8_t *)src); \
uint8x16_t tmp2
#define AES_ENC_DEC_OUTPUT_VARS() \
[tmp]"+w"(tmp), [tmp2]"=w"(tmp2)
#define AES_ENC_DEC_STORE_RESULT() \
vst1q_u8((uint8_t *)dst, tmp)
/* Helper macros to do AES encryption, via inline asm. */
#define AES_ENC_ROUND(n) \
"ldr %q[tmp2], %[round_key_" #n "]\n" \
"aese %[tmp].16b, %[tmp2].16b\n" \
"aesmc %[tmp].16b, %[tmp].16b\n"
#define AES_ENC_FINAL_ROUND() \
"ldr %q[tmp2], %[round_key_second_last]\n" \
"aese %[tmp].16b, %[tmp2].16b\n" \
"ldr %q[tmp2], %[round_key_last]\n" \
"eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
#define AES_ENC_INPUT_ROUND_KEY(num_rounds, n) \
[round_key_##n]"m"(ctx->round_keys[(n-1)])
#define AES_ENC_INPUT_LAST_ROUND_KEYS(num_rounds) \
[round_key_second_last]"m"(ctx->round_keys[(num_rounds - 1)]), \
[round_key_last]"m"(ctx->round_keys[(num_rounds)])
/* Helper macros to do AES decryption, via inline asm. */
#define AES_DEC_ROUND(n) \
"ldr %q[tmp2], %[round_key_" #n "]\n" \
"aesd %[tmp].16b, %[tmp2].16b\n" \
"aesimc %[tmp].16b, %[tmp].16b\n"
#define AES_DEC_FINAL_ROUND() \
"ldr %q[tmp2], %[round_key_second_last]\n" \
"aesd %[tmp].16b, %[tmp2].16b\n" \
"ldr %q[tmp2], %[round_key_last]\n" \
"eor %[tmp].16b, %[tmp].16b, %[tmp2].16b"
#define AES_DEC_INPUT_ROUND_KEY(num_rounds, n) \
[round_key_##n]"m"(ctx->round_keys[(num_rounds + 1 - n)])
#define AES_DEC_INPUT_LAST_ROUND_KEYS(num_rounds) \
[round_key_second_last]"m"(ctx->round_keys[1]), \
[round_key_last]"m"(ctx->round_keys[0])
constexpr const u8 RoundKeyRcon0[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
};
constexpr const u8 SubBytesTable[0x100] = {
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
};
constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
constexpr u32 SubBytesAndRotate(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
}
constexpr u32 SubBytes(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
}
@@ -34,78 +130,360 @@ namespace ams::crypto::impl {
template<size_t KeySize>
void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
static_assert(IsSupportedKeySize(KeySize));
/* Check pre-conditions. */
AMS_ASSERT(key_size == KeySize);
AMS_UNUSED(key_size);
if constexpr (KeySize == 16) {
/* Aes 128. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
aes128ContextCreate(reinterpret_cast<Aes128Context *>(m_round_keys), key, is_encrypt);
} else if constexpr (KeySize == 24) {
/* Aes 192. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
aes192ContextCreate(reinterpret_cast<Aes192Context *>(m_round_keys), key, is_encrypt);
} else if constexpr (KeySize == 32) {
/* Aes 256. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
aes256ContextCreate(reinterpret_cast<Aes256Context *>(m_round_keys), key, is_encrypt);
} else {
/* Invalid key size. */
static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
/* Set up key. */
u32 *dst = m_round_keys;
std::memcpy(dst, key, KeySize);
/* Perform key scheduling. */
constexpr auto InitialKeyWords = KeySize / sizeof(u32);
u32 tmp = dst[InitialKeyWords - 1];
for (auto i = InitialKeyWords; i < (RoundCount + 1) * 4; ++i) {
const auto idx_in_key = i % InitialKeyWords;
if (idx_in_key == 0) {
/* At start of key word, we need to handle sub/rotate/rcon. */
tmp = SubBytesAndRotate(tmp);
tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
} else if ((InitialKeyWords > 6) && idx_in_key == 4) {
/* Halfway into a 256-bit key word, we need to do an additional subbytes. */
tmp = SubBytes(tmp);
}
/* Set the key word. */
tmp ^= dst[i - InitialKeyWords];
dst[i] = tmp;
}
/* If decrypting, perform inverse mix columns on all round keys. */
if (!is_encrypt) {
auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
for (auto i = 1; i < RoundCount; ++i) {
vst1q_u8(key8, vaesimcq_u8(vld1q_u8(key8)));
key8 += BlockSize;
}
}
}
template<size_t KeySize>
void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(src_size >= BlockSize);
AMS_ASSERT(dst_size >= BlockSize);
AMS_UNUSED(src_size, dst_size);
if constexpr (KeySize == 16) {
/* Aes 128. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
aes128EncryptBlock(reinterpret_cast<const Aes128Context *>(m_round_keys), dst, src);
} else if constexpr (KeySize == 24) {
/* Aes 192. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
aes192EncryptBlock(reinterpret_cast<const Aes192Context *>(m_round_keys), dst, src);
} else if constexpr (KeySize == 32) {
/* Aes 256. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
aes256EncryptBlock(reinterpret_cast<const Aes256Context *>(m_round_keys), dst, src);
} else {
/* Invalid key size. */
static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
/* Get the key. */
const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys);
/* Read the block. */
uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
/* Encrypt block. */
for (auto round = 1; round < RoundCount; ++round) {
/* Do aes round. */
block = vaeseq_u8(block, vld1q_u8(key8));
key8 += BlockSize;
/* Do mix columns. */
block = vaesmcq_u8(block);
}
/* Do last aes round. */
block = vaeseq_u8(block, vld1q_u8(key8));
key8 += BlockSize;
/* Add the final round key. */
block = veorq_u8(block, vld1q_u8(key8));
/* Store the block. */
vst1q_u8(static_cast<u8 *>(dst), block);
}
template<size_t KeySize>
void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(src_size >= BlockSize);
AMS_ASSERT(dst_size >= BlockSize);
AMS_UNUSED(src_size, dst_size);
if constexpr (KeySize == 16) {
/* Aes 128. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes128Context));
aes128DecryptBlock(reinterpret_cast<const Aes128Context *>(m_round_keys), dst, src);
} else if constexpr (KeySize == 24) {
/* Aes 192. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes192Context));
aes192DecryptBlock(reinterpret_cast<const Aes192Context *>(m_round_keys), dst, src);
} else if constexpr (KeySize == 32) {
/* Aes 256. */
static_assert(sizeof(m_round_keys) == sizeof(::Aes256Context));
aes256DecryptBlock(reinterpret_cast<const Aes256Context *>(m_round_keys), dst, src);
} else {
/* Invalid key size. */
static_assert(!std::is_same<AesImpl<KeySize>, AesImpl<KeySize>>::value);
/* Get the key. */
const u8 *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount + BlockSize);
/* Read the block. */
uint8x16_t block = vld1q_u8(static_cast<const u8 *>(src));
/* Encrypt block. */
for (auto round = RoundCount; round > 1; --round) {
/* Do aes round. */
block = vaesdq_u8(block, vld1q_u8(key8));
key8 -= BlockSize;
/* Do mix columns. */
block = vaesimcq_u8(block);
}
/* Do last aes round. */
block = vaesdq_u8(block, vld1q_u8(key8));
key8 -= BlockSize;
/* Add the first round key. */
block = veorq_u8(block, vld1q_u8(key8));
/* Store the block. */
vst1q_u8(static_cast<u8 *>(dst), block);
}
/* Specializations when building specifically for cortex-a57 (or for apple M* processors). */
#if defined(ATMOSPHERE_CPU_CORTEX_A57) || defined(ATMOSPHERE_OS_MACOS)
namespace {
template<size_t KeySize>
struct RoundKeyHelper {
u8 round_keys[AesImpl<KeySize>::RoundCount + 1][AesImpl<KeySize>::BlockSize];
};
}
template<>
void AesImpl<16>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<24>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_ROUND(10)
AES_ENC_ROUND(11)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<32>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_ENC_ROUND(1)
AES_ENC_ROUND(2)
AES_ENC_ROUND(3)
AES_ENC_ROUND(4)
AES_ENC_ROUND(5)
AES_ENC_ROUND(6)
AES_ENC_ROUND(7)
AES_ENC_ROUND(8)
AES_ENC_ROUND(9)
AES_ENC_ROUND(10)
AES_ENC_ROUND(11)
AES_ENC_ROUND(12)
AES_ENC_ROUND(13)
AES_ENC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_ENC_INPUT_ROUND_KEY(RoundCount, 1),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 2),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 3),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 4),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 5),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 6),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 7),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 8),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 9),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 10),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 11),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 12),
AES_ENC_INPUT_ROUND_KEY(RoundCount, 13),
AES_ENC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<16>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<24>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_ROUND(10)
AES_DEC_ROUND(11)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
template<>
void AesImpl<32>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
AMS_UNUSED(src_size, dst_size);
/* Setup for asm */
AES_ENC_DEC_SETUP_VARS();
/* Use optimized assembly to do all rounds. */
__asm__ __volatile__ (
AES_DEC_ROUND(1)
AES_DEC_ROUND(2)
AES_DEC_ROUND(3)
AES_DEC_ROUND(4)
AES_DEC_ROUND(5)
AES_DEC_ROUND(6)
AES_DEC_ROUND(7)
AES_DEC_ROUND(8)
AES_DEC_ROUND(9)
AES_DEC_ROUND(10)
AES_DEC_ROUND(11)
AES_DEC_ROUND(12)
AES_DEC_ROUND(13)
AES_DEC_FINAL_ROUND()
: AES_ENC_DEC_OUTPUT_VARS()
: AES_DEC_INPUT_ROUND_KEY(RoundCount, 1),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 2),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 3),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 4),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 5),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 6),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 7),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 8),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 9),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 10),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 11),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 12),
AES_DEC_INPUT_ROUND_KEY(RoundCount, 13),
AES_DEC_INPUT_LAST_ROUND_KEYS(RoundCount)
);
/* Store result. */
AES_ENC_DEC_STORE_RESULT();
}
#endif
/* Explicitly instantiate the three supported key sizes. */
template class AesImpl<16>;

View File

@@ -0,0 +1,435 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_aes_impl.arch.x64.hpp"
namespace ams::crypto::impl {
namespace {
constexpr bool IsSupportedKeySize(size_t size) {
return size == 16 || size == 24 || size == 32;
}
constexpr int WordsPerBlock = AesImpl<16>::BlockSize / sizeof(u32);
static_assert(AesImpl<16>::BlockSize == AesImpl<24>::BlockSize);
static_assert(AesImpl<16>::BlockSize == AesImpl<32>::BlockSize);
bool GetAesNiAvailabilityImpl() {
/* Call cpu id. */
int a = 0, b = 0, c = 0, d = 0;
__asm__ __volatile__("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(1) : "memory");
/* Check for AES-NI and SSE2. */
return (c & (1 << 25)) && (d & (1 << 26));
}
static_assert(util::IsLittleEndian());
constexpr const u8 RoundKeyRcon0[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F,
0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4, 0xB3, 0x7D, 0xFA, 0xEF, 0xC5, 0x91,
};
constexpr const u8 SubBytesTable[0x100] = {
0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
};
constexpr const u8 InvSubBytesTable[0x100] = {
0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D,
};
constexpr bool IsSubBytesTableValid() {
for (size_t i = 0; i < 0x100; ++i) {
if (SubBytesTable[InvSubBytesTable[i]] != i) {
return false;
}
if (InvSubBytesTable[SubBytesTable[i]] != i) {
return false;
}
}
return true;
}
static_assert(IsSubBytesTableValid());
constexpr const u32 EncryptTable[0x100] = {
0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C,
};
constexpr const u32 DecryptTable[0x100] = {
0x50A7F451, 0x5365417E, 0xC3A4171A, 0x965E273A, 0xCB6BAB3B, 0xF1459D1F, 0xAB58FAAC, 0x9303E34B,
0x55FA3020, 0xF66D76AD, 0x9176CC88, 0x254C02F5, 0xFCD7E54F, 0xD7CB2AC5, 0x80443526, 0x8FA362B5,
0x495AB1DE, 0x671BBA25, 0x980EEA45, 0xE1C0FE5D, 0x02752FC3, 0x12F04C81, 0xA397468D, 0xC6F9D36B,
0xE75F8F03, 0x959C9215, 0xEB7A6DBF, 0xDA595295, 0x2D83BED4, 0xD3217458, 0x2969E049, 0x44C8C98E,
0x6A89C275, 0x78798EF4, 0x6B3E5899, 0xDD71B927, 0xB64FE1BE, 0x17AD88F0, 0x66AC20C9, 0xB43ACE7D,
0x184ADF63, 0x82311AE5, 0x60335197, 0x457F5362, 0xE07764B1, 0x84AE6BBB, 0x1CA081FE, 0x942B08F9,
0x58684870, 0x19FD458F, 0x876CDE94, 0xB7F87B52, 0x23D373AB, 0xE2024B72, 0x578F1FE3, 0x2AAB5566,
0x0728EBB2, 0x03C2B52F, 0x9A7BC586, 0xA50837D3, 0xF2872830, 0xB2A5BF23, 0xBA6A0302, 0x5C8216ED,
0x2B1CCF8A, 0x92B479A7, 0xF0F207F3, 0xA1E2694E, 0xCDF4DA65, 0xD5BE0506, 0x1F6234D1, 0x8AFEA6C4,
0x9D532E34, 0xA055F3A2, 0x32E18A05, 0x75EBF6A4, 0x39EC830B, 0xAAEF6040, 0x069F715E, 0x51106EBD,
0xF98A213E, 0x3D06DD96, 0xAE053EDD, 0x46BDE64D, 0xB58D5491, 0x055DC471, 0x6FD40604, 0xFF155060,
0x24FB9819, 0x97E9BDD6, 0xCC434089, 0x779ED967, 0xBD42E8B0, 0x888B8907, 0x385B19E7, 0xDBEEC879,
0x470A7CA1, 0xE90F427C, 0xC91E84F8, 0x00000000, 0x83868009, 0x48ED2B32, 0xAC70111E, 0x4E725A6C,
0xFBFF0EFD, 0x5638850F, 0x1ED5AE3D, 0x27392D36, 0x64D90F0A, 0x21A65C68, 0xD1545B9B, 0x3A2E3624,
0xB1670A0C, 0x0FE75793, 0xD296EEB4, 0x9E919B1B, 0x4FC5C080, 0xA220DC61, 0x694B775A, 0x161A121C,
0x0ABA93E2, 0xE52AA0C0, 0x43E0223C, 0x1D171B12, 0x0B0D090E, 0xADC78BF2, 0xB9A8B62D, 0xC8A91E14,
0x8519F157, 0x4C0775AF, 0xBBDD99EE, 0xFD607FA3, 0x9F2601F7, 0xBCF5725C, 0xC53B6644, 0x347EFB5B,
0x7629438B, 0xDCC623CB, 0x68FCEDB6, 0x63F1E4B8, 0xCADC31D7, 0x10856342, 0x40229713, 0x2011C684,
0x7D244A85, 0xF83DBBD2, 0x1132F9AE, 0x6DA129C7, 0x4B2F9E1D, 0xF330B2DC, 0xEC52860D, 0xD0E3C177,
0x6C16B32B, 0x99B970A9, 0xFA489411, 0x2264E947, 0xC48CFCA8, 0x1A3FF0A0, 0xD82C7D56, 0xEF903322,
0xC74E4987, 0xC1D138D9, 0xFEA2CA8C, 0x360BD498, 0xCF81F5A6, 0x28DE7AA5, 0x268EB7DA, 0xA4BFAD3F,
0xE49D3A2C, 0x0D927850, 0x9BCC5F6A, 0x62467E54, 0xC2138DF6, 0xE8B8D890, 0x5EF7392E, 0xF5AFC382,
0xBE805D9F, 0x7C93D069, 0xA92DD56F, 0xB31225CF, 0x3B99ACC8, 0xA77D1810, 0x6E639CE8, 0x7BBB3BDB,
0x097826CD, 0xF418596E, 0x01B79AEC, 0xA89A4F83, 0x656E95E6, 0x7EE6FFAA, 0x08CFBC21, 0xE6E815EF,
0xD99BE7BA, 0xCE366F4A, 0xD4099FEA, 0xD67CB029, 0xAFB2A431, 0x31233F2A, 0x3094A5C6, 0xC066A235,
0x37BC4E74, 0xA6CA82FC, 0xB0D090E0, 0x15D8A733, 0x4A9804F1, 0xF7DAEC41, 0x0E50CD7F, 0x2FF69117,
0x8DD64D76, 0x4DB0EF43, 0x544DAACC, 0xDF0496E4, 0xE3B5D19E, 0x1B886A4C, 0xB81F2CC1, 0x7F516546,
0x04EA5E9D, 0x5D358C01, 0x737487FA, 0x2E410BFB, 0x5A1D67B3, 0x52D2DB92, 0x335610E9, 0x1347D66D,
0x8C61D79A, 0x7A0CA137, 0x8E14F859, 0x893C13EB, 0xEE27A9CE, 0x35C961B7, 0xEDE51CE1, 0x3CB1477A,
0x59DFD29C, 0x3F73F255, 0x79CE1418, 0xBF37C773, 0xEACDF753, 0x5BAAFD5F, 0x146F3DDF, 0x86DB4478,
0x81F3AFCA, 0x3EC468B9, 0x2C342438, 0x5F40A3C2, 0x72C31D16, 0x0C25E2BC, 0x8B493C28, 0x41950DFF,
0x7101A839, 0xDEB30C08, 0x9CE4B4D8, 0x90C15664, 0x6184CB7B, 0x70B632D5, 0x745C6C48, 0x4257B8D0,
};
constexpr auto AesWordByte0Shift = 0 * BITSIZEOF(u8);
constexpr auto AesWordByte1Shift = 1 * BITSIZEOF(u8);
constexpr auto AesWordByte2Shift = 2 * BITSIZEOF(u8);
constexpr auto AesWordByte3Shift = 3 * BITSIZEOF(u8);
constexpr auto AesMixShift = 3 * BITSIZEOF(u8);
constexpr void InverseMixColumns(u32 *dst, const u32 *src) {
for (auto i = 0; i < WordsPerBlock; ++i) {
const u32 v0 = src[i];
const u32 v1 = (((v0 & 0x7F7F7F7Fu) << 1) ^ (((v0 & 0x80808080) >> 7) * 0x1B));
const u32 v2 = (((v1 & 0x7F7F7F7Fu) << 1) ^ (((v1 & 0x80808080) >> 7) * 0x1B));
const u32 v3 = (((v2 & 0x7F7F7F7Fu) << 1) ^ (((v2 & 0x80808080) >> 7) * 0x1B));
u32 v = v0 ^ v3;
v ^= util::RotateLeft(v, AesMixShift) ^ v2;
v ^= util::RotateLeft(v, AesMixShift) ^ v1;
v ^= util::RotateLeft(v, AesMixShift) ^ v0;
dst[i] = v;
}
}
constexpr u32 SubBytesAndRotate(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte3Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte2Shift);
}
constexpr u32 SubBytes(u32 v) {
return (static_cast<u32>(SubBytesTable[(v >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
constexpr u32 ShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
return (util::RotateLeft(static_cast<u32>(EncryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
(util::RotateLeft(static_cast<u32>(EncryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
}
constexpr u32 ShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
return (static_cast<u32>(SubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(SubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(SubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(SubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
constexpr u32 InvShiftSubMix(u32 v0, u32 v1, u32 v2, u32 v3) {
return (util::RotateLeft(static_cast<u32>(DecryptTable[(v0 >> AesWordByte0Shift) & 0xFFu]), AesWordByte0Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v1 >> AesWordByte1Shift) & 0xFFu]), AesWordByte1Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v2 >> AesWordByte2Shift) & 0xFFu]), AesWordByte2Shift)) ^
(util::RotateLeft(static_cast<u32>(DecryptTable[(v3 >> AesWordByte3Shift) & 0xFFu]), AesWordByte3Shift));
}
constexpr u32 InvShiftSub(u32 v0, u32 v1, u32 v2, u32 v3) {
return (static_cast<u32>(InvSubBytesTable[(v0 >> AesWordByte0Shift) & 0xFFu]) << AesWordByte0Shift) ^
(static_cast<u32>(InvSubBytesTable[(v1 >> AesWordByte1Shift) & 0xFFu]) << AesWordByte1Shift) ^
(static_cast<u32>(InvSubBytesTable[(v2 >> AesWordByte2Shift) & 0xFFu]) << AesWordByte2Shift) ^
(static_cast<u32>(InvSubBytesTable[(v3 >> AesWordByte3Shift) & 0xFFu]) << AesWordByte3Shift);
}
}
const bool g_is_aes_ni_available = GetAesNiAvailabilityImpl();
template<size_t KeySize>
AesImpl<KeySize>::~AesImpl() {
ClearMemory(this, sizeof(*this));
}
template<size_t KeySize>
void AesImpl<KeySize>::Initialize(const void *key, size_t key_size, bool is_encrypt) {
/* Check pre-conditions. */
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(key != nullptr);
AMS_ASSERT(key_size == KeySize);
AMS_UNUSED(key_size);
/* Set up key. */
u32 *dst = m_round_keys;
std::memcpy(dst, key, KeySize);
/* Perform key scheduling. */
constexpr auto InitialKeyWords = KeySize / sizeof(u32);
u32 tmp = dst[InitialKeyWords - 1];
for (auto i = InitialKeyWords; i < (RoundCount + 1) * 4; ++i) {
const auto idx_in_key = i % InitialKeyWords;
if (idx_in_key == 0) {
/* At start of key word, we need to handle sub/rotate/rcon. */
tmp = SubBytesAndRotate(tmp);
tmp ^= (RoundKeyRcon0[i / InitialKeyWords - 1] << AesWordByte0Shift);
} else if ((InitialKeyWords > 6) && idx_in_key == 4) {
/* Halfway into a 256-bit key word, we need to do an additional subbytes. */
tmp = SubBytes(tmp);
}
/* Set the key word. */
tmp ^= dst[i - InitialKeyWords];
dst[i] = tmp;
}
/* If decrypting, perform inverse mix columns on all round keys. */
if (!is_encrypt) {
if (IsAesNiAvailable()) {
auto *key8 = reinterpret_cast<u8 *>(m_round_keys) + BlockSize;
for (auto i = 1; i < RoundCount; ++i) {
auto * const key128 = reinterpret_cast<__m128i *>(key8);
_mm_storeu_si128(key128, _mm_aesimc_si128(_mm_loadu_si128(key128)));
key8 += BlockSize;
}
} else {
for (auto i = 1; i < RoundCount; ++i) {
InverseMixColumns(m_round_keys + WordsPerBlock * i, m_round_keys + WordsPerBlock * i);
}
}
}
}
template<size_t KeySize>
void AesImpl<KeySize>::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
AMS_UNUSED(dst_size, src_size);
/* Perform block encryption. */
if (IsAesNiAvailable()) {
const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys);
/* Load the block. */
auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
/* Add the first round key. */
block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 += BlockSize;
/* Perform aes round on remaining round keys. */
for (auto i = 1; i < RoundCount; ++i) {
block = _mm_aesenc_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 += BlockSize;
}
/* Do final update. */
block = _mm_aesenclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
/* Store the output. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
} else {
static_assert(WordsPerBlock == 4);
/* Without AES-NI, we'll operate on words. */
const u32 *key32 = m_round_keys;
const u32 *src32 = static_cast<const u32 *>(src);
u32 *dst32 = static_cast< u32 *>(dst);
/* Add the first round key. */
u32 v0 = src32[0] ^ key32[0];
u32 v1 = src32[1] ^ key32[1];
u32 v2 = src32[2] ^ key32[2];
u32 v3 = src32[3] ^ key32[3];
key32 += 4;
/* Perform each round. */
auto round = RoundCount;
while (--round > 0) {
/* Perform aes round. */
const u32 e0 = ShiftSubMix(v0, v1, v2, v3);
const u32 e1 = ShiftSubMix(v1, v2, v3, v0);
const u32 e2 = ShiftSubMix(v2, v3, v0, v1);
const u32 e3 = ShiftSubMix(v3, v0, v1, v2);
/* Add the round key. */
v0 = e0 ^ key32[0];
v1 = e1 ^ key32[1];
v2 = e2 ^ key32[2];
v3 = e3 ^ key32[3];
key32 += 4;
}
/* Perform the final round. */
dst32[0] = key32[0] ^ ShiftSub(v0, v1, v2, v3);
dst32[1] = key32[1] ^ ShiftSub(v1, v2, v3, v0);
dst32[2] = key32[2] ^ ShiftSub(v2, v3, v0, v1);
dst32[3] = key32[3] ^ ShiftSub(v3, v0, v1, v2);
}
}
template<size_t KeySize>
void AesImpl<KeySize>::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const {
static_assert(IsSupportedKeySize(KeySize));
AMS_ASSERT(dst_size == BlockSize && src_size == BlockSize);
AMS_UNUSED(dst_size, src_size);
/* Perform block decryption. */
if (IsAesNiAvailable()) {
const auto *key8 = reinterpret_cast<const u8 *>(m_round_keys) + (RoundCount * BlockSize);
/* Load the block. */
auto block = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
/* Add the final round key. */
block = _mm_xor_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 -= BlockSize;
/* Perform aes invround on remaining round keys. */
for (auto i = RoundCount; i > 1; --i) {
block = _mm_aesdec_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
key8 -= BlockSize;
}
/* Do final update. */
block = _mm_aesdeclast_si128(block, _mm_loadu_si128(reinterpret_cast<const __m128i *>(key8)));
/* Store the output. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), block);
} else {
static_assert(WordsPerBlock == 4);
/* Without AES-NI, we'll operate on words. */
const u32 *key32 = m_round_keys + WordsPerBlock * RoundCount;
const u32 *src32 = static_cast<const u32 *>(src);
u32 *dst32 = static_cast< u32 *>(dst);
/* Add the final round key. */
u32 v0 = src32[0] ^ key32[0];
u32 v1 = src32[1] ^ key32[1];
u32 v2 = src32[2] ^ key32[2];
u32 v3 = src32[3] ^ key32[3];
key32 -= 4;
/* Perform each round. */
auto round = RoundCount;
while (--round > 0) {
/* Perform aes inv round. */
const u32 e0 = InvShiftSubMix(v0, v3, v2, v1);
const u32 e1 = InvShiftSubMix(v1, v0, v3, v2);
const u32 e2 = InvShiftSubMix(v2, v1, v0, v3);
const u32 e3 = InvShiftSubMix(v3, v2, v1, v0);
/* Add the round key. */
v0 = e0 ^ key32[0];
v1 = e1 ^ key32[1];
v2 = e2 ^ key32[2];
v3 = e3 ^ key32[3];
key32 -= 4;
}
/* Perform the final round. */
dst32[0] = key32[0] ^ InvShiftSub(v0, v3, v2, v1);
dst32[1] = key32[1] ^ InvShiftSub(v1, v0, v3, v2);
dst32[2] = key32[2] ^ InvShiftSub(v2, v1, v0, v3);
dst32[3] = key32[3] ^ InvShiftSub(v3, v2, v1, v0);
}
}
/* Explicitly instantiate the three supported key sizes. */
template class AesImpl<16>;
template class AesImpl<24>;
template class AesImpl<32>;
}

View File

@@ -0,0 +1,28 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vapours.hpp>
#include <x86intrin.h>
namespace ams::crypto::impl {
extern const bool g_is_aes_ni_available;
ALWAYS_INLINE bool IsAesNiAvailable() {
return g_is_aes_ni_available;
}
}

View File

@@ -45,7 +45,7 @@ namespace ams::crypto::impl {
return static_cast<BigNum::Word>(half) << BITSIZEOF(BigNum::HalfWord);
}
constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
[[maybe_unused]] constexpr ALWAYS_INLINE BigNum::Word ToLowerHalf(BigNum::HalfWord half) {
static_assert(sizeof(BigNum::Word) == sizeof(BigNum::HalfWord) * 2);
return static_cast<BigNum::Word>(half);
}
@@ -422,4 +422,69 @@ namespace ams::crypto::impl {
return true;
}
#if !defined(ATMOSPHERE_ARCH_ARM64)
BigNum::Word BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
Word carry = 0;
for (size_t i = 0; i < num_words; ++i) {
Word v;
if ((v = lhs[i] + carry) < carry) {
v = rhs[i];
} else if ((v += rhs[i]) < rhs[i]) {
carry = 1;
} else {
carry = 0;
}
dst[i] = v;
}
return carry;
}
BigNum::Word BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) {
Word borrow = 0;
for (size_t i = 0; i < num_words; ++i) {
Word v;
if ((v = lhs[i] - borrow) > (BigNum::MaxWord - borrow)) {
v = BigNum::MaxWord - rhs[i];
} else if ((v -= rhs[i]) > (BigNum::MaxWord - rhs[i])) {
borrow = 1;
} else {
borrow = 0;
}
dst[i] = v;
}
return borrow;
}
BigNum::Word BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) {
/* If multiplying by zero, nothing to do. */
if (mult == 0) {
return 0;
}
Word carry = 0, work[2];
for (size_t i = 0; i < num_words; i++) {
/* Multiply, calculate carry for next. */
MultWord(work, mult, w[i]);
if ((dst[i] += carry) < carry) {
carry = 1;
} else {
carry = 0;
}
if ((dst[i] += work[0]) < work[0]) {
carry++;
}
carry += work[1];
}
return carry;
}
#endif
}

View File

@@ -15,9 +15,14 @@
*/
/* ams::crypto::impl::BigNum::Add(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
.type _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum3AddEPjPKjS5_m
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum3AddEPjPKjPKjm:
/* Check if we have anything to do at all. */
@@ -106,9 +111,14 @@ _ZN3ams6crypto4impl6BigNum3AddEPjPKjPKjm:
ret
/* ams::crypto::impl::BigNum::Sub(Word *dst, const Word *lhs, const Word *rhs, size_t num_words) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
.type _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
/* Check if we have anything to do at all. */
@@ -198,9 +208,14 @@ _ZN3ams6crypto4impl6BigNum3SubEPjPKjS5_m:
ret
/* ams::crypto::impl::BigNum::MultAdd(Word *dst, const Word *w, size_t num_words, Word mult) */
#if !defined(ATMOSPHERE_OS_MACOS)
.section .text._ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, "ax", %progbits
.global _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
.type _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj, %function
#else
.text
.global _ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj
#endif
.balign 0x10
_ZN3ams6crypto4impl6BigNum7MultAddEPjPKjmj:
/* Check if we have anything to do at all. */

View File

@@ -114,11 +114,11 @@ namespace ams::crypto::impl {
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor128::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor128::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor128::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -178,11 +178,11 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor128::BlockSize;
vst1q_u8(dst, tmp1);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor128::BlockSize;
vst1q_u8(dst, tmp2);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor128::BlockSize;
num_blocks -= 3;
}
@@ -191,7 +191,7 @@ namespace ams::crypto::impl {
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor128::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
@@ -232,7 +232,7 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor128::BlockSize;
num_blocks--;
}
@@ -270,11 +270,11 @@ namespace ams::crypto::impl {
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor192::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor192::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor192::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -338,11 +338,11 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor192::BlockSize;
vst1q_u8(dst, tmp1);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor192::BlockSize;
vst1q_u8(dst, tmp2);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor192::BlockSize;
num_blocks -= 3;
}
@@ -351,7 +351,7 @@ namespace ams::crypto::impl {
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor192::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
@@ -396,7 +396,7 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor192::BlockSize;
num_blocks--;
}
@@ -436,11 +436,11 @@ namespace ams::crypto::impl {
while (num_blocks >= 3) {
/* Read blocks in. Keep them in registers for XOR later. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor256::BlockSize;
const uint8x16_t block1 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor256::BlockSize;
const uint8x16_t block2 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor256::BlockSize;
/* We'll be encrypting the three CTRs. */
uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2;
@@ -509,11 +509,11 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor256::BlockSize;
vst1q_u8(dst, tmp1);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor256::BlockSize;
vst1q_u8(dst, tmp2);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor256::BlockSize;
num_blocks -= 3;
}
@@ -522,7 +522,7 @@ namespace ams::crypto::impl {
while (num_blocks >= 1) {
/* Read block in, keep in register for XOR. */
const uint8x16_t block0 = vld1q_u8(src);
src += AES_BLOCK_SIZE;
src += AesEncryptor256::BlockSize;
/* We'll be encrypting the CTR. */
uint8x16_t tmp0 = ctr0;
@@ -571,7 +571,7 @@ namespace ams::crypto::impl {
/* Store to output. */
vst1q_u8(dst, tmp0);
dst += AES_BLOCK_SIZE;
dst += AesEncryptor256::BlockSize;
num_blocks--;
}

View File

@@ -0,0 +1,269 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
#include "crypto_aes_impl.arch.x64.hpp"
namespace ams::crypto::impl {
template<> void CtrModeImpl<AesEncryptor128>::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) {
/* Check pre-conditions. */
AMS_ASSERT(src != nullptr);
AMS_ASSERT(dst != nullptr);
/* If we have aes-ni, use an optimized impl. */
if (IsAesNiAvailable()) {
/* Load all keys into sse2 registers. */
const u8 *raw_round_keys = m_block_cipher->GetRoundKey();
const __m128i round_keys[AesEncryptor128::RoundKeySize / BlockSize] = {
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 0)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 1)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 2)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 3)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 4)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 5)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 6)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 7)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 8)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 9)),
_mm_loadu_si128(reinterpret_cast<const __m128i *>(raw_round_keys + BlockSize * 10)),
};
static_assert(AesEncryptor128::RoundKeySize / BlockSize == 11);
/* Declare constant for counter math. */
const __m128i One = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
/* Process eight blocks at a time, while we can. */
constexpr const auto UnrolledBlockCount = 8;
constexpr const auto CounterThreshold = static_cast<u8>(0x100 - UnrolledBlockCount);
/* Load the counter. */
auto counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
size_t cur_blocks;
for (cur_blocks = 0; cur_blocks + UnrolledBlockCount <= num_blocks; cur_blocks += UnrolledBlockCount) {
__m128i b0;
__m128i b1;
__m128i b2;
__m128i b3;
__m128i b4;
__m128i b5;
__m128i b6;
__m128i b7;
__m128i key = round_keys[0];
/* Get the last byte of the block. */
static_assert(util::IsLittleEndian());
const u8 counter_val = _mm_extract_epi16(counter, 7) >> BITSIZEOF(u8);
/* Do initial encryption of each block. */
if (CounterThreshold <= counter_val) {
/* We'll overwrap, so take slow path for counter. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
b0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b4 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b5 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b6 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
b7 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter)), key);
this->IncrementCounter();
counter = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
} else {
/* We can take the fast path for the counter. */
b0 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b1 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b2 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b3 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b4 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b5 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b6 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
b7 = _mm_xor_si128(counter, key);
counter = _mm_add_epi64(counter, One);
}
/* Do encryption for all rounds. */
key = round_keys[1];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[2];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[3];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[4];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[5];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[6];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[7];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[8];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[9];
b0 = _mm_aesenc_si128(b0, key);
b1 = _mm_aesenc_si128(b1, key);
b2 = _mm_aesenc_si128(b2, key);
b3 = _mm_aesenc_si128(b3, key);
b4 = _mm_aesenc_si128(b4, key);
b5 = _mm_aesenc_si128(b5, key);
b6 = _mm_aesenc_si128(b6, key);
b7 = _mm_aesenc_si128(b7, key);
key = round_keys[10];
b0 = _mm_aesenclast_si128(b0, key);
b1 = _mm_aesenclast_si128(b1, key);
b2 = _mm_aesenclast_si128(b2, key);
b3 = _mm_aesenclast_si128(b3, key);
b4 = _mm_aesenclast_si128(b4, key);
b5 = _mm_aesenclast_si128(b5, key);
b6 = _mm_aesenclast_si128(b6, key);
b7 = _mm_aesenclast_si128(b7, key);
/* Write the blocks. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 0), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 0)), b0));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 1), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 1)), b1));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 2), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 2)), b2));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 3), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 3)), b3));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 4), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 4)), b4));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 5), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 5)), b5));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 6), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 6)), b6));
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + BlockSize * 7), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src + BlockSize * 7)), b7));
src += BlockSize * UnrolledBlockCount;
dst += BlockSize * UnrolledBlockCount;
}
/* Store the updated counter. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(m_counter), counter);
/* Process blocks one at a time. */
for (/* ... */; cur_blocks < num_blocks; ++cur_blocks) {
/* Load current counter. */
__m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(m_counter));
/* Do aes rounds. */
b = _mm_xor_si128(b, round_keys[0]);
b = _mm_aesenc_si128(b, round_keys[1]);
b = _mm_aesenc_si128(b, round_keys[2]);
b = _mm_aesenc_si128(b, round_keys[3]);
b = _mm_aesenc_si128(b, round_keys[4]);
b = _mm_aesenc_si128(b, round_keys[5]);
b = _mm_aesenc_si128(b, round_keys[6]);
b = _mm_aesenc_si128(b, round_keys[7]);
b = _mm_aesenc_si128(b, round_keys[8]);
b = _mm_aesenc_si128(b, round_keys[9]);
b = _mm_aesenclast_si128(b, round_keys[10]);
/* Write the block. */
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)), b));
/* Advance. */
src += BlockSize;
dst += BlockSize;
this->IncrementCounter();
}
} else {
/* Fall back to the default implementation. */
while (num_blocks--) {
this->ProcessBlock(dst, src, BlockSize);
dst += BlockSize;
src += BlockSize;
}
}
}
}

View File

@@ -15,31 +15,233 @@
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
namespace ams::crypto::impl {
#ifdef ATMOSPHERE_IS_STRATOSPHERE
namespace {
constexpr const u32 RoundConstants[4] = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
/* Define for loading work var from message. */
#define SHA1_LOAD_W_FROM_MESSAGE(which) \
w[which] = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data))); \
data += 0x10
#define SHA1_CALCULATE_W_FROM_PREVIOUS(i) \
w[i] = vsha1su1q_u32(vsha1su0q_u32(w[i-4], w[i-3], w[i-2]), w[i-1])
/* Define for doing four rounds of SHA1. */
#define SHA1_DO_ROUND(r, insn, constant) \
do { \
const u32 a = vgetq_lane_u32(cur_abcd, 0); \
cur_abcd = v##insn##q_u32(cur_abcd, cur_e, vaddq_u32(w[r], constant)); \
cur_e = vsha1h_u32(a); \
} while (0)
}
void Sha1Impl::Initialize() {
static_assert(sizeof(m_state) == sizeof(::Sha1Context));
::sha1ContextCreate(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)));
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x67452301;
m_intermediate_hash[1] = 0xEFCDAB89;
m_intermediate_hash[2] = 0x98BADCFE;
m_intermediate_hash[3] = 0x10325476;
m_intermediate_hash[4] = 0xC3D2E1F0;
/* Set state. */
m_state = State_Initialized;
}
void Sha1Impl::Update(const void *data, size_t size) {
static_assert(sizeof(m_state) == sizeof(::Sha1Context));
::sha1ContextUpdate(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)), data, size);
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
if (remaining >= BlockSize) {
const size_t blocks = remaining / BlockSize;
this->ProcessBlocks(data8, blocks);
data8 += BlockSize * blocks;
remaining -= BlockSize * blocks;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha1Impl::GetHash(void *dst, size_t size) {
static_assert(sizeof(m_state) == sizeof(::Sha1Context));
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
::sha1ContextGetHash(reinterpret_cast<::Sha1Context *>(std::addressof(m_state)), dst);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
#else
ALWAYS_INLINE void Sha1Impl::ProcessBlock(const void *data) {
return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
}
/* TODO: Non-EL0 implementation. */
void Sha1Impl::ProcessBlocks(const u8 *data, size_t block_count) {
/* Setup round constants. */
const uint32x4_t k0 = vdupq_n_u32(RoundConstants[0]);
const uint32x4_t k1 = vdupq_n_u32(RoundConstants[1]);
const uint32x4_t k2 = vdupq_n_u32(RoundConstants[2]);
const uint32x4_t k3 = vdupq_n_u32(RoundConstants[3]);
#endif
/* Load hash variables with intermediate state. */
uint32x4_t cur_abcd = vld1q_u32(m_intermediate_hash + 0);
u32 cur_e = m_intermediate_hash[4];
}
/* Actually do hash processing blocks. */
do {
/* Save current state. */
const uint32x4_t prev_abcd = cur_abcd;
const u32 prev_e = cur_e;
uint32x4_t w[20];
/* Setup w[0-3] with message. */
SHA1_LOAD_W_FROM_MESSAGE(0);
SHA1_LOAD_W_FROM_MESSAGE(1);
SHA1_LOAD_W_FROM_MESSAGE(2);
SHA1_LOAD_W_FROM_MESSAGE(3);
/* Calculate w[4-19], w[i] = sha1su1(sha1su0(w[i-4], w[i-3], w[i-2]), w[i-1]); */
SHA1_CALCULATE_W_FROM_PREVIOUS(4);
SHA1_CALCULATE_W_FROM_PREVIOUS(5);
SHA1_CALCULATE_W_FROM_PREVIOUS(6);
SHA1_CALCULATE_W_FROM_PREVIOUS(7);
SHA1_CALCULATE_W_FROM_PREVIOUS(8);
SHA1_CALCULATE_W_FROM_PREVIOUS(9);
SHA1_CALCULATE_W_FROM_PREVIOUS(10);
SHA1_CALCULATE_W_FROM_PREVIOUS(11);
SHA1_CALCULATE_W_FROM_PREVIOUS(12);
SHA1_CALCULATE_W_FROM_PREVIOUS(13);
SHA1_CALCULATE_W_FROM_PREVIOUS(14);
SHA1_CALCULATE_W_FROM_PREVIOUS(15);
SHA1_CALCULATE_W_FROM_PREVIOUS(16);
SHA1_CALCULATE_W_FROM_PREVIOUS(17);
SHA1_CALCULATE_W_FROM_PREVIOUS(18);
SHA1_CALCULATE_W_FROM_PREVIOUS(19);
/* Do round calculations 0-20. Uses sha1c, k0. */
SHA1_DO_ROUND(0, sha1c, k0);
SHA1_DO_ROUND(1, sha1c, k0);
SHA1_DO_ROUND(2, sha1c, k0);
SHA1_DO_ROUND(3, sha1c, k0);
SHA1_DO_ROUND(4, sha1c, k0);
/* Do round calculations 20-40. Uses sha1p, k1. */
SHA1_DO_ROUND(5, sha1p, k1);
SHA1_DO_ROUND(6, sha1p, k1);
SHA1_DO_ROUND(7, sha1p, k1);
SHA1_DO_ROUND(8, sha1p, k1);
SHA1_DO_ROUND(9, sha1p, k1);
/* Do round calculations 40-60. Uses sha1m, k2. */
SHA1_DO_ROUND(10, sha1m, k2);
SHA1_DO_ROUND(11, sha1m, k2);
SHA1_DO_ROUND(12, sha1m, k2);
SHA1_DO_ROUND(13, sha1m, k2);
SHA1_DO_ROUND(14, sha1m, k2);
/* Do round calculations 60-80. Uses sha1p, k3. */
SHA1_DO_ROUND(15, sha1p, k3);
SHA1_DO_ROUND(16, sha1p, k3);
SHA1_DO_ROUND(17, sha1p, k3);
SHA1_DO_ROUND(18, sha1p, k3);
SHA1_DO_ROUND(19, sha1p, k3);
/* Add to previous. */
cur_abcd = vaddq_u32(cur_abcd, prev_abcd);
cur_e = cur_e + prev_e;
} while (--block_count != 0);
/* Save result to intermediate hash. */
vst1q_u32(m_intermediate_hash, cur_abcd);
m_intermediate_hash[4] = cur_e;
}
void Sha1Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}
#endif

View File

@@ -0,0 +1,225 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
constexpr const u32 RoundConstants[4] = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
return (x & y) ^ ((~x) & z);
}
constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
return (x & y) ^ (x & z) ^ (y & z);
}
constexpr ALWAYS_INLINE u32 Parity(u32 x, u32 y, u32 z) {
return x ^ y ^ z;
}
}
void Sha1Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x67452301;
m_intermediate_hash[1] = 0xEFCDAB89;
m_intermediate_hash[2] = 0x98BADCFE;
m_intermediate_hash[3] = 0x10325476;
m_intermediate_hash[4] = 0xC3D2E1F0;
/* Set state. */
m_state = State_Initialized;
}
void Sha1Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, while we have any. */
while (remaining >= BlockSize) {
this->ProcessBlock(data8);
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha1Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha1Impl::ProcessBlock(const void *data) {
/* Load work variables. */
u32 a = m_intermediate_hash[0];
u32 b = m_intermediate_hash[1];
u32 c = m_intermediate_hash[2];
u32 d = m_intermediate_hash[3];
u32 e = m_intermediate_hash[4];
u32 tmp;
size_t i;
/* Copy the input. */
u32 w[80];
if constexpr (util::IsLittleEndian()) {
static_assert(BlockSize % sizeof(u32) == 0);
const u32 *src_32 = static_cast<const u32 *>(data);
for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
w[i] = util::LoadBigEndian<u32>(src_32 + i);
}
} else {
std::memcpy(w, data, BlockSize);
}
/* Initialize the rest of w. */
for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
const u32 *prev = w + (i - BlockSize / sizeof(u32));
w[i] = util::RotateLeft<u32>(prev[0] ^ prev[2] ^ prev[8] ^ prev[13], 1);
}
/* Perform rounds. */
for (i = 0; i < 20; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Choose(b, c, d) + e + w[i] + RoundConstants[0];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 40; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[1];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 60; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Majority(b, c, d) + e + w[i] + RoundConstants[2];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
for (/* ... */; i < 80; ++i) {
tmp = util::RotateLeft<u32>(a, 5) + Parity(b, c, d) + e + w[i] + RoundConstants[3];
e = d;
d = c;
c = util::RotateLeft<u32>(b, 30);
b = a;
a = tmp;
}
/* Update intermediate hash. */
m_intermediate_hash[0] += a;
m_intermediate_hash[1] += b;
m_intermediate_hash[2] += c;
m_intermediate_hash[3] += d;
m_intermediate_hash[4] += e;
}
void Sha1Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}

View File

@@ -15,53 +15,313 @@
*/
#include <vapours.hpp>
#if defined(ATMOSPHERE_IS_STRATOSPHERE)
#include <arm_neon.h>
namespace ams::crypto::impl {
#ifdef ATMOSPHERE_IS_STRATOSPHERE
namespace {
alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
void Sha256Impl::Initialize() {
static_assert(sizeof(m_state) == sizeof(::Sha256Context));
::sha256ContextCreate(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)));
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x6A09E667;
m_intermediate_hash[1] = 0xBB67AE85;
m_intermediate_hash[2] = 0x3C6EF372;
m_intermediate_hash[3] = 0xA54FF53A;
m_intermediate_hash[4] = 0x510E527F;
m_intermediate_hash[5] = 0x9B05688C;
m_intermediate_hash[6] = 0x1F83D9AB;
m_intermediate_hash[7] = 0x5BE0CD19;
/* Set state. */
m_state = State_Initialized;
}
void Sha256Impl::Update(const void *data, size_t size) {
static_assert(sizeof(m_state) == sizeof(::Sha256Context));
::sha256ContextUpdate(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)), data, size);
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
if (remaining >= BlockSize) {
const size_t blocks = remaining / BlockSize;
this->ProcessBlocks(data8, blocks);
data8 += BlockSize * blocks;
remaining -= BlockSize * blocks;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha256Impl::GetHash(void *dst, size_t size) {
static_assert(sizeof(m_state) == sizeof(::Sha256Context));
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
::sha256ContextGetHash(reinterpret_cast<::Sha256Context *>(std::addressof(m_state)), dst);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
static_assert(sizeof(m_state) == sizeof(::Sha256Context));
/* Copy state in from the context. */
std::memcpy(m_state.intermediate_hash, context->intermediate_hash, sizeof(m_state.intermediate_hash));
m_state.bits_consumed = context->bits_consumed;
std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
m_bits_consumed = context->bits_consumed;
/* Clear the rest of state. */
std::memset(m_state.buffer, 0, sizeof(m_state.buffer));
m_state.num_buffered = 0;
m_state.finalized = false;
/* Reset other fields. */
m_buffered_bytes = 0;
m_state = State_Initialized;
}
size_t Sha256Impl::GetContext(Sha256Context *context) const {
static_assert(sizeof(m_state) == sizeof(::Sha256Context));
std::memcpy(context->intermediate_hash, m_state.intermediate_hash, sizeof(context->intermediate_hash));
context->bits_consumed = m_state.bits_consumed;
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
return m_state.num_buffered;
/* Copy out the context. */
std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
context->bits_consumed = m_bits_consumed;
return m_buffered_bytes;
}
#else
ALWAYS_INLINE void Sha256Impl::ProcessBlock(const void *data) {
return this->ProcessBlocks(static_cast<const u8 *>(data), 1);
}
/* TODO: Non-EL0 implementation. */
void Sha256Impl::ProcessBlocks(const u8 *data, size_t block_count) {
/* Load previous hash with intermediate state, current hash with zeroes. */
uint32x4_t prev_hash0 = vld1q_u32(m_intermediate_hash + 0);
uint32x4_t prev_hash1 = vld1q_u32(m_intermediate_hash + 4);
uint32x4_t cur_hash0 = vdupq_n_u32(0);
uint32x4_t cur_hash1 = vdupq_n_u32(0);
#endif
/* Process blocks. */
do {
uint32x4_t round_constant0, round_constant1;
uint32x4_t data0, data1, data2, data3;
uint32x4_t tmp0, tmp1, tmp2, tmp3;
uint32x4_t tmp_hash;
}
/* Use optimized ASM implementation to process the block. */
__asm__ __volatile__ (
"ldp %q[data0], %q[data1], [%[data]], #0x20\n"
"ldp %q[data2], %q[data3], [%[data]], #0x20\n"
"add %[cur_hash0].4s, %[cur_hash0].4s, %[prev_hash0].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x00]\n"
"add %[cur_hash1].4s, %[cur_hash1].4s, %[prev_hash1].4s\n"
"rev32 %[data0].16b, %[data0].16b\n"
"rev32 %[data1].16b, %[data1].16b\n"
"rev32 %[data2].16b, %[data2].16b\n"
"rev32 %[data3].16b, %[data3].16b\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x20]\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"mov %[prev_hash0].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"mov %[prev_hash1].16b, %[cur_hash1].16b\n"
"sha256h2 %q[cur_hash1], %q[prev_hash0], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x40]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x60]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0x80]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su0 %[data0].4s, %[data1].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xA0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"sha256su0 %[data1].4s, %[data2].4s\n"
"sha256su1 %[data0].4s, %[data2].4s, %[data3].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"sha256su0 %[data2].4s, %[data3].4s\n"
"sha256su1 %[data1].4s, %[data3].4s, %[data0].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xC0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"sha256su0 %[data3].4s, %[data0].4s\n"
"sha256su1 %[data2].4s, %[data0].4s, %[data1].4s\n"
"add %[tmp0].4s, %[data0].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
"sha256su1 %[data3].4s, %[data1].4s, %[data2].4s\n"
"add %[tmp1].4s, %[data1].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"ldp %q[round_constant0], %q[round_constant1], [%[round_constants], 0xE0]\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp0].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp0].4s\n"
"add %[tmp2].4s, %[data2].4s, %[round_constant0].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp1].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp1].4s\n"
"add %[tmp3].4s, %[data3].4s, %[round_constant1].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp2].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp2].4s\n"
"mov %[tmp_hash].16b, %[cur_hash0].16b\n"
"sha256h %q[cur_hash0], %q[cur_hash1], %[tmp3].4s\n"
"sha256h2 %q[cur_hash1], %q[tmp_hash], %[tmp3].4s\n"
: [data0]"=w"(data0), [data1]"=w"(data1), [data2]"=w"(data2), [data3]"=w"(data3),
[tmp0]"=w"(tmp0), [tmp1]"=w"(tmp1), [tmp2]"=w"(tmp2), [tmp3]"=w"(tmp3),
[round_constant0]"=w"(round_constant0), [round_constant1]"=w"(round_constant1),
[cur_hash0]"+w"(cur_hash0), [cur_hash1]"+w"(cur_hash1),
[prev_hash0]"+w"(prev_hash0), [prev_hash1]"+w"(prev_hash1),
[tmp_hash]"=w"(tmp_hash), [data]"+r"(data)
: [round_constants]"r"(RoundConstants)
:
);
} while (--block_count != 0);
/* Add hashes together, and store. */
cur_hash0 = vaddq_u32(prev_hash0, cur_hash0);
cur_hash1 = vaddq_u32(prev_hash1, cur_hash1);
vst1q_u32(m_intermediate_hash + 0, cur_hash0);
vst1q_u32(m_intermediate_hash + 4, cur_hash1);
}
void Sha256Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}
#endif

View File

@@ -0,0 +1,260 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
alignas(Sha256Impl::BlockSize) constexpr const u32 RoundConstants[0x40] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
constexpr ALWAYS_INLINE u32 Choose(u32 x, u32 y, u32 z) {
return (x & y) ^ ((~x) & z);
}
constexpr ALWAYS_INLINE u32 Majority(u32 x, u32 y, u32 z) {
return (x & y) ^ (x & z) ^ (y & z);
}
constexpr ALWAYS_INLINE u32 LargeSigma0(u32 x) {
return util::RotateRight<u32>(x, 2) ^ util::RotateRight<u32>(x, 13) ^ util::RotateRight<u32>(x, 22);
}
constexpr ALWAYS_INLINE u32 LargeSigma1(u32 x) {
return util::RotateRight<u32>(x, 6) ^ util::RotateRight<u32>(x, 11) ^ util::RotateRight<u32>(x, 25);
}
constexpr ALWAYS_INLINE u32 SmallSigma0(u32 x) {
return util::RotateRight<u32>(x, 7) ^ util::RotateRight<u32>(x, 18) ^ (x >> 3);
}
constexpr ALWAYS_INLINE u32 SmallSigma1(u32 x) {
return util::RotateRight<u32>(x, 17) ^ util::RotateRight<u32>(x, 19) ^ (x >> 10);
}
}
void Sha256Impl::Initialize() {
/* Reset buffered bytes/bits. */
m_buffered_bytes = 0;
m_bits_consumed = 0;
/* Set intermediate hash. */
m_intermediate_hash[0] = 0x6A09E667;
m_intermediate_hash[1] = 0xBB67AE85;
m_intermediate_hash[2] = 0x3C6EF372;
m_intermediate_hash[3] = 0xA54FF53A;
m_intermediate_hash[4] = 0x510E527F;
m_intermediate_hash[5] = 0x9B05688C;
m_intermediate_hash[6] = 0x1F83D9AB;
m_intermediate_hash[7] = 0x5BE0CD19;
/* Set state. */
m_state = State_Initialized;
}
void Sha256Impl::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Advance our input bit count. */
m_bits_consumed += BITSIZEOF(u8) * (((m_buffered_bytes + size) / BlockSize) * BlockSize);
/* Process anything we have buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
std::memcpy(m_buffer + m_buffered_bytes, data8, copy_size);
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock(m_buffer);
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
while (remaining >= BlockSize) {
this->ProcessBlock(data8);
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
m_buffered_bytes = remaining;
std::memcpy(m_buffer, data8, remaining);
}
}
void Sha256Impl::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
if constexpr (util::IsLittleEndian()) {
static_assert(HashSize % sizeof(u32) == 0);
u32 *dst_32 = static_cast<u32 *>(dst);
for (size_t i = 0; i < HashSize / sizeof(u32); ++i) {
dst_32[i] = util::LoadBigEndian<u32>(m_intermediate_hash + i);
}
} else {
std::memcpy(dst, m_intermediate_hash, HashSize);
}
}
void Sha256Impl::InitializeWithContext(const Sha256Context *context) {
/* Copy state in from the context. */
std::memcpy(m_intermediate_hash, context->intermediate_hash, sizeof(m_intermediate_hash));
m_bits_consumed = context->bits_consumed;
/* Reset other fields. */
m_buffered_bytes = 0;
m_state = State_Initialized;
}
size_t Sha256Impl::GetContext(Sha256Context *context) const {
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
/* Copy out the context. */
std::memcpy(context->intermediate_hash, m_intermediate_hash, sizeof(context->intermediate_hash));
context->bits_consumed = m_bits_consumed;
return m_buffered_bytes;
}
void Sha256Impl::ProcessBlock(const void *data) {
/* Load work variables. */
u32 a = m_intermediate_hash[0];
u32 b = m_intermediate_hash[1];
u32 c = m_intermediate_hash[2];
u32 d = m_intermediate_hash[3];
u32 e = m_intermediate_hash[4];
u32 f = m_intermediate_hash[5];
u32 g = m_intermediate_hash[6];
u32 h = m_intermediate_hash[7];
u32 tmp[2];
size_t i;
/* Copy the input. */
u32 w[64];
if constexpr (util::IsLittleEndian()) {
static_assert(BlockSize % sizeof(u32) == 0);
const u32 *src_32 = static_cast<const u32 *>(data);
for (size_t i = 0; i < BlockSize / sizeof(u32); ++i) {
w[i] = util::LoadBigEndian<u32>(src_32 + i);
}
} else {
std::memcpy(w, data, BlockSize);
}
/* Initialize the rest of w. */
for (i = BlockSize / sizeof(u32); i < util::size(w); ++i) {
const u32 *prev = w + (i - BlockSize / sizeof(u32));
w[i] = prev[0] + SmallSigma0(prev[1]) + prev[9] + SmallSigma1(prev[14]);
}
/* Perform rounds. */
for (i = 0; i < 64; ++i) {
tmp[0] = h + LargeSigma1(e) + Choose(e, f, g) + RoundConstants[i] + w[i];
tmp[1] = LargeSigma0(a) + Majority(a, b, c);
h = g;
g = f;
f = e;
e = d + tmp[0];
d = c;
c = b;
b = a;
a = tmp[0] + tmp[1];
}
/* Update intermediate hash. */
m_intermediate_hash[0] += a;
m_intermediate_hash[1] += b;
m_intermediate_hash[2] += c;
m_intermediate_hash[3] += d;
m_intermediate_hash[4] += e;
m_intermediate_hash[5] += f;
m_intermediate_hash[6] += g;
m_intermediate_hash[7] += h;
}
void Sha256Impl::ProcessLastBlock() {
/* Setup the final block. */
constexpr const auto BlockSizeWithoutSizeField = BlockSize - sizeof(u64);
/* Increment our bits consumed. */
m_bits_consumed += BITSIZEOF(u8) * m_buffered_bytes;
/* Add 0x80 terminator. */
m_buffer[m_buffered_bytes++] = 0x80;
/* If we can process the size field directly, do so, otherwise set up to process it. */
if (m_buffered_bytes <= BlockSizeWithoutSizeField) {
/* Clear up to size field. */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSizeWithoutSizeField - m_buffered_bytes);
} else {
/* Consume full block */
std::memset(m_buffer + m_buffered_bytes, 0, BlockSize - m_buffered_bytes);
this->ProcessBlock(m_buffer);
/* Clear up to size field. */
std::memset(m_buffer, 0, BlockSizeWithoutSizeField);
}
/* Store the size field. */
util::StoreBigEndian<u64>(reinterpret_cast<u64 *>(m_buffer + BlockSizeWithoutSizeField), m_bits_consumed);
/* Process the final block. */
this->ProcessBlock(m_buffer);
}
}

View File

@@ -0,0 +1,240 @@
/*
* Copyright (c) Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vapours.hpp>
namespace ams::crypto::impl {
namespace {
constexpr auto NumRounds = 24;
constexpr const u64 IotaRoundConstant[NumRounds] = {
UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
UINT64_C(0x800000000000808A), UINT64_C(0x8000000080008000),
UINT64_C(0x000000000000808B), UINT64_C(0x0000000080000001),
UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
UINT64_C(0x000000000000008A), UINT64_C(0x0000000000000088),
UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000A),
UINT64_C(0x000000008000808B), UINT64_C(0x800000000000008B),
UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
UINT64_C(0x000000000000800A), UINT64_C(0x800000008000000A),
UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
constexpr const int RhoShiftBit[NumRounds] = {
1, 3, 6, 10, 15, 21, 28, 36,
45, 55, 2, 14, 27, 41, 56, 8,
25, 43, 62, 18, 39, 61, 20, 44
};
constexpr const int RhoNextIndex[NumRounds] = {
10, 7, 11, 17, 18, 3, 5, 16,
8, 21, 24, 4, 15, 23, 19, 13,
12, 2, 20, 14, 22, 9, 6, 1
};
}
template<size_t HashSize>
void Sha3Impl<HashSize>::Initialize() {
/* Clear internal state. */
std::memset(m_internal_state, 0, sizeof(m_internal_state));
/* Reset buffered bytes. */
m_buffered_bytes = 0;
/* Set state. */
m_state = State_Initialized;
}
template<size_t HashSize>
void Sha3Impl<HashSize>::Update(const void *data, size_t size) {
/* Verify we're in a state to update. */
AMS_ASSERT(m_state == State_Initialized);
/* Process we have anything buffered. */
const u8 *data8 = static_cast<const u8 *>(data);
size_t remaining = size;
if (m_buffered_bytes > 0) {
/* Determine how much we can copy. */
const size_t copy_size = std::min(BlockSize - m_buffered_bytes, remaining);
/* Mix the bytes into our state. */
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state) + m_buffered_bytes;
for (size_t i = 0; i < copy_size; ++i) {
dst8[i] ^= data8[i];
}
/* Advance. */
data8 += copy_size;
remaining -= copy_size;
m_buffered_bytes += copy_size;
/* Process a block, if we filled one. */
if (m_buffered_bytes == BlockSize) {
this->ProcessBlock();
m_buffered_bytes = 0;
}
}
/* Process blocks, if we have any. */
while (remaining >= BlockSize) {
/* Mix the bytes into our state. */
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
for (size_t i = 0; i < BlockSize; ++i) {
dst8[i] ^= data8[i];
}
this->ProcessBlock();
data8 += BlockSize;
remaining -= BlockSize;
}
/* Copy any leftover data to our buffer. */
if (remaining > 0) {
u8 *dst8 = reinterpret_cast<u8 *>(m_internal_state);
for (size_t i = 0; i < remaining; ++i) {
dst8[i] ^= data8[i];
}
m_buffered_bytes = remaining;
}
}
template<size_t HashSize>
void Sha3Impl<HashSize>::GetHash(void *dst, size_t size) {
/* Verify we're in a state to get hash. */
AMS_ASSERT(m_state == State_Initialized || m_state == State_Done);
AMS_ASSERT(size >= HashSize);
AMS_UNUSED(size);
/* If we need to, process the last block. */
if (m_state == State_Initialized) {
this->ProcessLastBlock();
m_state = State_Done;
}
/* Copy the output hash. */
std::memcpy(dst, m_internal_state, HashSize);
}
template<size_t HashSize>
void Sha3Impl<HashSize>::InitializeWithContext(const Sha3Context *context) {
/* Check the context is for the right hash size. */
AMS_ASSERT(context->hash_size == HashSize);
/* Set buffered bytes. */
m_buffered_bytes = context->buffered_bytes;
/* Copy state in from the context. */
std::memcpy(m_internal_state, context->internal_state, sizeof(m_internal_state));
/* Reset other fields. */
m_state = State_Initialized;
}
template<size_t HashSize>
void Sha3Impl<HashSize>::GetContext(Sha3Context *context) const {
/* Check our state. */
AMS_ASSERT(m_state == State_Initialized);
/* Set the output hash size. */
context->hash_size = HashSize;
/* Set buffered bytes. */
context->buffered_bytes = m_buffered_bytes;
/* Copy out the context. */
std::memcpy(context->internal_state, m_internal_state, sizeof(context->internal_state));
}
template<size_t HashSize>
void Sha3Impl<HashSize>::ProcessBlock() {
/* Ensure correct endianness. */
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < util::size(m_internal_state); ++i) {
m_internal_state[i] = util::LoadLittleEndian<u64>(m_internal_state + i);
}
}
/* Perform all rounds. */
uint64_t tmp, C[5];
for (auto round = 0; round < NumRounds; ++round) {
/* Handle theta. */
for (size_t i = 0; i < 5; ++i) {
C[i] = m_internal_state[i] ^ m_internal_state[i + 5] ^ m_internal_state[i + 10] ^ m_internal_state[i + 15] ^ m_internal_state[i + 20];
}
for (size_t i = 0; i < 5; ++i) {
tmp = C[(i + 4) % 5] ^ util::RotateLeft<u64>(C[(i + 1) % 5], 1);
for (size_t j = 0; j < 5; ++j) {
m_internal_state[5 * j + i] ^= tmp;
}
}
/* Handle rho/pi. */
tmp = m_internal_state[1];
for (size_t i = 0; i < NumRounds; ++i) {
const auto rho_next_idx = RhoNextIndex[i];
C[0] = m_internal_state[rho_next_idx];
m_internal_state[rho_next_idx] = util::RotateLeft<u64>(tmp, RhoShiftBit[i]);
tmp = C[0];
}
/* Handle chi. */
for (size_t i = 0; i < 5; ++i) {
for (size_t j = 0; j < 5; ++j) {
C[j] = m_internal_state[5 * i + j];
}
for (size_t j = 0; j < 5; ++j) {
m_internal_state[5 * i + j] ^= (~C[(j + 1) % 5]) & C[(j + 2) % 5];
}
}
/* Handle iota. */
m_internal_state[0] ^= IotaRoundConstant[round];
}
/* Ensure correct endianness. */
if constexpr (util::IsBigEndian()) {
for (size_t i = 0; i < util::size(m_internal_state); ++i) {
util::StoreLittleEndian<u64>(m_internal_state + i, m_internal_state[i]);
}
}
}
template<size_t HashSize>
void Sha3Impl<HashSize>::ProcessLastBlock() {
/* Mix final bits (011) into our state. */
reinterpret_cast<u8 *>(m_internal_state)[m_buffered_bytes] ^= 0b110;
/* Mix in the high bit of the last word in our block. */
constexpr u64 FinalMask = UINT64_C(0x8000000000000000);
m_internal_state[(BlockSize / sizeof(u64)) - 1] ^= FinalMask;
/* Process the last block. */
this->ProcessBlock();
}
/* Explicitly instantiate the supported hash sizes. */
template class Sha3Impl<224 / BITSIZEOF(u8)>;
template class Sha3Impl<256 / BITSIZEOF(u8)>;
template class Sha3Impl<384 / BITSIZEOF(u8)>;
template class Sha3Impl<512 / BITSIZEOF(u8)>;
}