1079 lines
32 KiB
C
1079 lines
32 KiB
C
/*
|
|
* Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
|
|
*
|
|
* Copyright (c) 20xx KazushiMe
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS IN THE SOFTWARE.
|
|
*
|
|
* pthread fork by sun409 (https://github.com/sun409/tinymembench-pthread)
|
|
*
|
|
* Switch port by Kazushi and built with libnx.
|
|
*/
|
|
|
|
// Include the most common headers from the C standard library
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include <math.h>
|
|
#include <sys/time.h>
|
|
|
|
// Multi-thread support
|
|
#include <pthread.h>
|
|
#include <sched.h>
|
|
#include <semaphore.h>
|
|
|
|
#define __ASM_OPT_H__
|
|
#define SIZE (32 * 1024 * 1024)
|
|
#define BLOCKSIZE 2048
|
|
#ifndef MAXREPEATS
|
|
# define MAXREPEATS 10
|
|
#endif
|
|
#ifndef LATBENCH_COUNT
|
|
# define LATBENCH_COUNT 10000000
|
|
#endif
|
|
|
|
#define ALIGN_PADDING 0x100000
|
|
#define CACHE_LINE_SIZE 128
|
|
|
|
#include "aarch64-asm.h"
|
|
#include <switch.h>
|
|
|
|
PadState pad;
|
|
|
|
struct f_data
|
|
{
|
|
void (*func)(int64_t *, int64_t *, int);
|
|
int64_t *arg1;
|
|
int64_t *arg2;
|
|
int arg3;
|
|
};
|
|
|
|
pthread_cond_t p_ready, p_start;
|
|
pthread_mutex_t p_lock;
|
|
pthread_t *p_worker = NULL;
|
|
struct f_data *worker_data = NULL;
|
|
int p_worker_not_ready, p_workers_ready;
|
|
|
|
void *thread_func(void *data)
|
|
{
|
|
struct f_data *data_ptr = data;
|
|
|
|
pthread_mutex_lock(&p_lock);
|
|
p_worker_not_ready--;
|
|
|
|
if (!p_worker_not_ready)
|
|
pthread_cond_signal(&p_ready);
|
|
|
|
while (p_workers_ready != 1)
|
|
pthread_cond_wait(&p_start, &p_lock);
|
|
|
|
pthread_mutex_unlock(&p_lock);
|
|
|
|
(data_ptr->func)(data_ptr->arg1, data_ptr->arg2, data_ptr->arg3);
|
|
|
|
pthread_exit(NULL);
|
|
}
|
|
|
|
static void parallel_run(void)
|
|
{
|
|
pthread_mutex_lock(&p_lock);
|
|
p_workers_ready = 1;
|
|
pthread_mutex_unlock(&p_lock);
|
|
pthread_cond_broadcast(&p_start);
|
|
}
|
|
|
|
static void parallel_init(int threads)
|
|
{
|
|
pthread_attr_t attr;
|
|
|
|
pthread_cond_init(&p_ready, NULL);
|
|
pthread_cond_init(&p_start, NULL);
|
|
pthread_mutex_init(&p_lock, NULL);
|
|
p_worker_not_ready = threads;
|
|
p_workers_ready = 0;
|
|
pthread_attr_init(&attr);
|
|
|
|
if (!p_worker || !worker_data)
|
|
{
|
|
p_worker = (pthread_t *)malloc(threads * sizeof(pthread_t));
|
|
worker_data = (struct f_data *)malloc(threads * sizeof(struct f_data));
|
|
}
|
|
|
|
for (int i = 0; i < threads; i++)
|
|
{
|
|
pthread_create(p_worker + i, &attr, thread_func, worker_data + i);
|
|
}
|
|
|
|
pthread_mutex_lock(&p_lock);
|
|
while (p_worker_not_ready != 0)
|
|
pthread_cond_wait(&p_ready, &p_lock);
|
|
|
|
pthread_mutex_unlock(&p_lock);
|
|
}
|
|
|
|
typedef struct
|
|
{
|
|
const char *description;
|
|
int use_tmpbuf;
|
|
void (*f)(int64_t *, int64_t *, int);
|
|
} bench_info;
|
|
|
|
static char *align_up(char *ptr, int align)
|
|
{
|
|
return (char *)(((uintptr_t)ptr + align - 1) & ~(uintptr_t)(align - 1));
|
|
}
|
|
|
|
void aligned_block_copy(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
}
|
|
}
|
|
|
|
void aligned_block_copy_backwards(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
src += size / 8 - 1;
|
|
dst += size / 8 - 1;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
t1 = *src--;
|
|
t2 = *src--;
|
|
t3 = *src--;
|
|
t4 = *src--;
|
|
*dst-- = t1;
|
|
*dst-- = t2;
|
|
*dst-- = t3;
|
|
*dst-- = t4;
|
|
t1 = *src--;
|
|
t2 = *src--;
|
|
t3 = *src--;
|
|
t4 = *src--;
|
|
*dst-- = t1;
|
|
*dst-- = t2;
|
|
*dst-- = t3;
|
|
*dst-- = t4;
|
|
}
|
|
}
|
|
|
|
void aligned_block_copy_backwards_bs32(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
src += size / 8 - 8;
|
|
dst += size / 8 - 8;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
t1 = src[4];
|
|
t2 = src[5];
|
|
t3 = src[6];
|
|
t4 = src[7];
|
|
dst[4] = t1;
|
|
dst[5] = t2;
|
|
dst[6] = t3;
|
|
dst[7] = t4;
|
|
t1 = src[0];
|
|
t2 = src[1];
|
|
t3 = src[2];
|
|
t4 = src[3];
|
|
dst[0] = t1;
|
|
dst[1] = t2;
|
|
dst[2] = t3;
|
|
dst[3] = t4;
|
|
src -= 8;
|
|
dst -= 8;
|
|
}
|
|
}
|
|
|
|
void aligned_block_copy_backwards_bs64(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
src += size / 8 - 8;
|
|
dst += size / 8 - 8;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
t1 = src[0];
|
|
t2 = src[1];
|
|
t3 = src[2];
|
|
t4 = src[3];
|
|
dst[0] = t1;
|
|
dst[1] = t2;
|
|
dst[2] = t3;
|
|
dst[3] = t4;
|
|
t1 = src[4];
|
|
t2 = src[5];
|
|
t3 = src[6];
|
|
t4 = src[7];
|
|
dst[4] = t1;
|
|
dst[5] = t2;
|
|
dst[6] = t3;
|
|
dst[7] = t4;
|
|
src -= 8;
|
|
dst -= 8;
|
|
}
|
|
}
|
|
|
|
void aligned_block_copy_pf32(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
__builtin_prefetch(src + 32, 0, 0);
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
__builtin_prefetch(src + 32, 0, 0);
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
}
|
|
}
|
|
|
|
void aligned_block_copy_pf64(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t t1, t2, t3, t4;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
__builtin_prefetch(src + 32, 0, 0);
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
t1 = *src++;
|
|
t2 = *src++;
|
|
t3 = *src++;
|
|
t4 = *src++;
|
|
*dst++ = t1;
|
|
*dst++ = t2;
|
|
*dst++ = t3;
|
|
*dst++ = t4;
|
|
}
|
|
}
|
|
|
|
void aligned_block_fetch(int64_t * __restrict dst,
|
|
int64_t * __restrict src_,
|
|
int size)
|
|
{
|
|
volatile int64_t *src = src_;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
*src++;
|
|
}
|
|
}
|
|
|
|
void aligned_block_fill(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t data = *src;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
*dst++ = data;
|
|
}
|
|
}
|
|
|
|
void aligned_block_fill_shuffle16(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t data = *src;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
dst[0 + 0] = data;
|
|
dst[1 + 0] = data;
|
|
dst[1 + 2] = data;
|
|
dst[0 + 2] = data;
|
|
dst[1 + 4] = data;
|
|
dst[0 + 4] = data;
|
|
dst[0 + 6] = data;
|
|
dst[1 + 6] = data;
|
|
dst += 8;
|
|
}
|
|
}
|
|
|
|
void aligned_block_fill_shuffle32(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t data = *src;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
dst[3 + 0] = data;
|
|
dst[0 + 0] = data;
|
|
dst[2 + 0] = data;
|
|
dst[1 + 0] = data;
|
|
dst[3 + 4] = data;
|
|
dst[0 + 4] = data;
|
|
dst[2 + 4] = data;
|
|
dst[1 + 4] = data;
|
|
dst += 8;
|
|
}
|
|
}
|
|
|
|
void aligned_block_fill_shuffle64(int64_t * __restrict dst_,
|
|
int64_t * __restrict src,
|
|
int size)
|
|
{
|
|
volatile int64_t *dst = dst_;
|
|
int64_t data = *src;
|
|
while ((size -= 64) >= 0)
|
|
{
|
|
dst[5] = data;
|
|
dst[2] = data;
|
|
dst[7] = data;
|
|
dst[6] = data;
|
|
dst[1] = data;
|
|
dst[3] = data;
|
|
dst[0] = data;
|
|
dst[4] = data;
|
|
dst += 8;
|
|
}
|
|
}
|
|
|
|
double gettime(void)
|
|
{
|
|
struct timeval tv;
|
|
gettimeofday(&tv, NULL);
|
|
return (double)((int64_t)tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.;
|
|
}
|
|
|
|
static double bandwidth_bench_helper(int threads,
|
|
int64_t *dstbuf, int64_t *srcbuf,
|
|
int64_t *tmpbuf,
|
|
int size, int blocksize,
|
|
const char *indent_prefix,
|
|
int use_tmpbuf,
|
|
void (*f)(int64_t *, int64_t *, int),
|
|
const char *description)
|
|
{
|
|
int i, j, loopcount, innerloopcount, n;
|
|
double t, t1, t2;
|
|
double speed, maxspeed;
|
|
double s, s0, s1, s2;
|
|
|
|
/* do up to MAXREPEATS measurements */
|
|
s = s0 = s1 = s2 = 0.;
|
|
maxspeed = 0.;
|
|
for (n = 0; n < MAXREPEATS; n++)
|
|
{
|
|
parallel_init(threads);
|
|
for (int pt = 0; pt < threads; pt++)
|
|
{
|
|
(worker_data + pt)->func = f;
|
|
(worker_data + pt)->arg1 = dstbuf + size * pt / sizeof(int64_t);
|
|
(worker_data + pt)->arg2 = srcbuf + size * pt / sizeof(int64_t);
|
|
(worker_data + pt)->arg3 = size;
|
|
}
|
|
parallel_run();
|
|
for (int pt = 0; pt < threads; pt++)
|
|
pthread_join(p_worker[pt], NULL);
|
|
|
|
loopcount = 0;
|
|
innerloopcount = 1;
|
|
t = 0.;
|
|
do
|
|
{
|
|
loopcount += innerloopcount;
|
|
if (use_tmpbuf)
|
|
{
|
|
for (i = 0; i < innerloopcount; i++)
|
|
{
|
|
t1 = gettime();
|
|
for (j = 0; j < size; j += blocksize)
|
|
{
|
|
f(tmpbuf, srcbuf + j / sizeof(int64_t), blocksize);
|
|
f(dstbuf + j / sizeof(int64_t), tmpbuf, blocksize);
|
|
}
|
|
t2 = gettime();
|
|
t += t2 - t1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (i = 0; i < innerloopcount; i++)
|
|
{
|
|
parallel_init(threads);
|
|
for (int pt = 0; pt < threads; ++pt)
|
|
{
|
|
(worker_data + pt)->func = f;
|
|
(worker_data + pt)->arg1 = dstbuf + size * pt / sizeof(int64_t);
|
|
(worker_data + pt)->arg2 = srcbuf + size * pt / sizeof(int64_t);
|
|
(worker_data + pt)->arg3 = size;
|
|
}
|
|
|
|
t1 = gettime();
|
|
parallel_run();
|
|
for (int pt = 0; pt < threads; ++pt)
|
|
pthread_join(p_worker[pt], NULL);
|
|
t2 = gettime();
|
|
t += t2 - t1;
|
|
}
|
|
}
|
|
innerloopcount *= 2;
|
|
} while (t < 0.5);
|
|
speed = (double)size * (use_tmpbuf ? 1 : threads) * loopcount / t / 1000000.;
|
|
|
|
s0 += 1.;
|
|
s1 += speed;
|
|
s2 += speed * speed;
|
|
|
|
if (speed > maxspeed)
|
|
maxspeed = speed;
|
|
|
|
if (s0 > 2.)
|
|
{
|
|
s = sqrt((s0 * s2 - s1 * s1) / (s0 * (s0 - 1)));
|
|
if (s < maxspeed / 1000.)
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (maxspeed > 0 && s / maxspeed * 100. >= 0.1)
|
|
{
|
|
printf("%s%-40s : %8.1f MB/s (%.1f%%)\n", indent_prefix, description,
|
|
maxspeed, s / maxspeed * 100.);
|
|
}
|
|
else
|
|
{
|
|
printf("%s%-40s : %8.1f MB/s\n", indent_prefix, description, maxspeed);
|
|
}
|
|
|
|
consoleUpdate(NULL);
|
|
return maxspeed;
|
|
}
|
|
|
|
void bandwidth_bench(int threads,
|
|
int64_t *dstbuf, int64_t *srcbuf, int64_t *tmpbuf,
|
|
int size, int blocksize, const char *indent_prefix,
|
|
bench_info *bi)
|
|
{
|
|
while (bi->f)
|
|
{
|
|
bandwidth_bench_helper(threads,
|
|
dstbuf, srcbuf, tmpbuf, size, blocksize,
|
|
indent_prefix, bi->use_tmpbuf,
|
|
bi->f,
|
|
bi->description);
|
|
bi++;
|
|
}
|
|
}
|
|
|
|
void memcpy_wrapper(int64_t *dst, int64_t *src, int size)
|
|
{
|
|
memcpy(dst, src, size);
|
|
}
|
|
|
|
void memset_wrapper(int64_t *dst, int64_t *src, int size)
|
|
{
|
|
memset(dst, src[0], size);
|
|
}
|
|
|
|
static bench_info aarch64_neon[] =
|
|
{
|
|
{ "NEON LDP (READ)", 0, aligned_block_read_ldp_q_aarch64 },
|
|
{ "NEON LDP/STP copy (COPY)", 0, aligned_block_copy_ldpstp_q_aarch64 },
|
|
{ "NEON LDP/STP copy pldl2strm (32B step)", 0, aligned_block_copy_ldpstp_q_pf32_l2strm_aarch64 },
|
|
{ "NEON LDP/STP copy pldl2strm (64B step)", 0, aligned_block_copy_ldpstp_q_pf64_l2strm_aarch64 },
|
|
{ "NEON LDP/STP copy pldl1keep (32B step)", 0, aligned_block_copy_ldpstp_q_pf32_l1keep_aarch64 },
|
|
{ "NEON LDP/STP copy pldl1keep (64B step)", 0, aligned_block_copy_ldpstp_q_pf64_l1keep_aarch64 },
|
|
{ "NEON LD1/ST1 copy", 0, aligned_block_copy_ld1st1_aarch64 },
|
|
{ "NEON STP fill (WRITE)", 0, aligned_block_fill_stp_q_aarch64 },
|
|
{ "NEON STNP fill", 0, aligned_block_fill_stnp_q_aarch64 },
|
|
{ "ARM LDP", 0, aligned_block_read_ldp_x_aarch64 },
|
|
{ "ARM LDP/STP copy", 0, aligned_block_copy_ldpstp_x_aarch64 },
|
|
{ "ARM STP fill", 0, aligned_block_fill_stp_x_aarch64 },
|
|
{ "ARM STNP fill", 0, aligned_block_fill_stnp_x_aarch64 },
|
|
{ NULL, 0, NULL }
|
|
};
|
|
|
|
bench_info *get_asm_benchmarks(void)
|
|
{
|
|
return aarch64_neon;
|
|
}
|
|
|
|
static bench_info c_benchmarks[] =
|
|
{
|
|
{ "C copy backwards", 0, aligned_block_copy_backwards },
|
|
{ "C copy backwards (32B blocks)", 0, aligned_block_copy_backwards_bs32 },
|
|
{ "C copy backwards (64B blocks)", 0, aligned_block_copy_backwards_bs64 },
|
|
{ "C copy", 0, aligned_block_copy },
|
|
{ "C copy prefetched (32B step)", 0, aligned_block_copy_pf32 },
|
|
{ "C copy prefetched (64B step)", 0, aligned_block_copy_pf64 },
|
|
// { "C 2-pass copy", 1, aligned_block_copy },
|
|
// { "C 2-pass copy prefetched (32B step)", 1, aligned_block_copy_pf32 },
|
|
// { "C 2-pass copy prefetched (64B step)", 1, aligned_block_copy_pf64 },
|
|
{ "C fetch", 0, aligned_block_fetch },
|
|
{ "C fill", 0, aligned_block_fill },
|
|
{ "C fill (shuffle within 16B blocks)", 0, aligned_block_fill_shuffle16 },
|
|
{ "C fill (shuffle within 32B blocks)", 0, aligned_block_fill_shuffle32 },
|
|
{ "C fill (shuffle within 64B blocks)", 0, aligned_block_fill_shuffle64 },
|
|
{ NULL, 0, NULL }
|
|
};
|
|
|
|
static bench_info libc_benchmarks[] =
|
|
{
|
|
{ "standard memcpy", 0, memcpy_wrapper },
|
|
{ "standard memset", 0, memset_wrapper },
|
|
{ NULL, 0, NULL }
|
|
};
|
|
|
|
void *alloc_four_nonaliased_buffers(void **buf1_, int size1,
|
|
void **buf2_, int size2,
|
|
void **buf3_, int size3,
|
|
void **buf4_, int size4)
|
|
{
|
|
char **buf1 = (char **)buf1_, **buf2 = (char **)buf2_;
|
|
char **buf3 = (char **)buf3_, **buf4 = (char **)buf4_;
|
|
int antialias_pattern_mask = (ALIGN_PADDING - 1) & ~(CACHE_LINE_SIZE - 1);
|
|
char *buf, *ptr;
|
|
|
|
if (!buf1 || size1 < 0)
|
|
size1 = 0;
|
|
if (!buf2 || size2 < 0)
|
|
size2 = 0;
|
|
if (!buf3 || size3 < 0)
|
|
size3 = 0;
|
|
if (!buf4 || size4 < 0)
|
|
size4 = 0;
|
|
|
|
ptr = buf =
|
|
(char *)malloc(size1 + size2 + size3 + size4 + 9 * ALIGN_PADDING);
|
|
memset(buf, 0xCC, size1 + size2 + size3 + size4 + 9 * ALIGN_PADDING);
|
|
|
|
ptr = align_up(ptr, ALIGN_PADDING);
|
|
if (buf1)
|
|
{
|
|
*buf1 = ptr + (0xAAAAAAAA & antialias_pattern_mask);
|
|
ptr = align_up(*buf1 + size1, ALIGN_PADDING);
|
|
}
|
|
if (buf2)
|
|
{
|
|
*buf2 = ptr + (0x55555555 & antialias_pattern_mask);
|
|
ptr = align_up(*buf2 + size2, ALIGN_PADDING);
|
|
}
|
|
if (buf3)
|
|
{
|
|
*buf3 = ptr + (0xCCCCCCCC & antialias_pattern_mask);
|
|
ptr = align_up(*buf3 + size3, ALIGN_PADDING);
|
|
}
|
|
if (buf4)
|
|
{
|
|
*buf4 = ptr + (0x33333333 & antialias_pattern_mask);
|
|
}
|
|
|
|
return buf;
|
|
}
|
|
|
|
#pragma GCC diagnostic push
|
|
static void __attribute__((noinline)) random_read_test(char *zerobuffer,
|
|
int count, int nbits)
|
|
{
|
|
uint32_t seed = 0;
|
|
uintptr_t addrmask = (1 << nbits) - 1;
|
|
uint32_t v;
|
|
|
|
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
|
|
static volatile uint32_t dummy;
|
|
|
|
#define RANDOM_MEM_ACCESS() \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v = (seed >> 16) & 0xFF; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v |= (seed >> 8) & 0xFF00; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v |= seed & 0x7FFF0000; \
|
|
seed |= zerobuffer[v & addrmask];
|
|
|
|
while (count >= 16) {
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
count -= 16;
|
|
}
|
|
dummy = seed;
|
|
#undef RANDOM_MEM_ACCESS
|
|
}
|
|
|
|
static void __attribute__((noinline)) random_dual_read_test(char *zerobuffer,
|
|
int count, int nbits)
|
|
{
|
|
uint32_t seed = 0;
|
|
uintptr_t addrmask = (1 << nbits) - 1;
|
|
uint32_t v1, v2;
|
|
|
|
#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
|
|
static volatile uint32_t dummy;
|
|
|
|
#define RANDOM_MEM_ACCESS() \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v1 = (seed >> 8) & 0xFF00; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v2 = (seed >> 8) & 0xFF00; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v1 |= seed & 0x7FFF0000; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v2 |= seed & 0x7FFF0000; \
|
|
seed = seed * 1103515245 + 12345; \
|
|
v1 |= (seed >> 16) & 0xFF; \
|
|
v2 |= (seed >> 24); \
|
|
v2 &= addrmask; \
|
|
v1 ^= v2; \
|
|
seed |= zerobuffer[v2]; \
|
|
seed += zerobuffer[v1 & addrmask];
|
|
|
|
while (count >= 16) {
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
RANDOM_MEM_ACCESS();
|
|
count -= 16;
|
|
}
|
|
dummy = seed;
|
|
#undef RANDOM_MEM_ACCESS
|
|
}
|
|
#pragma GCC diagnostic pop
|
|
|
|
static uint32_t rand32()
|
|
{
|
|
static int seed = 0;
|
|
uint32_t hi, lo;
|
|
hi = (seed = seed * 1103515245 + 12345) >> 16;
|
|
lo = (seed = seed * 1103515245 + 12345) >> 16;
|
|
return (hi << 16) + lo;
|
|
}
|
|
|
|
int latency_bench(int size, int count, int use_hugepage, int quick)
|
|
{
|
|
double t, t2, t_before, t_after, t_noaccess, t_noaccess2 = 0;
|
|
double xs, xs1, xs2;
|
|
double ys, ys1, ys2;
|
|
double min_t, min_t2;
|
|
int nbits, n;
|
|
char *buffer, *buffer_alloc;
|
|
#if !defined(__linux__) || !defined(MADV_HUGEPAGE)
|
|
if (use_hugepage)
|
|
return 0;
|
|
buffer_alloc = (char *)malloc(size + 4095);
|
|
if (!buffer_alloc)
|
|
return 0;
|
|
buffer = (char *)(((uintptr_t)buffer_alloc + 4095) & ~(uintptr_t)4095);
|
|
#else
|
|
if (posix_memalign((void **)&buffer_alloc, 4 * 1024 * 1024, size) != 0)
|
|
return 0;
|
|
buffer = buffer_alloc;
|
|
if (use_hugepage && madvise(buffer, size, use_hugepage > 0 ?
|
|
MADV_HUGEPAGE : MADV_NOHUGEPAGE) != 0)
|
|
{
|
|
free(buffer_alloc);
|
|
return 0;
|
|
}
|
|
#endif
|
|
memset(buffer, 0, size);
|
|
|
|
for (n = 1; n <= MAXREPEATS; n++)
|
|
{
|
|
t_before = gettime();
|
|
random_read_test(buffer, count, 1);
|
|
t_after = gettime();
|
|
if (n == 1 || t_after - t_before < t_noaccess)
|
|
t_noaccess = t_after - t_before;
|
|
|
|
t_before = gettime();
|
|
random_dual_read_test(buffer, count, 1);
|
|
t_after = gettime();
|
|
if (n == 1 || t_after - t_before < t_noaccess2)
|
|
t_noaccess2 = t_after - t_before;
|
|
}
|
|
|
|
printf("\nblock size : single random read / dual random read");
|
|
if (use_hugepage > 0)
|
|
printf(", [MADV_HUGEPAGE]\n");
|
|
else if (use_hugepage < 0)
|
|
printf(", [MADV_NOHUGEPAGE]\n");
|
|
else
|
|
printf("\n");
|
|
|
|
consoleUpdate(NULL);
|
|
|
|
int start = quick ? 20 : 10;
|
|
for (nbits = start; (1 << nbits) <= size; nbits++)
|
|
{
|
|
int testsize = 1 << nbits;
|
|
xs1 = xs2 = ys = ys1 = ys2 = 0;
|
|
for (n = 1; n <= MAXREPEATS; n++)
|
|
{
|
|
int testoffs = (rand32() % (size / testsize)) * testsize;
|
|
|
|
t_before = gettime();
|
|
random_read_test(buffer + testoffs, count, nbits);
|
|
t_after = gettime();
|
|
t = t_after - t_before - t_noaccess;
|
|
if (t < 0) t = 0;
|
|
|
|
xs1 += t;
|
|
xs2 += t * t;
|
|
|
|
if (n == 1 || t < min_t)
|
|
min_t = t;
|
|
|
|
t_before = gettime();
|
|
random_dual_read_test(buffer + testoffs, count, nbits);
|
|
t_after = gettime();
|
|
t2 = t_after - t_before - t_noaccess2;
|
|
if (t2 < 0) t2 = 0;
|
|
|
|
ys1 += t2;
|
|
ys2 += t2 * t2;
|
|
|
|
if (n == 1 || t2 < min_t2)
|
|
min_t2 = t2;
|
|
|
|
if (n > 2)
|
|
{
|
|
xs = sqrt((xs2 * n - xs1 * xs1) / (n * (n - 1)));
|
|
ys = sqrt((ys2 * n - ys1 * ys1) / (n * (n - 1)));
|
|
if (xs < min_t / 1000. && ys < min_t2 / 1000.)
|
|
break;
|
|
}
|
|
}
|
|
printf("%10d : %6.1f ns / %6.1f ns \n", (1 << nbits),
|
|
min_t * 1000000000. / count, min_t2 * 1000000000. / count);
|
|
|
|
consoleUpdate(NULL);
|
|
}
|
|
free(buffer_alloc);
|
|
return 1;
|
|
}
|
|
|
|
void waitForKeyA() {
|
|
while (appletMainLoop())
|
|
{
|
|
padUpdate(&pad);
|
|
|
|
u64 kDown = padGetButtonsDown(&pad);
|
|
|
|
if (kDown & HidNpadButton_A)
|
|
break;
|
|
else if(kDown)
|
|
{
|
|
consoleExit(NULL);
|
|
exit(0);
|
|
}
|
|
|
|
consoleUpdate(NULL);
|
|
}
|
|
}
|
|
|
|
void printClock()
|
|
{
|
|
int res = 0;
|
|
uint32_t cpu_hz = 0, mem_hz = 0;
|
|
|
|
ClkrstSession clkrstSession;
|
|
res = clkrstInitialize();
|
|
if(R_FAILED(res)) {
|
|
fatalThrow(res);
|
|
}
|
|
|
|
clkrstOpenSession(&clkrstSession, PcvModuleId_CpuBus, 3);
|
|
clkrstGetClockRate(&clkrstSession, &cpu_hz);
|
|
clkrstCloseSession(&clkrstSession);
|
|
clkrstOpenSession(&clkrstSession, PcvModuleId_EMC, 3);
|
|
clkrstGetClockRate(&clkrstSession, &mem_hz);
|
|
clkrstCloseSession(&clkrstSession);
|
|
clkrstExit();
|
|
|
|
printf("== CPU: %u.%u MHz ==\n== MEM: %u.%u MHz ==\n",
|
|
cpu_hz/1000000, cpu_hz/100000 - cpu_hz/1000000*10,
|
|
mem_hz/1000000, mem_hz/100000 - mem_hz/1000000*10);
|
|
consoleUpdate(NULL);
|
|
}
|
|
|
|
// Main program entrypoint
|
|
int main(int argc, char* argv[])
|
|
{
|
|
consoleInit(NULL);
|
|
|
|
printf("TinyMemBenchNX v0.4.11\n\
|
|
(based on tinymembench-pthread, a multi-thread fork of simple benchmark for memory throughput and latency)\n\n");
|
|
printf("Copyright (c) 2011-2016 Siarhei Siamashka\n");
|
|
printf("Copyright (c) 2023 KazushiMe\n");
|
|
printf("Copyright (c) 2023 hanai3Bi\n");
|
|
printf("Copyright (c) 2025 Souldbminer\n");
|
|
|
|
printf("\n");
|
|
consoleUpdate(NULL);
|
|
|
|
padConfigureInput(1, HidNpadStyleSet_NpadStandard);
|
|
|
|
padInitializeDefault(&pad);
|
|
|
|
int64_t *srcbuf, *dstbuf, *tmpbuf;
|
|
void *poolbuf;
|
|
size_t bufsize = SIZE;
|
|
int threads = 0;
|
|
|
|
loop:
|
|
printf("!!! Memory bandwidth heavily depends on CPU clock. !!!\n\n");
|
|
printf("\
|
|
Press A to start quick test.\n\
|
|
Press X to start bandwidth test.\n\
|
|
Press Y to start latency test.\n\
|
|
Press any other key to exit.\n\n");
|
|
consoleUpdate(NULL);
|
|
|
|
while (appletMainLoop())
|
|
{
|
|
padUpdate(&pad);
|
|
|
|
u64 kDown = padGetButtonsDown(&pad);
|
|
|
|
if (kDown & HidNpadButton_A)
|
|
{
|
|
threads = 3;
|
|
goto quick;
|
|
break;
|
|
}
|
|
else if (kDown & HidNpadButton_X)
|
|
{
|
|
threads = 3;
|
|
goto bandwidth;
|
|
break;
|
|
}
|
|
else if (kDown & HidNpadButton_Y)
|
|
{
|
|
threads = 3;
|
|
goto latency;
|
|
break;
|
|
}
|
|
else if (kDown)
|
|
{
|
|
consoleExit(NULL);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
quick:
|
|
|
|
poolbuf = alloc_four_nonaliased_buffers((void **)&srcbuf, bufsize * threads,
|
|
(void **)&dstbuf, bufsize * threads,
|
|
(void **)&tmpbuf, BLOCKSIZE * threads,
|
|
NULL, 0);
|
|
|
|
printClock();
|
|
printf("== Thread: %d ==\n", threads);
|
|
consoleUpdate(NULL);
|
|
|
|
bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE/2, " ", libc_benchmarks);
|
|
|
|
free(poolbuf);
|
|
|
|
int latbench_size = SIZE * 2, latbench_count = LATBENCH_COUNT;
|
|
|
|
if (!latency_bench(latbench_size, latbench_count, -1, 1) ||
|
|
!latency_bench(latbench_size, latbench_count, 1, 1))
|
|
{
|
|
latency_bench(latbench_size, latbench_count, 0, 1);
|
|
}
|
|
|
|
printf("\nPress A to continue, any other key to exit.\n\n");
|
|
waitForKeyA();
|
|
consoleClear();
|
|
goto loop;
|
|
|
|
bandwidth:
|
|
|
|
printf("==========================================================================\n");
|
|
printf("== Memory bandwidth tests ==\n");
|
|
printf("== ==\n");
|
|
printf("== Note 1: 1MB = 1000000 bytes ==\n");
|
|
printf("== Note 2: Results for 'copy' tests show how many bytes can be ==\n");
|
|
printf("== copied per second (adding together read and writen ==\n");
|
|
printf("== bytes would have provided twice higher numbers) ==\n");
|
|
printf("== Note 3: 2-pass copy means that we are using a small temporary buffer ==\n");
|
|
printf("== to first fetch data into it, and only then write it to the ==\n");
|
|
printf("== destination (source -> L1 cache, L1 cache -> destination) ==\n");
|
|
printf("== Note 4: If sample standard deviation exceeds 0.1%%, it is shown in ==\n");
|
|
printf("== brackets ==\n");
|
|
printf("==========================================================================\n\n");
|
|
|
|
poolbuf = alloc_four_nonaliased_buffers((void **)&srcbuf, bufsize * threads,
|
|
(void **)&dstbuf, bufsize * threads,
|
|
(void **)&tmpbuf, BLOCKSIZE * threads,
|
|
NULL, 0);
|
|
|
|
printClock();
|
|
printf("== Thread: %d ==\n", threads);
|
|
consoleUpdate(NULL);
|
|
|
|
bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", c_benchmarks);
|
|
printf(" ---\n");
|
|
|
|
bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", libc_benchmarks);
|
|
bench_info *bi = get_asm_benchmarks();
|
|
if (bi->f) {
|
|
printf(" ---\n");
|
|
bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi);
|
|
}
|
|
|
|
free(poolbuf);
|
|
|
|
printf("\nPress A to continue, any other key to exit.\n\n");
|
|
waitForKeyA();
|
|
consoleClear();
|
|
goto loop;
|
|
|
|
latency:
|
|
|
|
latbench_size = SIZE * 2, latbench_count = LATBENCH_COUNT;
|
|
|
|
printf("\n");
|
|
printf("==========================================================================\n");
|
|
printf("== Memory latency test ==\n");
|
|
printf("== ==\n");
|
|
printf("== Average time is measured for random memory accesses in the buffers ==\n");
|
|
printf("== of different sizes. The larger is the buffer, the more significant ==\n");
|
|
printf("== are relative contributions of TLB, L1/L2 cache misses and SDRAM ==\n");
|
|
printf("== accesses. For extremely large buffer sizes we are expecting to see ==\n");
|
|
printf("== page table walk with several requests to SDRAM for almost every ==\n");
|
|
printf("== memory access (though 64MiB is not nearly large enough to experience ==\n");
|
|
printf("== this effect to its fullest). ==\n");
|
|
printf("== ==\n");
|
|
printf("== Note 1: All the numbers are representing extra time, which needs to ==\n");
|
|
printf("== be added to L1 cache latency. The cycle timings for L1 cache ==\n");
|
|
printf("== latency can be usually found in the processor documentation. ==\n");
|
|
printf("== Note 2: Dual random read means that we are simultaneously performing ==\n");
|
|
printf("== two independent memory accesses at a time. In the case if ==\n");
|
|
printf("== the memory subsystem can't handle multiple outstanding ==\n");
|
|
printf("== requests, dual random read has the same timings as two ==\n");
|
|
printf("== single reads performed one after another. ==\n");
|
|
printf("==========================================================================\n\n");
|
|
|
|
consoleUpdate(NULL);
|
|
printClock();
|
|
|
|
if (!latency_bench(latbench_size, latbench_count, -1, 0) ||
|
|
!latency_bench(latbench_size, latbench_count, 1, 0))
|
|
{
|
|
latency_bench(latbench_size, latbench_count, 0, 0);
|
|
}
|
|
|
|
printf("\nPress A to continue, any other key to exit.\n\n");
|
|
waitForKeyA();
|
|
consoleClear();
|
|
goto loop;
|
|
|
|
// Deinitialize and clean up resources used by the console (important!)
|
|
consoleExit(NULL);
|
|
return 0;
|
|
}
|