diff --git a/SdOut/switch/TinyMemBenchNX.nro b/SdOut/switch/TinyMemBenchNX.nro index 2379d083..77445648 100644 Binary files a/SdOut/switch/TinyMemBenchNX.nro and b/SdOut/switch/TinyMemBenchNX.nro differ diff --git a/Source/Atmosphere/ldr_pcv_patch.cpp b/Source/Atmosphere/ldr_pcv_patch.cpp index a8b3fe50..b91e247a 100644 --- a/Source/Atmosphere/ldr_pcv_patch.cpp +++ b/Source/Atmosphere/ldr_pcv_patch.cpp @@ -214,10 +214,10 @@ namespace ams::ldr { { 0x143AC4, 0x144F04, }, }; - constexpr u32 EmcVoltageMin = 300000; // 250000mV + /* constexpr u32 EmcVoltageMin = 250000; */ constexpr u32 EmcVoltageDef = 650000; // 600000mV static_assert(sizeof(EmcVoltageMinOffsets) == sizeof(EmcVoltageDefOffsets)); - static_assert(EmcVoltageMin <= EmcVoltageDef && NewEmcVoltageDef <= EmcVoltageMax); + static_assert(NewEmcVoltageDef <= EmcVoltageMax); #endif }; @@ -277,7 +277,7 @@ namespace ams::ldr { else if(spl::GetSocType() == spl::SocType_Mariko) { for(u32 j = 0; j < sizeof(Mariko::EmcVoltageMinOffsets[i])/sizeof(u32); j++) { AMS_ABORT_UNLESS(Mariko::EmcVoltageMinOffsets[i][j] <= mapped_size); - std::memcpy(mapped_module + Mariko::EmcVoltageMinOffsets[i][j], &Mariko::EmcVoltageMin, sizeof(Mariko::EmcVoltageMin)); + //std::memcpy(mapped_module + Mariko::EmcVoltageMinOffsets[i][j], &Mariko::EmcVoltageMin, sizeof(Mariko::EmcVoltageMin)); std::memcpy(mapped_module + Mariko::EmcVoltageDefOffsets[i][j], &Mariko::EmcVoltageDef, sizeof(Mariko::EmcVoltageDef)); } } diff --git a/Source/TinyMemBenchNX/Makefile b/Source/TinyMemBenchNX/Makefile index 82a6cf81..a19db49e 100644 --- a/Source/TinyMemBenchNX/Makefile +++ b/Source/TinyMemBenchNX/Makefile @@ -51,7 +51,7 @@ TARGET_VERSION := 0.4.9 #--------------------------------------------------------------------------------- ARCH := -march=armv8-a+crc+crypto -mtune=cortex-a57 -mtp=soft -fPIE -fPIC -CFLAGS := -g -Wall -O3 -ffunction-sections -Wno-unused-variable -Wno-unused-but-set-variable \ +CFLAGS := -g -Wall -O2 -ffunction-sections \ $(ARCH) $(DEFINES) CFLAGS += $(INCLUDE) -D__SWITCH__ @@ -59,9 +59,9 @@ CFLAGS += $(INCLUDE) -D__SWITCH__ CXXFLAGS := $(CFLAGS) -fno-rtti -fno-exceptions ASFLAGS := -g $(ARCH) -LDFLAGS = -specs=$(DEVKITPRO)/libnx/switch.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map) +LDFLAGS = -specs=$(DEVKITPRO)/libnx/switch.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map) -pthread -LIBS := -lnx +LIBS := -lnx -lm #--------------------------------------------------------------------------------- # list of directories containing libraries, this must be the top level containing diff --git a/Source/TinyMemBenchNX/source/aarch64-asm.h b/Source/TinyMemBenchNX/source/aarch64-asm.h index a1a64e31..8681abc2 100644 --- a/Source/TinyMemBenchNX/source/aarch64-asm.h +++ b/Source/TinyMemBenchNX/source/aarch64-asm.h @@ -30,6 +30,12 @@ extern "C" { #endif +void aligned_block_read_ldp_x_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_read_ldp_q_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); void aligned_block_copy_ldpstp_x_aarch64(int64_t * __restrict dst, int64_t * __restrict src, int size); diff --git a/Source/TinyMemBenchNX/source/aarch64-asm.s b/Source/TinyMemBenchNX/source/aarch64-asm.s index 842b9e2d..82b98082 100755 --- a/Source/TinyMemBenchNX/source/aarch64-asm.s +++ b/Source/TinyMemBenchNX/source/aarch64-asm.s @@ -23,7 +23,7 @@ #ifdef __aarch64__ - .cpu cortex-a53+fp+simd + .cpu cortex-a57+fp+simd .text .align 2 @@ -39,6 +39,18 @@ SIZE .req x2 .endm +asm_function aligned_block_read_ldp_x_aarch64 +0: + ldp x3, x4, [DST, #(0 * 16)] + ldp x5, x6, [DST, #(1 * 16)] + ldp x7, x8, [DST, #(2 * 16)] + ldp x9, x10, [DST, #(3 * 16)] + add DST, DST, #64 + subs SIZE, SIZE, #64 + bgt 0b + ret +.endfunc + asm_function aligned_block_copy_ldpstp_x_aarch64 0: ldp x3, x4, [SRC, #(0 * 16)] @@ -56,6 +68,16 @@ asm_function aligned_block_copy_ldpstp_x_aarch64 ret .endfunc +asm_function aligned_block_read_ldp_q_aarch64 +0: + ldp q0, q1, [DST, #(0 * 32)] + ldp q2, q3, [DST, #(1 * 32)] + add DST, DST, #64 + subs SIZE, SIZE, #64 + bgt 0b + ret +.endfunc + asm_function aligned_block_copy_ldpstp_q_aarch64 0: ldp q0, q1, [SRC, #(0 * 32)] diff --git a/Source/TinyMemBenchNX/source/main.cpp b/Source/TinyMemBenchNX/source/main.c similarity index 78% rename from Source/TinyMemBenchNX/source/main.cpp rename to Source/TinyMemBenchNX/source/main.c index 276a2121..5bb0257f 100644 --- a/Source/TinyMemBenchNX/source/main.cpp +++ b/Source/TinyMemBenchNX/source/main.c @@ -20,6 +20,8 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * + * pthread fork by sun409 (https://github.com/sun409/tinymembench-pthread) + * * Switch port by Kazushi and built with libnx. */ @@ -30,9 +32,13 @@ #include #include #include -#include #include +// Multi-thread support +#include +#include +#include + #define __ASM_OPT_H__ #define SIZE (32 * 1024 * 1024) #define BLOCKSIZE 2048 @@ -49,10 +55,79 @@ #include "aarch64-asm.h" #include -using namespace std; - PadState pad; +struct f_data +{ + void (*func)(int64_t *, int64_t *, int); + int64_t *arg1; + int64_t *arg2; + int arg3; +}; + +pthread_cond_t p_ready, p_start; +pthread_mutex_t p_lock; +pthread_t *p_worker = NULL; +struct f_data *worker_data = NULL; +int p_worker_not_ready, p_workers_ready; + +void *thread_func(void *data) +{ + struct f_data *data_ptr = data; + + pthread_mutex_lock(&p_lock); + p_worker_not_ready--; + + if (!p_worker_not_ready) + pthread_cond_signal(&p_ready); + + while (p_workers_ready != 1) + pthread_cond_wait(&p_start, &p_lock); + + pthread_mutex_unlock(&p_lock); + + (data_ptr->func)(data_ptr->arg1, data_ptr->arg2, data_ptr->arg3); + + pthread_exit(NULL); +} + +static void parallel_run(void) +{ + pthread_mutex_lock(&p_lock); + p_workers_ready = 1; + pthread_mutex_unlock(&p_lock); + pthread_cond_broadcast(&p_start); +} + +static void parallel_init(int threads) +{ + pthread_attr_t attr; + + pthread_cond_init(&p_ready, NULL); + pthread_cond_init(&p_start, NULL); + pthread_mutex_init(&p_lock, NULL); + p_worker_not_ready = threads; + p_workers_ready = 0; + pthread_attr_init(&attr); + + if (!p_worker || !worker_data) + { + p_worker = (pthread_t *)malloc(threads * sizeof(pthread_t)); + worker_data = (struct f_data *)malloc(threads * sizeof(struct f_data)); + } + + for (int i = 0; i < threads; i++) + { + pthread_create(p_worker + i, &attr, thread_func, worker_data + i); + } + + pthread_mutex_lock(&p_lock); + while (p_worker_not_ready != 0) + pthread_cond_wait(&p_ready, &p_lock); + + pthread_mutex_unlock(&p_lock); +} + typedef struct { const char *description; @@ -240,6 +315,24 @@ void aligned_block_copy_pf64(int64_t * __restrict dst_, } } +void aligned_block_fetch(int64_t * __restrict dst, + int64_t * __restrict src_, + int size) +{ + volatile int64_t *src = src_; + while ((size -= 64) >= 0) + { + *src++; + *src++; + *src++; + *src++; + *src++; + *src++; + *src++; + *src++; + } +} + void aligned_block_fill(int64_t * __restrict dst_, int64_t * __restrict src, int size) @@ -326,7 +419,8 @@ double gettime(void) return (double)((int64_t)tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.; } -static double bandwidth_bench_helper(int64_t *dstbuf, int64_t *srcbuf, +static double bandwidth_bench_helper(int threads, + int64_t *dstbuf, int64_t *srcbuf, int64_t *tmpbuf, int size, int blocksize, const char *indent_prefix, @@ -335,19 +429,30 @@ static double bandwidth_bench_helper(int64_t *dstbuf, int64_t *srcbuf, const char *description) { int i, j, loopcount, innerloopcount, n; - double t1, t2; + double t, t1, t2; double speed, maxspeed; double s, s0, s1, s2; /* do up to MAXREPEATS measurements */ - s = s0 = s1 = s2 = 0; - maxspeed = 0; + s = s0 = s1 = s2 = 0.; + maxspeed = 0.; for (n = 0; n < MAXREPEATS; n++) { - f(dstbuf, srcbuf, size); + parallel_init(threads); + for (int pt = 0; pt < threads; pt++) + { + (worker_data + pt)->func = f; + (worker_data + pt)->arg1 = dstbuf + size * pt / sizeof(int64_t); + (worker_data + pt)->arg2 = srcbuf + size * pt / sizeof(int64_t); + (worker_data + pt)->arg3 = size; + } + parallel_run(); + for (int pt = 0; pt < threads; pt++) + pthread_join(p_worker[pt], NULL); + loopcount = 0; innerloopcount = 1; - t1 = gettime(); + t = 0.; do { loopcount += innerloopcount; @@ -355,33 +460,49 @@ static double bandwidth_bench_helper(int64_t *dstbuf, int64_t *srcbuf, { for (i = 0; i < innerloopcount; i++) { + t1 = gettime(); for (j = 0; j < size; j += blocksize) - { + { f(tmpbuf, srcbuf + j / sizeof(int64_t), blocksize); f(dstbuf + j / sizeof(int64_t), tmpbuf, blocksize); } + t2 = gettime(); + t += t2 - t1; } } else { for (i = 0; i < innerloopcount; i++) { - f(dstbuf, srcbuf, size); + parallel_init(threads); + for (int pt = 0; pt < threads; ++pt) + { + (worker_data + pt)->func = f; + (worker_data + pt)->arg1 = dstbuf + size * pt / sizeof(int64_t); + (worker_data + pt)->arg2 = srcbuf + size * pt / sizeof(int64_t); + (worker_data + pt)->arg3 = size; + } + + t1 = gettime(); + parallel_run(); + for (int pt = 0; pt < threads; ++pt) + pthread_join(p_worker[pt], NULL); + t2 = gettime(); + t += t2 - t1; } } innerloopcount *= 2; - t2 = gettime(); - } while (t2 - t1 < 0.5); - speed = (double)size * loopcount / (t2 - t1) / 1000000.; + } while (t < 0.5); + speed = (double)size * (use_tmpbuf ? 1 : threads) * loopcount / t / 1000000.; - s0 += 1; + s0 += 1.; s1 += speed; s2 += speed * speed; if (speed > maxspeed) maxspeed = speed; - if (s0 > 2) + if (s0 > 2.) { s = sqrt((s0 * s2 - s1 * s1) / (s0 * (s0 - 1))); if (s < maxspeed / 1000.) @@ -391,24 +512,27 @@ static double bandwidth_bench_helper(int64_t *dstbuf, int64_t *srcbuf, if (maxspeed > 0 && s / maxspeed * 100. >= 0.1) { - printf("%s%-52s : %8.1f MB/s (%.1f%%)\n", indent_prefix, description, + printf("%s%-40s : %8.1f MB/s (%.1f%%)\n", indent_prefix, description, maxspeed, s / maxspeed * 100.); } else { - printf("%s%-52s : %8.1f MB/s\n", indent_prefix, description, maxspeed); + printf("%s%-40s : %8.1f MB/s\n", indent_prefix, description, maxspeed); } + consoleUpdate(NULL); return maxspeed; } -void bandwidth_bench(int64_t *dstbuf, int64_t *srcbuf, int64_t *tmpbuf, +void bandwidth_bench(int threads, + int64_t *dstbuf, int64_t *srcbuf, int64_t *tmpbuf, int size, int blocksize, const char *indent_prefix, bench_info *bi) { while (bi->f) { - bandwidth_bench_helper(dstbuf, srcbuf, tmpbuf, size, blocksize, + bandwidth_bench_helper(threads, + dstbuf, srcbuf, tmpbuf, size, blocksize, indent_prefix, bi->use_tmpbuf, bi->f, bi->description); @@ -428,14 +552,16 @@ void memset_wrapper(int64_t *dst, int64_t *src, int size) static bench_info aarch64_neon[] = { + { "NEON LDP", 0, aligned_block_read_ldp_q_aarch64 }, { "NEON LDP/STP copy", 0, aligned_block_copy_ldpstp_q_aarch64 }, - { "NEON LDP/STP copy pldl2strm (32 bytes step)", 0, aligned_block_copy_ldpstp_q_pf32_l2strm_aarch64 }, - { "NEON LDP/STP copy pldl2strm (64 bytes step)", 0, aligned_block_copy_ldpstp_q_pf64_l2strm_aarch64 }, - { "NEON LDP/STP copy pldl1keep (32 bytes step)", 0, aligned_block_copy_ldpstp_q_pf32_l1keep_aarch64 }, - { "NEON LDP/STP copy pldl1keep (64 bytes step)", 0, aligned_block_copy_ldpstp_q_pf64_l1keep_aarch64 }, + { "NEON LDP/STP copy pldl2strm (32B step)", 0, aligned_block_copy_ldpstp_q_pf32_l2strm_aarch64 }, + { "NEON LDP/STP copy pldl2strm (64B step)", 0, aligned_block_copy_ldpstp_q_pf64_l2strm_aarch64 }, + { "NEON LDP/STP copy pldl1keep (32B step)", 0, aligned_block_copy_ldpstp_q_pf32_l1keep_aarch64 }, + { "NEON LDP/STP copy pldl1keep (64B step)", 0, aligned_block_copy_ldpstp_q_pf64_l1keep_aarch64 }, { "NEON LD1/ST1 copy", 0, aligned_block_copy_ld1st1_aarch64 }, { "NEON STP fill", 0, aligned_block_fill_stp_q_aarch64 }, { "NEON STNP fill", 0, aligned_block_fill_stnp_q_aarch64 }, + { "ARM LDP", 0, aligned_block_read_ldp_x_aarch64 }, { "ARM LDP/STP copy", 0, aligned_block_copy_ldpstp_x_aarch64 }, { "ARM STP fill", 0, aligned_block_fill_stp_x_aarch64 }, { "ARM STNP fill", 0, aligned_block_fill_stnp_x_aarch64 }, @@ -450,18 +576,19 @@ bench_info *get_asm_benchmarks(void) static bench_info c_benchmarks[] = { { "C copy backwards", 0, aligned_block_copy_backwards }, - { "C copy backwards (32 byte blocks)", 0, aligned_block_copy_backwards_bs32 }, - { "C copy backwards (64 byte blocks)", 0, aligned_block_copy_backwards_bs64 }, + { "C copy backwards (32B blocks)", 0, aligned_block_copy_backwards_bs32 }, + { "C copy backwards (64B blocks)", 0, aligned_block_copy_backwards_bs64 }, { "C copy", 0, aligned_block_copy }, - { "C copy prefetched (32 bytes step)", 0, aligned_block_copy_pf32 }, - { "C copy prefetched (64 bytes step)", 0, aligned_block_copy_pf64 }, - { "C 2-pass copy", 1, aligned_block_copy }, - { "C 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_pf32 }, - { "C 2-pass copy prefetched (64 bytes step)", 1, aligned_block_copy_pf64 }, + { "C copy prefetched (32B step)", 0, aligned_block_copy_pf32 }, + { "C copy prefetched (64B step)", 0, aligned_block_copy_pf64 }, + // { "C 2-pass copy", 1, aligned_block_copy }, + // { "C 2-pass copy prefetched (32B step)", 1, aligned_block_copy_pf32 }, + // { "C 2-pass copy prefetched (64B step)", 1, aligned_block_copy_pf64 }, + { "C fetch", 0, aligned_block_fetch }, { "C fill", 0, aligned_block_fill }, - { "C fill (shuffle within 16 byte blocks)", 0, aligned_block_fill_shuffle16 }, - { "C fill (shuffle within 32 byte blocks)", 0, aligned_block_fill_shuffle32 }, - { "C fill (shuffle within 64 byte blocks)", 0, aligned_block_fill_shuffle64 }, + { "C fill (shuffle within 16B blocks)", 0, aligned_block_fill_shuffle16 }, + { "C fill (shuffle within 32B blocks)", 0, aligned_block_fill_shuffle32 }, + { "C fill (shuffle within 64B blocks)", 0, aligned_block_fill_shuffle64 }, { NULL, 0, NULL } }; @@ -525,41 +652,7 @@ static void __attribute__((noinline)) random_read_test(char *zerobuffer, uint32_t seed = 0; uintptr_t addrmask = (1 << nbits) - 1; uint32_t v; - static volatile uint32_t dummy; -#ifdef __arm__ - uint32_t tmp; - __asm__ volatile ( - "subs %[count], %[count], #16\n" - "blt 1f\n" - "0:\n" - "subs %[count], %[count], #16\n" - ".rept 16\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "and %[v], %[xFF], %[seed], lsr #16\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "and %[tmp], %[xFF00], %[seed], lsr #8\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "orr %[v], %[v], %[tmp]\n" - "and %[tmp], %[x7FFF0000], %[seed]\n" - "orr %[v], %[v], %[tmp]\n" - "and %[v], %[v], %[addrmask]\n" - "ldrb %[v], [%[zerobuffer], %[v]]\n" - "orr %[seed], %[seed], %[v]\n" - ".endr\n" - "bge 0b\n" - "1:\n" - "add %[count], %[count], #16\n" - : [count] "+&r" (count), - [seed] "+&r" (seed), [v] "=&r" (v), - [tmp] "=&r" (tmp) - : [c1103515245] "r" (1103515245), [c12345] "r" (12345), - [xFF00] "r" (0xFF00), [xFF] "r" (0xFF), - [x7FFF0000] "r" (0x7FFF0000), - [zerobuffer] "r" (zerobuffer), - [addrmask] "r" (addrmask) - : "cc"); -#else #define RANDOM_MEM_ACCESS() \ seed = seed * 1103515245 + 12345; \ v = (seed >> 16) & 0xFF; \ @@ -588,8 +681,6 @@ static void __attribute__((noinline)) random_read_test(char *zerobuffer, RANDOM_MEM_ACCESS(); count -= 16; } -#endif - dummy = seed; #undef RANDOM_MEM_ACCESS } @@ -599,51 +690,6 @@ static void __attribute__((noinline)) random_dual_read_test(char *zerobuffer, uint32_t seed = 0; uintptr_t addrmask = (1 << nbits) - 1; uint32_t v1, v2; - static volatile uint32_t dummy; - -#ifdef __arm__ - uint32_t tmp; - __asm__ volatile ( - "subs %[count], %[count], #16\n" - "blt 1f\n" - "0:\n" - "subs %[count], %[count], #16\n" - ".rept 16\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "and %[v1], %[xFF00], %[seed], lsr #8\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "and %[v2], %[xFF00], %[seed], lsr #8\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "and %[tmp], %[x7FFF0000], %[seed]\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "orr %[v1], %[v1], %[tmp]\n" - "and %[tmp], %[x7FFF0000], %[seed]\n" - "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" - "orr %[v2], %[v2], %[tmp]\n" - "and %[tmp], %[xFF], %[seed], lsr #16\n" - "orr %[v2], %[v2], %[seed], lsr #24\n" - "orr %[v1], %[v1], %[tmp]\n" - "and %[v2], %[v2], %[addrmask]\n" - "eor %[v1], %[v1], %[v2]\n" - "and %[v1], %[v1], %[addrmask]\n" - "ldrb %[v2], [%[zerobuffer], %[v2]]\n" - "ldrb %[v1], [%[zerobuffer], %[v1]]\n" - "orr %[seed], %[seed], %[v2]\n" - "add %[seed], %[seed], %[v1]\n" - ".endr\n" - "bge 0b\n" - "1:\n" - "add %[count], %[count], #16\n" - : [count] "+&r" (count), - [seed] "+&r" (seed), [v1] "=&r" (v1), [v2] "=&r" (v2), - [tmp] "=&r" (tmp) - : [c1103515245] "r" (1103515245), [c12345] "r" (12345), - [xFF00] "r" (0xFF00), [xFF] "r" (0xFF), - [x7FFF0000] "r" (0x7FFF0000), - [zerobuffer] "r" (zerobuffer), - [addrmask] "r" (addrmask) - : "cc"); -#else #define RANDOM_MEM_ACCESS() \ seed = seed * 1103515245 + 12345; \ v1 = (seed >> 8) & 0xFF00; \ @@ -680,8 +726,6 @@ static void __attribute__((noinline)) random_dual_read_test(char *zerobuffer, RANDOM_MEM_ACCESS(); count -= 16; } -#endif - dummy = seed; #undef RANDOM_MEM_ACCESS } @@ -697,8 +741,8 @@ static uint32_t rand32() int latency_bench(int size, int count, int use_hugepage) { double t, t2, t_before, t_after, t_noaccess, t_noaccess2 = 0; - double xs, xs0, xs1, xs2; - double ys, ys0, ys1, ys2; + double xs, xs1, xs2; + double ys, ys1, ys2; double min_t, min_t2; int nbits, n; char *buffer, *buffer_alloc; @@ -852,13 +896,14 @@ int main(int argc, char* argv[]) int64_t *srcbuf, *dstbuf, *tmpbuf; void *poolbuf; size_t bufsize = SIZE; + int threads = 2; - printf("tinymembench v0.4.9 (simple benchmark for memory throughput and latency)\n"); + printf("TinyMemBenchNX v0.4.10\n\ +(based on tinymembench-pthread, a multi-thread fork of simple benchmark for memory throughput and latency)\n"); - - poolbuf = alloc_four_nonaliased_buffers((void **)&srcbuf, bufsize, - (void **)&dstbuf, bufsize, - (void **)&tmpbuf, BLOCKSIZE, + poolbuf = alloc_four_nonaliased_buffers((void **)&srcbuf, bufsize * threads, + (void **)&dstbuf, bufsize * threads, + (void **)&tmpbuf, BLOCKSIZE * threads, NULL, 0); printf("\n"); printf("==========================================================================\n"); @@ -875,21 +920,49 @@ int main(int argc, char* argv[]) printf("== brackets ==\n"); printf("==========================================================================\n\n"); + printf("!!! Memory bandwidth heavily depends on CPU clock. !!!\n\n"); + printf("\ +Press A to start bandwidth test @ 1 thread.\n\ +Press B to start bandwidth test @ 2 threads.\n\ +Press any other key to exit.\n\n"); + consoleUpdate(NULL); - printf("!!! Memory bandwidth heavily depends on CPU clock. !!!\n\n"); - printClock(); - printf("Press A to start bandwidth test, any other key to exit.\n\n"); - waitForKeyA(); + while (appletMainLoop()) + { + padUpdate(&pad); - bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", c_benchmarks); + u64 kDown = padGetButtonsDown(&pad); + + if (kDown & HidNpadButton_A) + { + threads = 1; + break; + } + else if (kDown & HidNpadButton_B) + { + threads = 2; + break; + } + else if (kDown) + { + consoleExit(NULL); + exit(0); + } + } + + printClock(); + printf("== Thread: %d ==\n", threads); + consoleUpdate(NULL); + + bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", c_benchmarks); printf(" ---\n"); - bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", libc_benchmarks); + bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", libc_benchmarks); bench_info *bi = get_asm_benchmarks(); if (bi->f) { printf(" ---\n"); - bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi); + bandwidth_bench(threads, dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi); } free(poolbuf);