diff --git a/README.md b/README.md index d90b7f6f..fb920700 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ Support latest Horizon OS (12.1.0) and Atmosphere (0.20.0). - Profile-aware clock override for all games - Game recording and SysDVR streaming @ 60fps with high video bitrate - Remove copyright watermark in screenshots/recordings, courtesy of [HookedBehemoth](https://github.com/HookedBehemoth/exefs_patches) +- **TinyMemBenchNX**: DRAM throughput and latency test based on [tinymembench](https://github.com/ssvb/tinymembench) #### Details diff --git a/SdOut/atmosphere/config/system_settings.ini b/SdOut/atmosphere/config/system_settings.ini index ac378ed6..b15fb71e 100644 --- a/SdOut/atmosphere/config/system_settings.ini +++ b/SdOut/atmosphere/config/system_settings.ini @@ -23,7 +23,7 @@ enable_halfawake = u32!0x0 minimum_interval_normal = u32!0x7FFFFFFF minimum_interval_save = u32!0x7FFFFFFF battery_threshold_save = u32!0x64 -battery_threshold_stop = u32!0x0 +battery_threshold_stop = u32!0x64 [npns] background_processing = u8!0x0 diff --git a/SdOut/atmosphere/contents/00FF0000636C6BFF/exefs.nsp b/SdOut/atmosphere/contents/00FF0000636C6BFF/exefs.nsp index 188e7522..fc4df389 100644 Binary files a/SdOut/atmosphere/contents/00FF0000636C6BFF/exefs.nsp and b/SdOut/atmosphere/contents/00FF0000636C6BFF/exefs.nsp differ diff --git a/SdOut/switch/.overlays/sys-clk-overlay.ovl b/SdOut/switch/.overlays/sys-clk-overlay.ovl index d1c83fb8..c8f02163 100644 Binary files a/SdOut/switch/.overlays/sys-clk-overlay.ovl and b/SdOut/switch/.overlays/sys-clk-overlay.ovl differ diff --git a/SdOut/switch/TinyMemBenchNX.nro b/SdOut/switch/TinyMemBenchNX.nro new file mode 100644 index 00000000..2379d083 Binary files /dev/null and b/SdOut/switch/TinyMemBenchNX.nro differ diff --git a/SdOut/switch/sys-clk-manager.nro b/SdOut/switch/sys-clk-manager.nro index 29610806..c7812a2d 100644 Binary files a/SdOut/switch/sys-clk-manager.nro and b/SdOut/switch/sys-clk-manager.nro differ diff 
--git a/Source/TinyMemBenchNX/Makefile b/Source/TinyMemBenchNX/Makefile new file mode 100644 index 00000000..82a6cf81 --- /dev/null +++ b/Source/TinyMemBenchNX/Makefile @@ -0,0 +1,224 @@ +#--------------------------------------------------------------------------------- +.SUFFIXES: +#--------------------------------------------------------------------------------- + +ifeq ($(strip $(DEVKITPRO)),) +$(error "Please set DEVKITPRO in your environment. export DEVKITPRO=/devkitpro") +endif + +TOPDIR ?= $(CURDIR) +include $(DEVKITPRO)/libnx/switch_rules + +#--------------------------------------------------------------------------------- +# TARGET is the name of the output +# BUILD is the directory where object files & intermediate files will be placed +# SOURCES is a list of directories containing source code +# DATA is a list of directories containing data files +# INCLUDES is a list of directories containing header files +# ROMFS is the directory containing data to be added to RomFS, relative to the Makefile (Optional) +# +# NO_ICON: if set to anything, do not use icon. +# NO_NACP: if set to anything, no .nacp file is generated. +# APP_TITLE is the name of the app stored in the .nacp file (Optional) +# APP_AUTHOR is the author of the app stored in the .nacp file (Optional) +# APP_VERSION is the version of the app stored in the .nacp file (Optional) +# APP_TITLEID is the titleID of the app stored in the .nacp file (Optional) +# ICON is the filename of the icon (.jpg), relative to the project folder. +# If not set, it attempts to use one of the following (in this order): +# - .jpg +# - icon.jpg +# - /default_icon.jpg +# +# CONFIG_JSON is the filename of the NPDM config file (.json), relative to the project folder. +# If not set, it attempts to use one of the following (in this order): +# - .json +# - config.json +# If a JSON file is provided or autodetected, an ExeFS PFS0 (.nsp) is built instead +# of a homebrew executable (.nro). 
This is intended to be used for sysmodules. +# NACP building is skipped as well. +#--------------------------------------------------------------------------------- +TARGET := $(notdir $(CURDIR)) +BUILD := build +SOURCES := source +DATA := data +INCLUDES := include +#ROMFS := romfs + +TARGET_VERSION := 0.4.9 + +#--------------------------------------------------------------------------------- +# options for code generation +#--------------------------------------------------------------------------------- +ARCH := -march=armv8-a+crc+crypto -mtune=cortex-a57 -mtp=soft -fPIE -fPIC + +CFLAGS := -g -Wall -O3 -ffunction-sections -Wno-unused-variable -Wno-unused-but-set-variable \ + $(ARCH) $(DEFINES) + +CFLAGS += $(INCLUDE) -D__SWITCH__ + +CXXFLAGS := $(CFLAGS) -fno-rtti -fno-exceptions + +ASFLAGS := -g $(ARCH) +LDFLAGS = -specs=$(DEVKITPRO)/libnx/switch.specs -g $(ARCH) -Wl,-Map,$(notdir $*.map) + +LIBS := -lnx + +#--------------------------------------------------------------------------------- +# list of directories containing libraries, this must be the top level containing +# include and lib +#--------------------------------------------------------------------------------- +LIBDIRS := $(PORTLIBS) $(LIBNX) + + +#--------------------------------------------------------------------------------- +# no real need to edit anything past this point unless you need to add additional +# rules for different file extensions +#--------------------------------------------------------------------------------- +ifneq ($(BUILD),$(notdir $(CURDIR))) +#--------------------------------------------------------------------------------- + +export OUTPUT := $(CURDIR)/$(TARGET) +export TOPDIR := $(CURDIR) + +export VPATH := $(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \ + $(foreach dir,$(DATA),$(CURDIR)/$(dir)) + +export DEPSDIR := $(CURDIR)/$(BUILD) + +CFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c))) +CPPFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard 
$(dir)/*.cpp))) +SFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s))) +BINFILES := $(foreach dir,$(DATA),$(notdir $(wildcard $(dir)/*.*))) + +#--------------------------------------------------------------------------------- +# use CXX for linking C++ projects, CC for standard C +#--------------------------------------------------------------------------------- +ifeq ($(strip $(CPPFILES)),) +#--------------------------------------------------------------------------------- + export LD := $(CC) +#--------------------------------------------------------------------------------- +else +#--------------------------------------------------------------------------------- + export LD := $(CXX) +#--------------------------------------------------------------------------------- +endif +#--------------------------------------------------------------------------------- + +export OFILES_BIN := $(addsuffix .o,$(BINFILES)) +export OFILES_SRC := $(CPPFILES:.cpp=.o) $(CFILES:.c=.o) $(SFILES:.s=.o) +export OFILES := $(OFILES_BIN) $(OFILES_SRC) +export HFILES_BIN := $(addsuffix .h,$(subst .,_,$(BINFILES))) + +export INCLUDE := $(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \ + $(foreach dir,$(LIBDIRS),-I$(dir)/include) \ + -I$(CURDIR)/$(BUILD) + +export LIBPATHS := $(foreach dir,$(LIBDIRS),-L$(dir)/lib) + +ifeq ($(strip $(CONFIG_JSON)),) + jsons := $(wildcard *.json) + ifneq (,$(findstring $(TARGET).json,$(jsons))) + export APP_JSON := $(TOPDIR)/$(TARGET).json + else + ifneq (,$(findstring config.json,$(jsons))) + export APP_JSON := $(TOPDIR)/config.json + endif + endif +else + export APP_JSON := $(TOPDIR)/$(CONFIG_JSON) +endif + +ifeq ($(strip $(ICON)),) + icons := $(wildcard *.jpg) + ifneq (,$(findstring $(TARGET).jpg,$(icons))) + export APP_ICON := $(TOPDIR)/$(TARGET).jpg + else + ifneq (,$(findstring icon.jpg,$(icons))) + export APP_ICON := $(TOPDIR)/icon.jpg + endif + endif +else + export APP_ICON := $(TOPDIR)/$(ICON) +endif + +ifeq ($(strip $(NO_ICON)),) + export 
NROFLAGS += --icon=$(APP_ICON) +endif + +ifeq ($(strip $(NO_NACP)),) + export NROFLAGS += --nacp=$(CURDIR)/$(TARGET).nacp +endif + +ifneq ($(APP_TITLEID),) + export NACPFLAGS += --titleid=$(APP_TITLEID) +endif + +ifneq ($(ROMFS),) + export NROFLAGS += --romfsdir=$(CURDIR)/$(ROMFS) +endif + +.PHONY: $(BUILD) clean all + +#--------------------------------------------------------------------------------- +all: $(BUILD) + +$(BUILD): + @[ -d $@ ] || mkdir -p $@ + @$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile + +#--------------------------------------------------------------------------------- +clean: + @echo clean ... +ifeq ($(strip $(APP_JSON)),) + @rm -fr $(BUILD) $(TARGET).nro $(TARGET).nacp $(TARGET).elf +else + @rm -fr $(BUILD) $(TARGET).nsp $(TARGET).nso $(TARGET).npdm $(TARGET).elf +endif + + +#--------------------------------------------------------------------------------- +else +.PHONY: all + +DEPENDS := $(OFILES:.o=.d) + +#--------------------------------------------------------------------------------- +# main targets +#--------------------------------------------------------------------------------- +ifeq ($(strip $(APP_JSON)),) + +all : $(OUTPUT).nro + +ifeq ($(strip $(NO_NACP)),) +$(OUTPUT).nro : $(OUTPUT).elf $(OUTPUT).nacp +else +$(OUTPUT).nro : $(OUTPUT).elf +endif + +else + +all : $(OUTPUT).nsp + +$(OUTPUT).nsp : $(OUTPUT).nso $(OUTPUT).npdm + +$(OUTPUT).nso : $(OUTPUT).elf + +endif + +$(OUTPUT).elf : $(OFILES) + +$(OFILES_SRC) : $(HFILES_BIN) + +#--------------------------------------------------------------------------------- +# you need a rule like this for each extension you use as binary data +#--------------------------------------------------------------------------------- +%.bin.o %_bin.h : %.bin +#--------------------------------------------------------------------------------- + @echo $(notdir $<) + @$(bin2o) + +-include $(DEPENDS) + +#--------------------------------------------------------------------------------------- 
+endif +#--------------------------------------------------------------------------------------- \ No newline at end of file diff --git a/Source/TinyMemBenchNX/source/aarch64-asm.h b/Source/TinyMemBenchNX/source/aarch64-asm.h new file mode 100644 index 00000000..a1a64e31 --- /dev/null +++ b/Source/TinyMemBenchNX/source/aarch64-asm.h @@ -0,0 +1,74 @@ +/* + * Copyright © 2016 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __AARCH64_ASM_H__ +#define __AARCH64_ASM_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void aligned_block_copy_ldpstp_x_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_copy_ldpstp_q_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_copy_ld1st1_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); + +void aligned_block_copy_ldpstp_q_pf32_l2strm_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_copy_ldpstp_q_pf64_l2strm_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_copy_ldpstp_q_pf32_l1keep_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_copy_ldpstp_q_pf64_l1keep_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); + +void aligned_block_fill_stp_x_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_fill_stp_q_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); + +void aligned_block_fill_stnp_x_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); +void aligned_block_fill_stnp_q_aarch64(int64_t * __restrict dst, + int64_t * __restrict src, + int size); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/Source/TinyMemBenchNX/source/aarch64-asm.s b/Source/TinyMemBenchNX/source/aarch64-asm.s new file mode 100755 index 00000000..842b9e2d --- /dev/null +++ b/Source/TinyMemBenchNX/source/aarch64-asm.s @@ -0,0 +1,185 @@ +/* + * Copyright © 2016 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or 
/*
 * asm_function <name>
 *
 * Opens an exported function: emits the .global/.type/.func directives and
 * the entry label, then (re)binds the symbolic argument names used by every
 * routine in this file:
 *
 *   DST  - x0, first argument  (int64_t *dst)
 *   SRC  - x1, second argument (int64_t *src)
 *   SIZE - w2, third argument  (int size)
 *
 * SIZE deliberately aliases the 32-bit view w2 rather than x2. The C
 * prototypes declare the third parameter as a 32-bit `int`, and the AArch64
 * procedure call standard leaves the upper 32 bits of the argument register
 * unspecified in that case. Counting down in x2 would therefore fold
 * whatever the caller left in bits 63:32 into the loop counter. With w2 the
 * `subs SIZE, SIZE, #64` / `bgt` pairs operate on exactly the value the
 * caller passed.
 *
 * Each use must be closed with `.endfunc`.
 */
.macro asm_function function_name
    .global \function_name
    .type \function_name,%function
.func \function_name
\function_name:
    DST         .req x0
    SRC         .req x1
    SIZE        .req w2
.endm
/*
 * aligned_block_fill_stp_x_aarch64(dst, src, size)
 *
 * Fill benchmark kernel: writes 64 bytes per iteration to DST using four
 * STP stores of general-purpose register pairs (x3..x10), advancing DST and
 * decrementing SIZE by 64 each time.
 *
 * x3..x10 are never loaded in this routine, so the bytes written are
 * whatever those registers hold on entry; SRC is not read at all.
 *
 * The loop body runs once before SIZE is tested (the check is at the
 * bottom), so at least 64 bytes are always written.
 * NOTE(review): callers are presumably expected to pass a positive
 * multiple of 64 - confirm against the call sites.
 */
asm_function aligned_block_fill_stp_x_aarch64
0:
    stp         x3, x4, [DST, #(0 * 16)]
    stp         x5, x6, [DST, #(1 * 16)]
    stp         x7, x8, [DST, #(2 * 16)]
    stp         x9, x10, [DST, #(3 * 16)]
    add         DST, DST, #64
    subs        SIZE, SIZE, #64    /* sets flags for the branch below */
    bgt         0b                 /* keep going while SIZE is still > 0 */
    ret
.endfunc
/*
 * aligned_block_copy_ld1st1_aarch64(dst, src, size)
 *
 * Copy benchmark kernel: moves 64 bytes per iteration from SRC to DST with
 * one four-register LD1 (v0..v3, 16 bytes each) followed by the matching
 * four-register ST1, then advances both pointers and decrements SIZE by 64.
 *
 * The loop body runs once before SIZE is tested (the check is at the
 * bottom), so at least 64 bytes are always copied.
 * NOTE(review): callers are presumably expected to pass a positive
 * multiple of 64 - confirm against the call sites.
 */
asm_function aligned_block_copy_ld1st1_aarch64
0:
    ld1         {v0.16b, v1.16b, v2.16b, v3.16b}, [SRC]
    st1         {v0.16b, v1.16b, v2.16b, v3.16b}, [DST]
    add         SRC, SRC, #64
    add         DST, DST, #64
    subs        SIZE, SIZE, #64    /* sets flags for the branch below */
    bgt         0b                 /* keep going while SIZE is still > 0 */
    ret
.endfunc
/*
 * Round `ptr` up to the next address that is a multiple of `align`.
 * An already aligned pointer is returned unchanged. The result is only
 * meaningful when `align` is a power of two, because the rounding is done
 * by masking off the low address bits.
 */
static char *align_up(char *ptr, int align)
{
    const uintptr_t low_bits = (uintptr_t)(align - 1);
    const uintptr_t bumped = (uintptr_t)ptr + align - 1;

    return (char *)(bumped & ~low_bits);
}
/*
 * Benchmark kernel: forward copy of `size` bytes from `src` to `dst_` in
 * 64-byte iterations, issuing one software prefetch for every 32 bytes
 * copied (two per iteration).
 *
 * dst_  destination buffer; written through a volatile pointer so every
 *       64-bit store is emitted as written.
 * src   source buffer.
 * size  byte count; only whole 64-byte chunks are copied, any remainder
 *       below 64 bytes is left untouched.
 */
void aligned_block_copy_pf32(int64_t * __restrict dst_,
                             int64_t * __restrict src,
                             int size)
{
    volatile int64_t *dst = dst_;
    int64_t t1, t2, t3, t4;
    /* The decrement happens before the test, so the loop stops as soon as
       fewer than 64 bytes remain. */
    while ((size -= 64) >= 0)
    {
        /* src is an int64_t pointer, so src + 32 is 256 bytes ahead of the
           current read position. Arguments (0, 0) request a read prefetch
           with the lowest temporal-locality hint. */
        __builtin_prefetch(src + 32, 0, 0);
        /* First 32 bytes: four loads, then four stores. */
        t1 = *src++;
        t2 = *src++;
        t3 = *src++;
        t4 = *src++;
        *dst++ = t1;
        *dst++ = t2;
        *dst++ = t3;
        *dst++ = t4;
        /* Second prefetch, again 256 bytes ahead of the advanced src. */
        __builtin_prefetch(src + 32, 0, 0);
        /* Second 32 bytes. */
        t1 = *src++;
        t2 = *src++;
        t3 = *src++;
        t4 = *src++;
        *dst++ = t1;
        *dst++ = t2;
        *dst++ = t3;
        *dst++ = t4;
    }
}
/*
 * Current time of day, as reported by gettimeofday(), expressed in seconds
 * as a double. The fractional part carries the microsecond field.
 */
double gettime(void)
{
    struct timeval now;
    int64_t total_usec;

    gettimeofday(&now, NULL);
    /* Combine both fields in 64-bit integer arithmetic first so that no
       precision is lost before the single conversion to double. */
    total_usec = (int64_t)now.tv_sec * 1000000 + now.tv_usec;
    return (double)total_usec / 1000000.;
}
/*
 * Adapter that gives memset() the (dst, src, size) signature used by the
 * benchmark tables. The fill value is taken from the first element of
 * `src`; memset() itself only uses the low byte of that value.
 */
void memset_wrapper(int64_t *dst, int64_t *src, int size)
{
    const int64_t pattern = *src;

    memset(dst, pattern, size);
}
"NEON STNP fill", 0, aligned_block_fill_stnp_q_aarch64 }, + { "ARM LDP/STP copy", 0, aligned_block_copy_ldpstp_x_aarch64 }, + { "ARM STP fill", 0, aligned_block_fill_stp_x_aarch64 }, + { "ARM STNP fill", 0, aligned_block_fill_stnp_x_aarch64 }, + { NULL, 0, NULL } +}; + +bench_info *get_asm_benchmarks(void) +{ + return aarch64_neon; +} + +static bench_info c_benchmarks[] = +{ + { "C copy backwards", 0, aligned_block_copy_backwards }, + { "C copy backwards (32 byte blocks)", 0, aligned_block_copy_backwards_bs32 }, + { "C copy backwards (64 byte blocks)", 0, aligned_block_copy_backwards_bs64 }, + { "C copy", 0, aligned_block_copy }, + { "C copy prefetched (32 bytes step)", 0, aligned_block_copy_pf32 }, + { "C copy prefetched (64 bytes step)", 0, aligned_block_copy_pf64 }, + { "C 2-pass copy", 1, aligned_block_copy }, + { "C 2-pass copy prefetched (32 bytes step)", 1, aligned_block_copy_pf32 }, + { "C 2-pass copy prefetched (64 bytes step)", 1, aligned_block_copy_pf64 }, + { "C fill", 0, aligned_block_fill }, + { "C fill (shuffle within 16 byte blocks)", 0, aligned_block_fill_shuffle16 }, + { "C fill (shuffle within 32 byte blocks)", 0, aligned_block_fill_shuffle32 }, + { "C fill (shuffle within 64 byte blocks)", 0, aligned_block_fill_shuffle64 }, + { NULL, 0, NULL } +}; + +static bench_info libc_benchmarks[] = +{ + { "standard memcpy", 0, memcpy_wrapper }, + { "standard memset", 0, memset_wrapper }, + { NULL, 0, NULL } +}; + +void *alloc_four_nonaliased_buffers(void **buf1_, int size1, + void **buf2_, int size2, + void **buf3_, int size3, + void **buf4_, int size4) +{ + char **buf1 = (char **)buf1_, **buf2 = (char **)buf2_; + char **buf3 = (char **)buf3_, **buf4 = (char **)buf4_; + int antialias_pattern_mask = (ALIGN_PADDING - 1) & ~(CACHE_LINE_SIZE - 1); + char *buf, *ptr; + + if (!buf1 || size1 < 0) + size1 = 0; + if (!buf2 || size2 < 0) + size2 = 0; + if (!buf3 || size3 < 0) + size3 = 0; + if (!buf4 || size4 < 0) + size4 = 0; + + ptr = buf = + (char *)malloc(size1 
+ size2 + size3 + size4 + 9 * ALIGN_PADDING); + memset(buf, 0xCC, size1 + size2 + size3 + size4 + 9 * ALIGN_PADDING); + + ptr = align_up(ptr, ALIGN_PADDING); + if (buf1) + { + *buf1 = ptr + (0xAAAAAAAA & antialias_pattern_mask); + ptr = align_up(*buf1 + size1, ALIGN_PADDING); + } + if (buf2) + { + *buf2 = ptr + (0x55555555 & antialias_pattern_mask); + ptr = align_up(*buf2 + size2, ALIGN_PADDING); + } + if (buf3) + { + *buf3 = ptr + (0xCCCCCCCC & antialias_pattern_mask); + ptr = align_up(*buf3 + size3, ALIGN_PADDING); + } + if (buf4) + { + *buf4 = ptr + (0x33333333 & antialias_pattern_mask); + } + + return buf; +} + +static void __attribute__((noinline)) random_read_test(char *zerobuffer, + int count, int nbits) +{ + uint32_t seed = 0; + uintptr_t addrmask = (1 << nbits) - 1; + uint32_t v; + static volatile uint32_t dummy; + +#ifdef __arm__ + uint32_t tmp; + __asm__ volatile ( + "subs %[count], %[count], #16\n" + "blt 1f\n" + "0:\n" + "subs %[count], %[count], #16\n" + ".rept 16\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "and %[v], %[xFF], %[seed], lsr #16\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "and %[tmp], %[xFF00], %[seed], lsr #8\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "orr %[v], %[v], %[tmp]\n" + "and %[tmp], %[x7FFF0000], %[seed]\n" + "orr %[v], %[v], %[tmp]\n" + "and %[v], %[v], %[addrmask]\n" + "ldrb %[v], [%[zerobuffer], %[v]]\n" + "orr %[seed], %[seed], %[v]\n" + ".endr\n" + "bge 0b\n" + "1:\n" + "add %[count], %[count], #16\n" + : [count] "+&r" (count), + [seed] "+&r" (seed), [v] "=&r" (v), + [tmp] "=&r" (tmp) + : [c1103515245] "r" (1103515245), [c12345] "r" (12345), + [xFF00] "r" (0xFF00), [xFF] "r" (0xFF), + [x7FFF0000] "r" (0x7FFF0000), + [zerobuffer] "r" (zerobuffer), + [addrmask] "r" (addrmask) + : "cc"); +#else + #define RANDOM_MEM_ACCESS() \ + seed = seed * 1103515245 + 12345; \ + v = (seed >> 16) & 0xFF; \ + seed = seed * 1103515245 + 12345; \ + v |= (seed >> 8) & 0xFF00; \ + seed = seed * 
1103515245 + 12345; \ + v |= seed & 0x7FFF0000; \ + seed |= zerobuffer[v & addrmask]; + + while (count >= 16) { + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + count -= 16; + } +#endif + dummy = seed; + #undef RANDOM_MEM_ACCESS +} + +static void __attribute__((noinline)) random_dual_read_test(char *zerobuffer, + int count, int nbits) +{ + uint32_t seed = 0; + uintptr_t addrmask = (1 << nbits) - 1; + uint32_t v1, v2; + static volatile uint32_t dummy; + +#ifdef __arm__ + uint32_t tmp; + __asm__ volatile ( + "subs %[count], %[count], #16\n" + "blt 1f\n" + "0:\n" + "subs %[count], %[count], #16\n" + ".rept 16\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "and %[v1], %[xFF00], %[seed], lsr #8\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "and %[v2], %[xFF00], %[seed], lsr #8\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "and %[tmp], %[x7FFF0000], %[seed]\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "orr %[v1], %[v1], %[tmp]\n" + "and %[tmp], %[x7FFF0000], %[seed]\n" + "mla %[seed], %[c1103515245], %[seed], %[c12345]\n" + "orr %[v2], %[v2], %[tmp]\n" + "and %[tmp], %[xFF], %[seed], lsr #16\n" + "orr %[v2], %[v2], %[seed], lsr #24\n" + "orr %[v1], %[v1], %[tmp]\n" + "and %[v2], %[v2], %[addrmask]\n" + "eor %[v1], %[v1], %[v2]\n" + "and %[v1], %[v1], %[addrmask]\n" + "ldrb %[v2], [%[zerobuffer], %[v2]]\n" + "ldrb %[v1], [%[zerobuffer], %[v1]]\n" + "orr %[seed], %[seed], %[v2]\n" + "add %[seed], %[seed], %[v1]\n" + ".endr\n" + "bge 0b\n" + "1:\n" + "add %[count], %[count], #16\n" + : [count] "+&r" (count), + [seed] "+&r" (seed), [v1] "=&r" (v1), [v2] "=&r" (v2), + [tmp] "=&r" (tmp) + : [c1103515245] "r" 
(1103515245), [c12345] "r" (12345), + [xFF00] "r" (0xFF00), [xFF] "r" (0xFF), + [x7FFF0000] "r" (0x7FFF0000), + [zerobuffer] "r" (zerobuffer), + [addrmask] "r" (addrmask) + : "cc"); +#else + #define RANDOM_MEM_ACCESS() \ + seed = seed * 1103515245 + 12345; \ + v1 = (seed >> 8) & 0xFF00; \ + seed = seed * 1103515245 + 12345; \ + v2 = (seed >> 8) & 0xFF00; \ + seed = seed * 1103515245 + 12345; \ + v1 |= seed & 0x7FFF0000; \ + seed = seed * 1103515245 + 12345; \ + v2 |= seed & 0x7FFF0000; \ + seed = seed * 1103515245 + 12345; \ + v1 |= (seed >> 16) & 0xFF; \ + v2 |= (seed >> 24); \ + v2 &= addrmask; \ + v1 ^= v2; \ + seed |= zerobuffer[v2]; \ + seed += zerobuffer[v1 & addrmask]; + + while (count >= 16) { + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + RANDOM_MEM_ACCESS(); + count -= 16; + } +#endif + dummy = seed; + #undef RANDOM_MEM_ACCESS +} + +static uint32_t rand32() +{ + static int seed = 0; + uint32_t hi, lo; + hi = (seed = seed * 1103515245 + 12345) >> 16; + lo = (seed = seed * 1103515245 + 12345) >> 16; + return (hi << 16) + lo; +} + +int latency_bench(int size, int count, int use_hugepage) +{ + double t, t2, t_before, t_after, t_noaccess, t_noaccess2 = 0; + double xs, xs0, xs1, xs2; + double ys, ys0, ys1, ys2; + double min_t, min_t2; + int nbits, n; + char *buffer, *buffer_alloc; +#if !defined(__linux__) || !defined(MADV_HUGEPAGE) + if (use_hugepage) + return 0; + buffer_alloc = (char *)malloc(size + 4095); + if (!buffer_alloc) + return 0; + buffer = (char *)(((uintptr_t)buffer_alloc + 4095) & ~(uintptr_t)4095); +#else + if (posix_memalign((void **)&buffer_alloc, 4 * 1024 * 1024, size) != 0) + return 0; + buffer = buffer_alloc; + if (use_hugepage && madvise(buffer, 
size, use_hugepage > 0 ? + MADV_HUGEPAGE : MADV_NOHUGEPAGE) != 0) + { + free(buffer_alloc); + return 0; + } +#endif + memset(buffer, 0, size); + + for (n = 1; n <= MAXREPEATS; n++) + { + t_before = gettime(); + random_read_test(buffer, count, 1); + t_after = gettime(); + if (n == 1 || t_after - t_before < t_noaccess) + t_noaccess = t_after - t_before; + + t_before = gettime(); + random_dual_read_test(buffer, count, 1); + t_after = gettime(); + if (n == 1 || t_after - t_before < t_noaccess2) + t_noaccess2 = t_after - t_before; + } + + printf("\nblock size : single random read / dual random read"); + if (use_hugepage > 0) + printf(", [MADV_HUGEPAGE]\n"); + else if (use_hugepage < 0) + printf(", [MADV_NOHUGEPAGE]\n"); + else + printf("\n"); + + consoleUpdate(NULL); + + for (nbits = 10; (1 << nbits) <= size; nbits++) + { + int testsize = 1 << nbits; + xs1 = xs2 = ys = ys1 = ys2 = 0; + for (n = 1; n <= MAXREPEATS; n++) + { + int testoffs = (rand32() % (size / testsize)) * testsize; + + t_before = gettime(); + random_read_test(buffer + testoffs, count, nbits); + t_after = gettime(); + t = t_after - t_before - t_noaccess; + if (t < 0) t = 0; + + xs1 += t; + xs2 += t * t; + + if (n == 1 || t < min_t) + min_t = t; + + t_before = gettime(); + random_dual_read_test(buffer + testoffs, count, nbits); + t_after = gettime(); + t2 = t_after - t_before - t_noaccess2; + if (t2 < 0) t2 = 0; + + ys1 += t2; + ys2 += t2 * t2; + + if (n == 1 || t2 < min_t2) + min_t2 = t2; + + if (n > 2) + { + xs = sqrt((xs2 * n - xs1 * xs1) / (n * (n - 1))); + ys = sqrt((ys2 * n - ys1 * ys1) / (n * (n - 1))); + if (xs < min_t / 1000. && ys < min_t2 / 1000.) + break; + } + } + printf("%10d : %6.1f ns / %6.1f ns \n", (1 << nbits), + min_t * 1000000000. / count, min_t2 * 1000000000. 
/ count); + + consoleUpdate(NULL); + } + free(buffer_alloc); + return 1; +} + +void waitForKeyA() { + while (appletMainLoop()) + { + padUpdate(&pad); + + u64 kDown = padGetButtonsDown(&pad); + + if (kDown & HidNpadButton_A) + break; + else if(kDown) + { + consoleExit(NULL); + exit(0); + } + + consoleUpdate(NULL); + } +} + +void printClock() +{ + int res = 0; + uint32_t cpu_hz = 0, mem_hz = 0; + + ClkrstSession clkrstSession; + res = clkrstInitialize(); + if(R_FAILED(res)) { + fatalThrow(res); + } + + clkrstOpenSession(&clkrstSession, PcvModuleId_CpuBus, 3); + clkrstGetClockRate(&clkrstSession, &cpu_hz); + clkrstCloseSession(&clkrstSession); + clkrstOpenSession(&clkrstSession, PcvModuleId_EMC, 3); + clkrstGetClockRate(&clkrstSession, &mem_hz); + clkrstCloseSession(&clkrstSession); + + printf("== CPU: %u.%u MHz == MEM: %u.%u MHz ==\n", + cpu_hz/1000000, cpu_hz/100000 - cpu_hz/1000000*10, + mem_hz/1000000, mem_hz/100000 - mem_hz/1000000*10); + consoleUpdate(NULL); +} + +// Main program entrypoint +int main(int argc, char* argv[]) +{ + consoleInit(NULL); + + padConfigureInput(1, HidNpadStyleSet_NpadStandard); + + padInitializeDefault(&pad); + + int latbench_size = SIZE * 2, latbench_count = LATBENCH_COUNT; + int64_t *srcbuf, *dstbuf, *tmpbuf; + void *poolbuf; + size_t bufsize = SIZE; + + printf("tinymembench v0.4.9 (simple benchmark for memory throughput and latency)\n"); + + + poolbuf = alloc_four_nonaliased_buffers((void **)&srcbuf, bufsize, + (void **)&dstbuf, bufsize, + (void **)&tmpbuf, BLOCKSIZE, + NULL, 0); + printf("\n"); + printf("==========================================================================\n"); + printf("== Memory bandwidth tests ==\n"); + printf("== ==\n"); + printf("== Note 1: 1MB = 1000000 bytes ==\n"); + printf("== Note 2: Results for 'copy' tests show how many bytes can be ==\n"); + printf("== copied per second (adding together read and writen ==\n"); + printf("== bytes would have provided twice higher numbers) ==\n"); + printf("== Note 3: 
2-pass copy means that we are using a small temporary buffer ==\n"); + printf("== to first fetch data into it, and only then write it to the ==\n"); + printf("== destination (source -> L1 cache, L1 cache -> destination) ==\n"); + printf("== Note 4: If sample standard deviation exceeds 0.1%%, it is shown in ==\n"); + printf("== brackets ==\n"); + printf("==========================================================================\n\n"); + + consoleUpdate(NULL); + + printf("!!! Memory bandwidth heavily depends on CPU clock. !!!\n\n"); + printClock(); + printf("Press A to start bandwidth test, any other key to exit.\n\n"); + waitForKeyA(); + + bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", c_benchmarks); + printf(" ---\n"); + + bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", libc_benchmarks); + bench_info *bi = get_asm_benchmarks(); + if (bi->f) { + printf(" ---\n"); + bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi); + } + + free(poolbuf); + + printf("\nPress A to continue, any other key to exit.\n\n"); + waitForKeyA(); + consoleClear(); + + printf("\n"); + printf("==========================================================================\n"); + printf("== Memory latency test ==\n"); + printf("== ==\n"); + printf("== Average time is measured for random memory accesses in the buffers ==\n"); + printf("== of different sizes. The larger is the buffer, the more significant ==\n"); + printf("== are relative contributions of TLB, L1/L2 cache misses and SDRAM ==\n"); + printf("== accesses. For extremely large buffer sizes we are expecting to see ==\n"); + printf("== page table walk with several requests to SDRAM for almost every ==\n"); + printf("== memory access (though 64MiB is not nearly large enough to experience ==\n"); + printf("== this effect to its fullest). 
==\n"); + printf("== ==\n"); + printf("== Note 1: All the numbers are representing extra time, which needs to ==\n"); + printf("== be added to L1 cache latency. The cycle timings for L1 cache ==\n"); + printf("== latency can be usually found in the processor documentation. ==\n"); + printf("== Note 2: Dual random read means that we are simultaneously performing ==\n"); + printf("== two independent memory accesses at a time. In the case if ==\n"); + printf("== the memory subsystem can't handle multiple outstanding ==\n"); + printf("== requests, dual random read has the same timings as two ==\n"); + printf("== single reads performed one after another. ==\n"); + printf("==========================================================================\n\n"); + + consoleUpdate(NULL); + printClock(); + printf("Press A to start latency test, any other key to exit.\n\n"); + waitForKeyA(); + + if (!latency_bench(latbench_size, latbench_count, -1) || + !latency_bench(latbench_size, latbench_count, 1)) + { + latency_bench(latbench_size, latbench_count, 0); + } + + printf("\nPress any key to exit.\n"); + waitForKeyA(); + + // Deinitialize and clean up resources used by the console (important!) 
+ consoleExit(NULL); + return 0; +} \ No newline at end of file diff --git a/Source/sys-clk-OC/manager/Makefile b/Source/sys-clk-OC/manager/Makefile index 2a41992c..dd5bdd18 100644 --- a/Source/sys-clk-OC/manager/Makefile +++ b/Source/sys-clk-OC/manager/Makefile @@ -54,7 +54,7 @@ APP_RESOURCES := romfs:/ #--------------------------------------------------------------------------------- # version control constants #--------------------------------------------------------------------------------- -TARGET_VERSION := $(shell git describe --dirty --always --tags) +TARGET_VERSION := OC-$$(date +%Y/%m/%d) APP_VERSION := $(TARGET_VERSION) #--------------------------------------------------------------------------------- diff --git a/Source/sys-clk-OC/overlay/Makefile b/Source/sys-clk-OC/overlay/Makefile index d44a28bf..35a25761 100644 --- a/Source/sys-clk-OC/overlay/Makefile +++ b/Source/sys-clk-OC/overlay/Makefile @@ -32,7 +32,7 @@ NO_ICON := 1 #--------------------------------------------------------------------------------- # version control constants #--------------------------------------------------------------------------------- -TARGET_VERSION := $(shell git describe --dirty --always --tags) +TARGET_VERSION := OC-$$(date +%Y/%m/%d) APP_VERSION := $(TARGET_VERSION) #--------------------------------------------------------------------------------- diff --git a/Source/sys-clk-OC/sysmodule/Makefile b/Source/sys-clk-OC/sysmodule/Makefile index 27dc4fb5..af418d5f 100644 --- a/Source/sys-clk-OC/sysmodule/Makefile +++ b/Source/sys-clk-OC/sysmodule/Makefile @@ -30,7 +30,7 @@ LIBNAMES := minIni nxExt #--------------------------------------------------------------------------------- # version control constants #--------------------------------------------------------------------------------- -TARGET_VERSION := OC-$(shell git describe --dirty --always --tags) +TARGET_VERSION := OC-$$(date +%Y/%m/%d) 
#--------------------------------------------------------------------------------- # options for code generation diff --git a/Source/sys-clk-OC/sysmodule/src/clock_manager.cpp b/Source/sys-clk-OC/sysmodule/src/clock_manager.cpp index da6c6ee7..72b158bb 100644 --- a/Source/sys-clk-OC/sysmodule/src/clock_manager.cpp +++ b/Source/sys-clk-OC/sysmodule/src/clock_manager.cpp @@ -214,7 +214,7 @@ void ClockManager::Tick() { std::uint32_t hz = 0; std::uint32_t hzForceOverride = 0; - for (unsigned int module = 0; module < SysClkModule_EnumMax; module++) + for (unsigned int module = 0; module < SysClkModule_EnumMax - 1; module++) { hz = this->context->overrideFreqs[module]; @@ -456,6 +456,15 @@ bool ClockManager::RefreshContext() for (unsigned int module = 0; module < SysClkModule_EnumMax; module++) { hz = Clocks::GetCurrentHz((SysClkModule)module); + + // Skip MEM freq check + if (module == SysClkModule_MEM) + { + this->context->freqs[module] = hz; + break; + } + + // Round to MHz uint32_t cur_mhz = hz/1000'000; uint32_t be4_mhz = this->context->freqs[module]/1000'000; if (hz != 0 && cur_mhz != be4_mhz) @@ -475,6 +484,7 @@ bool ClockManager::RefreshContext() else { FileUtils::LogLine("[mgr] %s override disabled", Clocks::GetModuleName((SysClkModule)module, true)); + Clocks::ResetToStock(module); } this->context->overrideFreqs[module] = hz; hasChanged = true; diff --git a/Source/sys-clk-OC/sysmodule/src/clocks.cpp b/Source/sys-clk-OC/sysmodule/src/clocks.cpp index 4de486df..7897cccf 100644 --- a/Source/sys-clk-OC/sysmodule/src/clocks.cpp +++ b/Source/sys-clk-OC/sysmodule/src/clocks.cpp @@ -167,7 +167,7 @@ PcvModuleId Clocks::GetPcvModuleId(SysClkModule sysclkModule) return pcvModuleId; } -void Clocks::ResetToStock() +void Clocks::ResetToStock(unsigned int module) { Result rc = 0; if(hosversionAtLeast(9,0,0)) @@ -191,8 +191,14 @@ void Clocks::ResetToStock() ERROR_THROW("Unknown apm configuration: %x", confId); } - Clocks::SetHz(SysClkModule_CPU, apmConfiguration->cpu_hz); - 
Clocks::SetHz(SysClkModule_GPU, apmConfiguration->gpu_hz); + if (module == SysClkModule_EnumMax || module == SysClkModule_CPU) + { + Clocks::SetHz(SysClkModule_CPU, apmConfiguration->cpu_hz); + } + if (module == SysClkModule_EnumMax || module == SysClkModule_GPU) + { + Clocks::SetHz(SysClkModule_GPU, apmConfiguration->gpu_hz); + } // We don't need to set MEM freqs any more //Clocks::SetHz(SysClkModule_MEM, apmConfiguration->mem_hz); } diff --git a/Source/sys-clk-OC/sysmodule/src/clocks.h b/Source/sys-clk-OC/sysmodule/src/clocks.h index 5ff180bd..b6161621 100644 --- a/Source/sys-clk-OC/sysmodule/src/clocks.h +++ b/Source/sys-clk-OC/sysmodule/src/clocks.h @@ -19,7 +19,7 @@ class Clocks static bool isMariko; static void Exit(); static void Initialize(); - static void ResetToStock(); + static void ResetToStock(unsigned int module = SysClkModule_EnumMax); static SysClkProfile GetCurrentProfile(); static std::uint32_t GetCurrentHz(SysClkModule module); static void SetHz(SysClkModule module, std::uint32_t hz);