Schedutil-like governor with proper load_avg calculation; Fixed #36

This commit is contained in:
KazushiM
2022-10-27 01:41:38 +08:00
parent 0f6fb06e53
commit 851839be0a
7 changed files with 230 additions and 282 deletions

View File

@@ -13,57 +13,31 @@
class CpuCoreUtil {
public:
CpuCoreUtil (int coreid = -2, uint64_t ns = 1000'000ULL):
m_core_id(coreid), m_wait_time_ns(ns) {};
inline uint64_t Get() { Start(); WaitForStop(); Stop(); return Calculate(); };
inline void Start() { m_idletick = GetIdleTickCount(); };
inline void WaitForStop() { svcSleepThread(m_wait_time_ns); };
inline void Stop() { m_idletick = GetIdleTickCount() - m_idletick; };
static constexpr uint64_t TICKS_PER_MS = 192;
inline uint64_t Calculate() { return 100'0 - m_idletick * 10 * 1000'000ULL / (TICKS_PER_MS * m_wait_time_ns); };
CpuCoreUtil (int coreid, uint64_t ns);
uint32_t Get();
protected:
const int m_core_id;
const uint64_t m_wait_time_ns;
uint64_t m_idletick;
static constexpr uint64_t TICKS_PER_MS = 192;
static constexpr uint32_t UTIL_MAX = 100'0;
inline uint64_t GetIdleTickCount() {
uint64_t idletick = 0;
svcGetInfo(&idletick, InfoType_IdleTickCount, INVALID_HANDLE, m_core_id);
return idletick;
};
uint64_t GetIdleTickCount();
};
class GpuCoreUtil {
public:
GpuCoreUtil (uint32_t nvgpu_field, uint64_t ns = 1000'000ULL):
m_nvgpu_field(nvgpu_field), m_wait_time_ns(ns) {};
inline uint64_t Get() { Wait(); return GetLoad(); };
inline void Wait() { svcSleepThread(m_wait_time_ns); };
inline uint32_t GetLoad() {
uint32_t load;
nvIoctl(m_nvgpu_field, NVGPU_GPU_IOCTL_PMU_GET_GPU_LOAD, &load);
// if (R_FAILED(rc)) {
// ERROR_THROW("[mgr] nvIoctl() failed: 0x%lX", rc);
// }
return load;
};
GpuCoreUtil (uint32_t nvgpu_field);
uint32_t Get();
protected:
uint32_t m_nvgpu_field;
const uint64_t m_wait_time_ns;
static constexpr uint64_t NVGPU_GPU_IOCTL_PMU_GET_GPU_LOAD = 0x80044715;
};
class ReverseNXSync {
public:
ReverseNXSync ()
: m_rt_mode(ReverseNX_NotFound), m_tool_mode(ReverseNX_NotFound) {
CheckToolEnabled();
};
ReverseNXSync ();
void ToggleSync(bool enable) { m_sync_enabled = enable; };
void Reset(uint64_t app_id) { m_app_id = app_id; SetRTMode(ReverseNX_NotFound); GetToolMode(); }
@@ -79,7 +53,6 @@ protected:
bool m_tool_enabled;
bool m_sync_enabled;
bool CheckToolEnabled();
ReverseNXMode GetToolModeFromPatch(const char* patch_path);
ReverseNXMode RecheckToolMode();
};
@@ -90,57 +63,23 @@ namespace PsmExt {
class Governor {
public:
Governor() {
memset(reinterpret_cast<void*>(&m_cpu_freq), 0, sizeof(m_cpu_freq));
memset(reinterpret_cast<void*>(&m_gpu_freq), 0, sizeof(m_gpu_freq));
m_cpu_freq.module = SysClkModule_CPU;
m_gpu_freq.module = SysClkModule_GPU;
m_cpu_freq.hz_list = &sysclk_g_freq_table_cpu_hz[0];
m_gpu_freq.hz_list = &sysclk_g_freq_table_gpu_hz[0];
m_cpu_freq.idx_boost_hz = FindIndex(&m_cpu_freq, 1785'000'000);
m_gpu_freq.idx_boost_hz = FindIndex(&m_gpu_freq, 76'800'000);
m_gpu_freq.idx_min_hz = FindIndex(&m_gpu_freq, 153'600'000);
nvInitialize();
Result rc = nvOpen(&m_nvgpu_field, "/dev/nvhost-ctrl-gpu");
if (R_FAILED(rc)) {
ERROR_THROW("[mgr] nvOpen() failed: 0x%lX", rc);
nvExit();
}
};
~Governor() {
Stop();
nvClose(m_nvgpu_field);
nvExit();
};
Governor();
~Governor();
void Start();
void Stop();
void SetMaxHz(uint32_t max_hz, SysClkModule module);
void SetCPUBoostHz(uint32_t hz) { m_cpu_freq.idx_boost_hz = FindIndex(&m_cpu_freq, hz); };
void SetCPUBoostHz(uint32_t hz) { m_cpu_freq.boost_hz = hz; };
void SetPerfConf(uint32_t id);
protected:
// Parameters for sampling
static constexpr uint64_t SAMPLE_RATE = 200;
static constexpr uint64_t TICK_TIME_MS = 1000 / SAMPLE_RATE;
static constexpr uint64_t TICK_TIME_NS = 1000'000'000 / SAMPLE_RATE;
// Parameters for frequency ramp threshold
static constexpr uint64_t CPU_THR_RAMP_DOWN = 70'0;
static constexpr uint64_t CPU_THR_RAMP_UP = 90'0;
static constexpr uint64_t GPU_THR_RAMP_DOWN = 70'0;
static constexpr uint64_t GPU_THR_RAMP_UP = 80'0;
static constexpr uint64_t GPU_THR_RAMP_MAX = 90'0;
static constexpr int CORE_NUMS = 4;
bool m_stop_threads = false;
bool m_running = false;
Thread m_t_cpuworker[CORE_NUMS], m_t_main;
std::atomic<uint64_t> m_core3_stuck_cnt = 0;
@@ -149,62 +88,52 @@ protected:
uint32_t m_perf_conf_id;
SysClkApmConfiguration *m_apm_conf;
typedef enum {
RAMP_UP,
RAMP_DOWN,
RAMP_MAX,
RAMP_MIN,
RAMP_BOOST,
} FREQ_RAMP_DIRECTION;
typedef struct {
SysClkModule module;
uint32_t* hz_list;
uint8_t idx_target_hz;
uint8_t idx_min_hz;
uint8_t idx_max_hz;
uint8_t idx_boost_hz;
} s_Freq;
s_Freq m_cpu_freq, m_gpu_freq;
uint32_t target_hz;
uint32_t min_hz;
uint32_t max_hz;
uint32_t boost_hz;
uint32_t utilref_hz;
static uint32_t FindIndex(s_Freq* f, uint32_t hz);
static bool TargetRamp(s_Freq* f, FREQ_RAMP_DIRECTION dir);
static void SetHz(s_Freq* f);
static void SetBoostHz(s_Freq* f);
uint32_t GetNormalizedUtil(uint32_t raw_util);
void SetNextFreq(uint32_t norm_util);
void SetHz();
void SetBoostHz();
} s_FreqContext;
s_FreqContext m_cpu_freq, m_gpu_freq;
typedef struct {
Governor* self;
int64_t id;
uint64_t util;
int id;
uint32_t util;
} s_CoreContext;
s_CoreContext m_cpu_core_ctx[CORE_NUMS];
s_CoreContext* InitCoreContext(s_CoreContext* context, Governor* self, int64_t id = 0);
static void CheckCpuUtilWorker(void* args);
static void Main(void* args);
// PELT: https://github.com/torvalds/linux/blob/master/kernel/sched/pelt.c
// Util_acc_n = Util_0 + Util_1 * D + Util_2 * D^2 + ... + Util_n * D^n
// To approximate D (decay multiplier):
// After 100 ms (if SAMPLE_RATE == 200, 20 samples)
// (UTIL_MAX * D)^20 ≈ 1 (UTIL_MAX decayed to 1)
// D = 0.7079457843841379... ≈ 725 / 1024
// Util_acc_20 ≈ 3421, Util_acc_40 ≈ 3424, Util_acc_inf ≈ 3424
static constexpr uint32_t UTIL_MAX = 100'0;
struct s_Util {
uint32_t util_acc = 0;
private:
static constexpr size_t QUEUE_SIZE = 8;
template <typename T>
struct s_Queue {
// Much faster than <queue> from stl
T queue[QUEUE_SIZE] = { 0 };
T sum = 0;
size_t pos = 0;
static constexpr uint32_t DECAY_DIVIDENT = 725;
static constexpr uint32_t DECAY_DIVISOR = 1024;
static constexpr uint32_t UTIL_ACC_MAX = 3424;
T GetAvg() { return sum / QUEUE_SIZE; };
T GetFirst() { return queue[pos % QUEUE_SIZE]; };
T GetLast() { return queue[(pos - 1) % QUEUE_SIZE]; };
T PopAndPush(T val_to_push) {
T val_to_pop;
sum -= (val_to_pop = GetFirst()); // Pop and subtract from sum
sum += (queue[pos % QUEUE_SIZE] = val_to_push); // Push and add to sum
pos++;
return val_to_pop;
}
uint32_t Get() { return (util_acc * UTIL_MAX / UTIL_ACC_MAX); };
void Update(uint32_t util) { util_acc = util_acc * DECAY_DIVIDENT / DECAY_DIVISOR + util; };
};
void CheckCpuUtilWorkerSysCore();
void CheckCpuUtilWorkerAppCore(int64_t coreid);
static void CheckCpuUtilWorker(void* args);
void CheckCpuUtilWorkerAppCore(int64_t coreid);
void CheckCpuUtilWorkerSysCore();
static void Main(void* args);
};