diff --git a/Source/sys-clk-OC/sysmodule/src/oc_extra.cpp b/Source/sys-clk-OC/sysmodule/src/oc_extra.cpp index ca1308e6..a26545d7 100644 --- a/Source/sys-clk-OC/sysmodule/src/oc_extra.cpp +++ b/Source/sys-clk-OC/sysmodule/src/oc_extra.cpp @@ -231,88 +231,11 @@ uint32_t Governor::s_FreqContext::GetNormalizedUtil(uint32_t raw_util) { void Governor::s_FreqContext::SetNextFreq(uint32_t norm_util) { uint32_t prev_hz = target_hz; - // === Add a non-linear coefficient to tipping-point === - // float nonlinear_coeff = (float)max_hz / target_hz; // Always non-negative - // #ifdef __aarch64__ - // asm ("FSQRT %s0, %s0" - // : "=w" (nonlinear_coeff) - // : "w" (nonlinear_coeff)); - // asm ("FSQRT %s0, %s0" - // : "=w" (nonlinear_coeff) - // : "w" (nonlinear_coeff)); - // #else - // nonlinear_coeff = sqrt(sqrt(nonlinear_coeff)); - // #endif - - // === Tipping-point look-up table for all frequencies === - // typedef struct { - // uint16_t numerator; - // uint16_t denom_shift; - // } lut_entry; - - // static constexpr auto apply_cpu_nonlinear_coeff = [](uint32_t input) { - // lut_entry lut[] = { - // { 4645, 12 }, // 1963500000 - // {}, - // { 4505, 12 }, // 2091000000 - // { 8971, 13 }, // 2193000000 - // { 1117, 10 }, // 2295000000 - // { 1117, 10 }, // 2397000000 - // {}, - // { 5575, 12 }, // 612000000 - // { 10699, 13 }, // 714000000 - // {}, - // { 81, 6 }, // 816000000 - // { 10113, 13 }, // 918000000 - // { 1239, 10 }, // 1020000000 - // {}, - // { 9749, 13 }, // 1122000000 - // { 4807, 12 }, // 1224000000 - // { 2375, 11 }, // 1326000000 - // { 10041, 13 }, // 1428000000 - // {}, - // { 9283, 13 }, // 1581000000 - // {}, - // { 9215, 13 }, // 1683000000 - // { 18309, 14 }, // 1785000000 - // }; - // size_t idx = (input >> 20) % 24; - // lut_entry entry = lut[idx]; - // return (input >> entry.denom_shift) * entry.numerator; - // }; - - // static constexpr auto apply_gpu_nonlinear_coeff = [](uint32_t input) { - // lut_entry lut[] = { - // { 1087, 10 }, // 1305600000 - // { 2351, 11 }, // 1075200000 - // { 9749, 13 }, // 844800000 - // { 81, 6 }, // 614400000 - // { 2949, 11 }, // 384000000 - // { 9, 2 }, // 153600000 - // {}, - // { 1089, 10 }, // 1228800000 - // { 2375, 11 }, // 998400000 - // { 1239, 10 }, // 768000000 - // { 10699, 13 }, // 537600000 - // { 25, 4 }, // 307200000 - // { 4, 0 }, // 76800000 - // { 1087, 10 }, // 1267200000 - // { 1165, 10 }, // 1152000000 - // { 4807, 12 }, // 921600000 - // { 10113, 13 }, // 691200000 - // { 5575, 12 }, // 460800000 - // }; - // size_t idx = (input >> 18) % 20; - // lut_entry entry = lut[idx]; - // return (input >> entry.denom_shift) * entry.numerator; - // }; - auto FindHzInTable = [](uint32_t* hz_list, uint32_t in_hz) { uint32_t* p = hz_list; - while (*p) { + for (; *p != 0; p++) { if (in_hz <= *p) return p; - p++; } return (--p); }; @@ -359,6 +282,7 @@ void Governor::CpuUtilWorker(void* args) { continue; } + // Check if other cores are stuck for (int id = 0; id < CORE_NUMS; id++) { if (id == coreid) continue; @@ -385,24 +309,25 @@ void Governor::Main(void* args) { s_FreqContext* gpu_ctx = &self->m_gpu_freq; uint32_t nvgpu_field = self->m_nvgpu_field; - s_Util cpu_util, gpu_util; + s_CpuUtil *cpu_util = new s_CpuUtil; + s_GpuUtil *gpu_util = new s_GpuUtil; auto SetCpuFreq = [self, cpu_ctx, cpu_util]() mutable { uint32_t util = self->m_cpu_core_ctx[0].util; for (size_t i = 1; i < CORE_NUMS; i++) { if (util < self->m_cpu_core_ctx[i].util) util = self->m_cpu_core_ctx[i].util; } - cpu_util.Update(util); + cpu_util->Update(util); if (self->m_cpu_core_ctx[SYS_CORE_ID].util > BOOST_THRESHOLD && self->m_syscore_autoboost) cpu_ctx->Boost(); else - cpu_ctx->SetNextFreq(cpu_util.Get()); + cpu_ctx->SetNextFreq(cpu_util->Get()); }; auto SetGpuFreq = [gpu_ctx, nvgpu_field, gpu_util]() mutable { uint32_t util = gpu_ctx->GetNormalizedUtil(GpuCoreUtil(nvgpu_field).Get()); - gpu_util.Update(util); - util = gpu_util.Get(); + gpu_util->Update(util); + util = gpu_util->Get(); gpu_ctx->SetNextFreq(util); }; @@ -442,5 +367,8 @@ void Governor::Main(void* args) { svcSleepThread(TICK_TIME_NS); } + + delete cpu_util; + delete gpu_util; } diff --git a/Source/sys-clk-OC/sysmodule/src/oc_extra.h b/Source/sys-clk-OC/sysmodule/src/oc_extra.h index a3af2ec2..ae4f38d3 100644 --- a/Source/sys-clk-OC/sysmodule/src/oc_extra.h +++ b/Source/sys-clk-OC/sysmodule/src/oc_extra.h @@ -117,17 +117,17 @@ protected: // PELT: https://github.com/torvalds/linux/blob/master/kernel/sched/pelt.c // Util_acc_n = Util_0 + Util_1 * D + Util_2 * D^2 + ... + Util_n * D^n // To approximate D (decay multiplier): - // After 100 ms (if SAMPLE_RATE == 200, 20 samples) - // (UTIL_MAX * D)^20 ≈ 1 (UTIL_MAX decayed to 1) - // D = 0.707946... ≈ 5799 / 8192 (epsilon < 0.0001) - // Util_acc_20 ≈ 3419, Util_acc_40 ≈ 3420, Util_acc_inf ≈ 3420 + // After 50 ms (if SAMPLE_RATE == 200, 10 samples) + // UTIL_MAX * D^10 ≈ 1 (UTIL_MAX decayed to 1) + // D = 4129 / 8192 + // Util_acc_max = Util_acc_inf = 2012 static constexpr uint32_t UTIL_MAX = 100'0; - struct s_Util { + struct s_CpuUtil { uint32_t util_acc = 0; - static constexpr uint32_t DECAY_DIVIDENT = 5799; + static constexpr uint32_t DECAY_DIVIDENT = 4129; static constexpr uint32_t DECAY_DIVISOR = 8192; - static constexpr uint32_t UTIL_ACC_MAX = 3420; + static constexpr uint32_t UTIL_ACC_MAX = 2012; uint32_t Get() { return (util_acc * UTIL_MAX / UTIL_ACC_MAX); }; void Update(uint32_t util) { util_acc = util_acc * DECAY_DIVIDENT / DECAY_DIVISOR + util; }; @@ -136,9 +136,8 @@ protected: static void CpuUtilWorker(void* args); static void Main(void* args); - // Get max from a sliding window in O(1) - static constexpr size_t WINDOW_SIZE = SAMPLE_RATE / 10; - template + // Get max value from a sliding window in O(1) + template class SWindowMax { protected: typedef struct { @@ -148,7 +147,7 @@ protected: struct s_Stack { s_Entry m_stack[WINDOW_SIZE] = {}; - size_t m_next = 0; + size_t m_next = WINDOW_SIZE; bool empty() { return m_next == 0; }; s_Entry top() { return m_stack[m_next-1]; }; @@ -164,26 +163,27 @@ protected: s_Stack deqStack; void Push(s_Stack& stack, T item) { - s_Entry n; - n.item = item; - n.max = enqStack.empty() ? item : std::max(item, enqStack.top().max); + s_Entry n = { + .item = item, + .max = enqStack.empty() ? item : std::max(item, enqStack.top().max) + }; stack.push(n); } - void Pop() { + T Pop() { if (deqStack.empty()) { while (!enqStack.empty()) Push(deqStack, enqStack.pop().max); } - deqStack.pop(); + return deqStack.pop().item; } public: - SWindowMax() { deqStack.m_next = WINDOW_SIZE; } + SWindowMax() {} void Add(T item) { Pop(); Push(enqStack, item); } - T Max() { + T Get() { if (!enqStack.empty()) { T enqMax = enqStack.top().max; if (!deqStack.empty()) { @@ -197,4 +197,42 @@ protected: return 0; } }; + + // Get average value from a sliding window in O(1) + template + class SWindowAvg { + public: + SWindowAvg() {} + + void Add(T item) { + T pop = m_queue[m_next]; + m_queue[m_next] = item; + m_next = (m_next + 1) % WINDOW_SIZE; + m_sum -= pop; + m_sum += item; + } + + T Get() { return m_sum / WINDOW_SIZE; } + + protected: + size_t m_next = 0; + T m_sum = 0; + T m_queue[WINDOW_SIZE] = {}; + }; + + struct s_GpuUtil { + SWindowMax window {}; + + uint32_t util_acc = 0; + // After 160 ms (if SAMPLE_RATE == 200, 32 samples) + // UTIL_MAX * D^32 ≈ 1 (UTIL_MAX decayed to 1) + // D = 6880 / 8192 + // Util_acc_max = Util_acc_inf = 6145 + static constexpr uint32_t DECAY_DIVIDENT = 6880; + static constexpr uint32_t DECAY_DIVISOR = 8192; + static constexpr uint32_t UTIL_ACC_MAX = 6145; + + uint32_t Get() { return ((util_acc * UTIL_MAX / UTIL_ACC_MAX) + window.Get()) / 2; }; + void Update(uint32_t util) { window.Add(util); util_acc = util_acc * DECAY_DIVIDENT / DECAY_DIVISOR + util; }; + }; };