[sys-clk-OC] Governor: Improve utility responsiveness
This commit is contained in:
@@ -231,88 +231,11 @@ uint32_t Governor::s_FreqContext::GetNormalizedUtil(uint32_t raw_util) {
|
||||
void Governor::s_FreqContext::SetNextFreq(uint32_t norm_util) {
|
||||
uint32_t prev_hz = target_hz;
|
||||
|
||||
// === Add a non-linear coefficient to tipping-point ===
|
||||
// float nonlinear_coeff = (float)max_hz / target_hz; // Always non-negative
|
||||
// #ifdef __aarch64__
|
||||
// asm ("FSQRT %s0, %s0"
|
||||
// : "=w" (nonlinear_coeff)
|
||||
// : "w" (nonlinear_coeff));
|
||||
// asm ("FSQRT %s0, %s0"
|
||||
// : "=w" (nonlinear_coeff)
|
||||
// : "w" (nonlinear_coeff));
|
||||
// #else
|
||||
// nonlinear_coeff = sqrt(sqrt(nonlinear_coeff));
|
||||
// #endif
|
||||
|
||||
// === Tipping-point look-up table for all frequencies ===
|
||||
// typedef struct {
|
||||
// uint16_t numerator;
|
||||
// uint16_t denom_shift;
|
||||
// } lut_entry;
|
||||
|
||||
// static constexpr auto apply_cpu_nonlinear_coeff = [](uint32_t input) {
|
||||
// lut_entry lut[] = {
|
||||
// { 4645, 12 }, // 1963500000
|
||||
// {},
|
||||
// { 4505, 12 }, // 2091000000
|
||||
// { 8971, 13 }, // 2193000000
|
||||
// { 1117, 10 }, // 2295000000
|
||||
// { 1117, 10 }, // 2397000000
|
||||
// {},
|
||||
// { 5575, 12 }, // 612000000
|
||||
// { 10699, 13 }, // 714000000
|
||||
// {},
|
||||
// { 81, 6 }, // 816000000
|
||||
// { 10113, 13 }, // 918000000
|
||||
// { 1239, 10 }, // 1020000000
|
||||
// {},
|
||||
// { 9749, 13 }, // 1122000000
|
||||
// { 4807, 12 }, // 1224000000
|
||||
// { 2375, 11 }, // 1326000000
|
||||
// { 10041, 13 }, // 1428000000
|
||||
// {},
|
||||
// { 9283, 13 }, // 1581000000
|
||||
// {},
|
||||
// { 9215, 13 }, // 1683000000
|
||||
// { 18309, 14 }, // 1785000000
|
||||
// };
|
||||
// size_t idx = (input >> 20) % 24;
|
||||
// lut_entry entry = lut[idx];
|
||||
// return (input >> entry.denom_shift) * entry.numerator;
|
||||
// };
|
||||
|
||||
// static constexpr auto apply_gpu_nonlinear_coeff = [](uint32_t input) {
|
||||
// lut_entry lut[] = {
|
||||
// { 1087, 10 }, // 1305600000
|
||||
// { 2351, 11 }, // 1075200000
|
||||
// { 9749, 13 }, // 844800000
|
||||
// { 81, 6 }, // 614400000
|
||||
// { 2949, 11 }, // 384000000
|
||||
// { 9, 2 }, // 153600000
|
||||
// {},
|
||||
// { 1089, 10 }, // 1228800000
|
||||
// { 2375, 11 }, // 998400000
|
||||
// { 1239, 10 }, // 768000000
|
||||
// { 10699, 13 }, // 537600000
|
||||
// { 25, 4 }, // 307200000
|
||||
// { 4, 0 }, // 76800000
|
||||
// { 1087, 10 }, // 1267200000
|
||||
// { 1165, 10 }, // 1152000000
|
||||
// { 4807, 12 }, // 921600000
|
||||
// { 10113, 13 }, // 691200000
|
||||
// { 5575, 12 }, // 460800000
|
||||
// };
|
||||
// size_t idx = (input >> 18) % 20;
|
||||
// lut_entry entry = lut[idx];
|
||||
// return (input >> entry.denom_shift) * entry.numerator;
|
||||
// };
|
||||
|
||||
// Locate the first entry in a zero-terminated frequency table that is
// greater than or equal to `in_hz`.
//   hz_list: pointer to the first entry; the table ends at the first 0 entry.
//   in_hz:   frequency to look up.
// Returns a pointer to the matching entry. If every entry is smaller than
// `in_hz`, the last non-zero entry is returned. If the table is empty (its
// first entry is 0), the terminator itself is returned instead of stepping
// before the start of the array.
auto FindHzInTable = [](uint32_t* hz_list, uint32_t in_hz) {
    uint32_t* p = hz_list;
    for (; *p != 0; p++) {
        if (in_hz <= *p)
            return p;
    }
    // p now points at the terminator; step back to the last real entry
    // unless there is none.
    return (p == hz_list) ? p : p - 1;
};
|
||||
@@ -359,6 +282,7 @@ void Governor::CpuUtilWorker(void* args) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if other cores are stuck
|
||||
for (int id = 0; id < CORE_NUMS; id++) {
|
||||
if (id == coreid)
|
||||
continue;
|
||||
@@ -385,24 +309,25 @@ void Governor::Main(void* args) {
|
||||
s_FreqContext* gpu_ctx = &self->m_gpu_freq;
|
||||
uint32_t nvgpu_field = self->m_nvgpu_field;
|
||||
|
||||
s_Util cpu_util, gpu_util;
|
||||
s_CpuUtil *cpu_util = new s_CpuUtil;
|
||||
s_GpuUtil *gpu_util = new s_GpuUtil;
|
||||
auto SetCpuFreq = [self, cpu_ctx, cpu_util]() mutable {
|
||||
uint32_t util = self->m_cpu_core_ctx[0].util;
|
||||
for (size_t i = 1; i < CORE_NUMS; i++) {
|
||||
if (util < self->m_cpu_core_ctx[i].util)
|
||||
util = self->m_cpu_core_ctx[i].util;
|
||||
}
|
||||
cpu_util.Update(util);
|
||||
cpu_util->Update(util);
|
||||
if (self->m_cpu_core_ctx[SYS_CORE_ID].util > BOOST_THRESHOLD && self->m_syscore_autoboost)
|
||||
cpu_ctx->Boost();
|
||||
else
|
||||
cpu_ctx->SetNextFreq(cpu_util.Get());
|
||||
cpu_ctx->SetNextFreq(cpu_util->Get());
|
||||
};
|
||||
|
||||
auto SetGpuFreq = [gpu_ctx, nvgpu_field, gpu_util]() mutable {
|
||||
uint32_t util = gpu_ctx->GetNormalizedUtil(GpuCoreUtil(nvgpu_field).Get());
|
||||
gpu_util.Update(util);
|
||||
util = gpu_util.Get();
|
||||
gpu_util->Update(util);
|
||||
util = gpu_util->Get();
|
||||
gpu_ctx->SetNextFreq(util);
|
||||
};
|
||||
|
||||
@@ -442,5 +367,8 @@ void Governor::Main(void* args) {
|
||||
|
||||
svcSleepThread(TICK_TIME_NS);
|
||||
}
|
||||
|
||||
delete cpu_util;
|
||||
delete gpu_util;
|
||||
}
|
||||
|
||||
|
||||
@@ -117,17 +117,17 @@ protected:
|
||||
// PELT: https://github.com/torvalds/linux/blob/master/kernel/sched/pelt.c
|
||||
// Util_acc_n = Util_0 + Util_1 * D + Util_2 * D^2 + ... + Util_n * D^n
|
||||
// To approximate D (decay multiplier):
|
||||
// After 100 ms (if SAMPLE_RATE == 200, 20 samples)
|
||||
// (UTIL_MAX * D)^20 ≈ 1 (UTIL_MAX decayed to 1)
|
||||
// D = 0.707946... ≈ 5799 / 8192 (epsilon < 0.0001)
|
||||
// Util_acc_20 ≈ 3419, Util_acc_40 ≈ 3420, Util_acc_inf ≈ 3420
|
||||
// After 50 ms (if SAMPLE_RATE == 200, 10 samples)
|
||||
// UTIL_MAX * D^10 ≈ 1 (UTIL_MAX decayed to 1)
|
||||
// D = 4129 / 8192
|
||||
// Util_acc_max = Util_acc_inf = 2012
|
||||
static constexpr uint32_t UTIL_MAX = 100'0;
|
||||
struct s_Util {
|
||||
struct s_CpuUtil {
|
||||
uint32_t util_acc = 0;
|
||||
|
||||
static constexpr uint32_t DECAY_DIVIDENT = 5799;
|
||||
static constexpr uint32_t DECAY_DIVIDENT = 4129;
|
||||
static constexpr uint32_t DECAY_DIVISOR = 8192;
|
||||
static constexpr uint32_t UTIL_ACC_MAX = 3420;
|
||||
static constexpr uint32_t UTIL_ACC_MAX = 2012;
|
||||
|
||||
uint32_t Get() { return (util_acc * UTIL_MAX / UTIL_ACC_MAX); };
|
||||
void Update(uint32_t util) { util_acc = util_acc * DECAY_DIVIDENT / DECAY_DIVISOR + util; };
|
||||
@@ -136,9 +136,8 @@ protected:
|
||||
static void CpuUtilWorker(void* args);
|
||||
static void Main(void* args);
|
||||
|
||||
// Get max from a sliding window in O(1)
|
||||
static constexpr size_t WINDOW_SIZE = SAMPLE_RATE / 10;
|
||||
template <typename T>
|
||||
// Get max value from a sliding window in O(1)
|
||||
template <typename T, size_t WINDOW_SIZE>
|
||||
class SWindowMax {
|
||||
protected:
|
||||
typedef struct {
|
||||
@@ -148,7 +147,7 @@ protected:
|
||||
|
||||
struct s_Stack {
|
||||
s_Entry m_stack[WINDOW_SIZE] = {};
|
||||
size_t m_next = 0;
|
||||
size_t m_next = WINDOW_SIZE;
|
||||
|
||||
bool empty() { return m_next == 0; };
|
||||
s_Entry top() { return m_stack[m_next-1]; };
|
||||
@@ -164,26 +163,27 @@ protected:
|
||||
s_Stack deqStack;
|
||||
|
||||
void Push(s_Stack& stack, T item) {
|
||||
s_Entry n;
|
||||
n.item = item;
|
||||
n.max = enqStack.empty() ? item : std::max(item, enqStack.top().max);
|
||||
s_Entry n = {
|
||||
.item = item,
|
||||
.max = enqStack.empty() ? item : std::max(item, enqStack.top().max)
|
||||
};
|
||||
stack.push(n);
|
||||
}
|
||||
|
||||
void Pop() {
|
||||
T Pop() {
|
||||
if (deqStack.empty()) {
|
||||
while (!enqStack.empty())
|
||||
Push(deqStack, enqStack.pop().max);
|
||||
}
|
||||
deqStack.pop();
|
||||
return deqStack.pop().item;
|
||||
}
|
||||
|
||||
public:
|
||||
SWindowMax() { deqStack.m_next = WINDOW_SIZE; }
|
||||
SWindowMax() {}
|
||||
|
||||
void Add(T item) { Pop(); Push(enqStack, item); }
|
||||
|
||||
T Max() {
|
||||
T Get() {
|
||||
if (!enqStack.empty()) {
|
||||
T enqMax = enqStack.top().max;
|
||||
if (!deqStack.empty()) {
|
||||
@@ -197,4 +197,42 @@ protected:
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Sliding-window average in O(1) per sample.
// Keeps the last WINDOW_SIZE samples in a ring buffer together with their
// running sum. The buffer starts zero-filled, so until WINDOW_SIZE samples
// have been added the average is taken against the remaining zeros.
//   T:           arithmetic sample type (also used for the running sum).
//   WINDOW_SIZE: number of samples in the window; must be non-zero.
template <typename T, size_t WINDOW_SIZE>
class SWindowAvg {
    // A zero-sized window would mean a zero-length array and a modulo /
    // division by zero below.
    static_assert(WINDOW_SIZE > 0, "SWindowAvg requires a non-empty window");

public:
    SWindowAvg() {}

    // Replace the oldest sample with `item` and update the running sum.
    void Add(T item) {
        T evicted = m_queue[m_next];
        m_queue[m_next] = item;
        m_next = (m_next + 1) % WINDOW_SIZE;
        m_sum -= evicted;
        m_sum += item;
    }

    // Average of the current window contents (integer division for integral T).
    T Get() const { return m_sum / WINDOW_SIZE; }

protected:
    size_t m_next = 0;            // Slot holding the oldest sample.
    T m_sum = 0;                  // Sum of every sample in m_queue.
    T m_queue[WINDOW_SIZE] = {};  // Ring buffer of samples.
};
|
||||
|
||||
struct s_GpuUtil {
|
||||
SWindowMax<uint32_t, 32> window {};
|
||||
|
||||
uint32_t util_acc = 0;
|
||||
// After 160 ms (if SAMPLE_RATE == 200, 32 samples)
|
||||
// UTIL_MAX * D^32 ≈ 1 (UTIL_MAX decayed to 1)
|
||||
// D = 6880 / 8192
|
||||
// Util_acc_max = Util_acc_inf = 6145
|
||||
static constexpr uint32_t DECAY_DIVIDENT = 6880;
|
||||
static constexpr uint32_t DECAY_DIVISOR = 8192;
|
||||
static constexpr uint32_t UTIL_ACC_MAX = 6145;
|
||||
|
||||
uint32_t Get() { return ((util_acc * UTIL_MAX / UTIL_ACC_MAX) + window.Get()) / 2; };
|
||||
void Update(uint32_t util) { window.Add(util); util_acc = util_acc * DECAY_DIVIDENT / DECAY_DIVISOR + util; };
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user