The current PID controller in the Linux Kernel currently makes a rough estimation for the PID coefficients (which we will call SE, for Standard Estimation). We want to investigate whether or not these can be improved by using an alternative method to compute these coefficients (which we will call ZN, for Ziegler-Nichols).
The Ziegler-Nichols Heuristic was chosen due to its inherent simplicity and low computational overhead. It works by initially setting the I and D terms of the PID controller to 0 and steadily increasing the P-term. It simultaneously monitors the temperature with the end goal being a stable set of Oscillations between over and under-shooting. Once this has been achieved it calculates the I and D terms based on the final value of the P term and the period of oscillations, and the P term is scaled down. More details on this can be found here: https://en.wikipedia.org/wiki/Ziegler%E2%80%93Nichols_method
from lisa.platforms.platinfo import PlatformInfo
from lisa.trace import Trace
import os
import numpy as np
import json
import pandas as pd
import holoviews as hv
hv.extension('bokeh')
ZN_geekbench = []
SE_geekbench = []
SW_geekbench = []
ZN_gfxbench = []
SE_gfxbench = []
SW_gfxbench = []
trace_home_dir = "/path/to/traces/"
platinfo = trace_home_dir + "platinfo.yaml"
SW = trace_home_dir + "SW/"
SE = trace_home_dir + "SE/"
ZN = trace_home_dir + "ZN/"
geekbench = "wk1-geekbench-1"
gfxbench = "wk2-gfxbench-corporate-1"
events = [
'thermal_power_allocator_pid',
'thermal_power_cpu_limit',
'cdev_update',
'thermal_temperature',
'cpu_frequency',
]
hikey_folders = []
for i in range(0, 10):
dir_name = "hikey_data.{}/".format(i)
hikey_folders.append(dir_name)
allocator_to_traces = {
("ZN", geekbench) : ZN_geekbench,
("SE", geekbench) : SE_geekbench,
("SW", geekbench) : SW_geekbench,
("ZN", gfxbench) : ZN_gfxbench,
("SE", gfxbench) : SE_gfxbench,
("SW", gfxbench) : SW_gfxbench
}
allocator_to_dir = {"ZN" : ZN,
"SE" : SE,
"SW": SW,
}
def get_score(results, score):
with open(results) as json_file:
data = json.load(json_file)
for p in data["metrics"]:
if p["name"] == score:
return p["value"]
def get_single_core_score(results):
return get_score(results, "score")
def get_multi_core_score(results):
return get_score(results, "multicore_score")
def get_time(results):
return get_score(results, "execution_time")
def get_1080_manhattan(results):
return get_score(results, "1080p Manhattan 3.1 Offscreen")
def get_1440_manhattan(results):
return get_score(results, "1440p Manhattan 3.1.1 Offscreen")
def get_tessellation(results):
return get_score(results, "Tessellation")
def get_1080_tessellation(results):
return get_score(results, "1080p Tessellation Offscreen")
def get_file(path, file):
return path +"/"+ file
def get_trace_file(allocator, run, workload):
path = allocator + run + workload
return Trace(get_file(path, "trace.dat"), PlatformInfo.from_yaml_map(platinfo), events = events, normalize_time=True)
def get_results_file(allocator, run, workload):
path = allocator + run + workload
return get_file(path, "result.json")
def get_average_geekbench_scores(allocator):
average_single_core = 0
average_multi_core = 0
average_time = 0
for run in hikey_folders:
results = get_results_file(allocator, run, geekbench)
average_single_core += get_single_core_score(results)
average_multi_core += get_multi_core_score(results)
average_time += get_time(results)
average_single_core /= len(hikey_folders)
average_multi_core /= len(hikey_folders)
average_time /= len(hikey_folders)
return average_single_core, average_multi_core, average_time
def get_average_gfxbench_scores(allocator):
average_1080_manhattan = 0
average_1440_manhattan = 0
average_tessellation = 0
average_1080_tessellation = 0
average_time = 0
for run in hikey_folders:
results = get_results_file(allocator, run, gfxbench)
average_1080_manhattan += get_1080_manhattan(results)
average_1440_manhattan += get_1440_manhattan(results)
average_tessellation += get_tessellation(results)
average_1080_tessellation += get_1080_tessellation(results)
average_time += get_time(results)
average_1080_manhattan /= len(hikey_folders)
average_1440_manhattan /= len(hikey_folders)
average_tessellation /= len(hikey_folders)
average_1080_tessellation /= len(hikey_folders)
return average_1080_manhattan, average_1440_manhattan, average_tessellation, average_1080_tessellation, average_time
def return_averages(df):
return df.min(), df.max(), df.mean(), df.std()
def get_temperature(traces):
dfs = []
for trace in traces:
dfs.append(trace.ana.thermal.df_thermal_zones_temperature())
all_data = pd.concat(dfs)
return all_data['temp']
def get_cpus_frequency(traces):
dfs = []
for trace in traces:
dfs.append(trace.ana.frequency.df_cpus_frequency())
all_data = pd.concat(dfs)
return all_data['frequency']
def get_average_cpu_frequency(traces):
freqs_0 = []
freqs_4 = []
for trace in traces:
freqs_0.append(trace.ana.frequency.get_average_cpu_frequency(0))
freqs_4.append(trace.ana.frequency.get_average_cpu_frequency(4))
return np.mean(freqs_0), np.mean(freqs_4)
def get_average_cdev_state(traces):
dfs = []
for trace in traces:
dfs.append(trace.ana.thermal.df_cpufreq_cooling_state())
all_data = pd.concat(dfs)
cdev = all_data['cdev_state']
return (return_averages(cdev))
def plot_cooling_states(traces, cpu):
dfs = []
for trace in traces:
dfs.append(trace.ana.thermal.df_cpufreq_cooling_state())
return traces[0].ana.thermal.plot_cpu_cooling_states(cpu)#,output='ui', link_dataframes=dfs)
def get_wl_traces(allocator, workload):
traces = []
if allocator_to_traces[(allocator, workload)] != []: return allocator_to_traces[(allocator, workload)]
for run in hikey_folders:
trace = get_trace_file(allocator_to_dir[allocator], run, workload)
traces.append(trace)
allocator_to_traces[(allocator, workload)] = traces
return allocator_to_traces[(allocator, workload)]
def get_average_geekbench_temperature(allocator):
return return_averages(get_temperature(get_wl_traces(allocator, geekbench)))
def get_average_geekbench_cpu_frequency(allocator):
return return_averages(get_cpus_frequency(get_wl_traces(allocator, geekbench)))
def get_average_gfxbench_temperature(allocator):
return return_averages(get_temperature(get_wl_traces(allocator, gfxbench)))
def get_average_gfxbench_cpu_frequency(allocator):
return return_averages(get_cpus_frequency(get_wl_traces(allocator, gfxbench)))
def percentage_improvement(a, b):
improvement = (np.divide(a, b))
print((improvement[0]+improvement[1] - 2)*100)
return (improvement[0]+improvement[1] - 2)*100
Results are averaged across 10 runs.
The following 3 code blocks show how the Hikey960 board performs when running the GeekBench5 and GFXBench Benchmarks.
Note: We also draw comparison with the Step-Wise (SW) governor to see how the Power-Allocator governor performs in general, as well as to determine how much the CPU is used in the given benchmark.
The value Tuple shows the following values:
It is immediately clear that Ziegler Nichols PID coeffiecients outperform that of the original estimates. There is a small improvement in the Single-Core Score and a significant improvement of the Multi-Core Score. Additionally we include the performance under the Step-Wise governor to provide a comparison.
The results show that there is overall a 9.59% improvement in on the GFXBench5 performance with ZN over SE.
Winner: ZN
The value Tuple shows the following values:
The results here contribute very-little. Between all 3 scenarios the results show only marginal differences. While SE outperforms ZN by 0.1%, we can conclude that the benchmark itself does not use the CPU in any meaningful way and so we can disregard the results.
Winner: Draw
print(get_average_geekbench_scores(ZN))
print(get_average_geekbench_scores(SE))
print(get_average_geekbench_scores(SW))
geek = percentage_improvement(get_average_geekbench_scores(ZN), get_average_geekbench_scores(SE))
(1744.2, 4394.5, 297.88991780281066) (1743.9, 4010.7, 310.7407777786255) (1739.6, 4534.8, 302.2964267969131) 9.586604671313825
print(get_average_gfxbench_scores(ZN))
print(get_average_gfxbench_scores(SE))
print(get_average_gfxbench_scores(SW))
gfx = percentage_improvement(get_average_gfxbench_scores(ZN), get_average_gfxbench_scores(SE))
(18.0, 9.900000000000002, 23.9, 35.8, 2390.5464832782745) (18.0, 9.91, 23.7, 35.5, 2441.0107715129852) (18.0, 9.920000000000002, 24.1, 35.8, 2419.61163854599) -0.10090817356203097
geek + gfx
9.485696497751794
A good PID system will try to find an optimal frequency to operate at over a sustained period of time. If the ideal frequency is somewhere between 2 OPP's, then we would expect it to fluctuate between the 2 adjacent OPP's.
What the following 4 snippets show is the total frequency residency for each of the CPU clusters across both benchmarks.
Due to the scaling the plots look similar, however, upon closer inspection it is apparant that the ZN plots have a more even time-split with around 65% of time being spent at the lowest frequency for CPU0 vs 80+% for SE. Due to the better thermal management, this means that ZN is also able to get a greater time-share in the highest frequency. For CPU4, SE does manage to get more cycles at the highest frequency, however it spends most of it's time at the lowest frequency. Conversly, ZN manages a better distribution of the middle OPP's which suggests that the optimal frequency is somewhere in the middle. These 2 results combined suggest that ZN does a better job at moving between OPPs to regulate temperature rather than SE which tends to flick primarily between max and min states.
Winner: ZN
We previously established that the CPU isn't used very much in GFXBench. However, that means that we want to be as efficient as possible when it is used. The plots for CPU4 (Big Core) look more or less identical between the 2 PID coefficients, but for CPU0 (Little Core) there is some significant variance. For SE, 90+% of the time is using the lowest frequency, compared with around 70% for ZN, and consequently can get more cycles out at higher frequencies. These results again show that ZN has a better distribution of Operating Frequencies than SE.
Winner: ZN
domain = 0
index = 0
pct = True
dev = "ZN"
alt = "SE"
benchmark = geekbench
dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
for i in range(1, 10):
dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
dev_plots + sw_plots
domain = 4
index = 0
pct = True
dev = "ZN"
alt = "SE"
benchmark = geekbench
dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
for i in range(1, 10):
dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
dev_plots + sw_plots
domain = 0
index = 0
pct = True
dev = "ZN"
alt = "SE"
benchmark = gfxbench
dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
for i in range(1, 10):
dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
dev_plots + sw_plots
domain = 4
index = 0
pct = True
dev = "ZN"
alt = "SE"
benchmark = gfxbench
dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
for i in range(1, 10):
dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index]
dev_plots + sw_plots
The following snippets show how the PID output varies across the benchmarks across the 10 runs.
While neither plots are "smooth", it is clear that SE has a much wilder set of PID output fluctuating between max and min output rapidly. ZN does fluctate but no where near to the same extent.
Winner: ZN
The difference between these 2 plots are night-and-day. ZN clearly has a smooth downwards curve as it approaches the target temperature whereas SE has much more significant drop between max and min output.
Winner: ZN
bench = geekbench
traces = get_wl_traces("ZN", bench)
se = get_wl_traces("SE", bench)
index = 0
for i in range(0,10):
dev = [(traces[i].df_event("thermal_power_allocator_pid"))]
b = [(se[i].df_event("thermal_power_allocator_pid"))]
if i == 0:
plt = hv.Curve(b[index]["output"])
output = hv.Curve(dev[index]["output"])
else:
output *= hv.Curve(dev[index]["output"])
plt *= hv.Curve(b[index]["output"])
output + plt
bench = gfxbench
traces = get_wl_traces("ZN", bench)
se = get_wl_traces("SE", bench)
index = 0
for i in range(0,10):
dev = [(traces[i].df_event("thermal_power_allocator_pid"))]
b = [(se[i].df_event("thermal_power_allocator_pid"))]
if i == 0:
plt = hv.Curve(b[index]["output"])
output = hv.Curve(dev[index]["output"])
else:
output *= hv.Curve(dev[index]["output"])
plt *= hv.Curve(b[index]["output"])
output + plt
The following snippets show the average temperature and CPU frequencies for across the 2 benchmarks.
Given that the target temperature is 75000mC, we see that ZN overshoots by 650mC and SE undershoots by 260mC. It is open to interpretation on what constitutes "better" but since SE is closer, it wins by a close margin.
Winner: SE
We see that ZN operates at around 45000Hz faster than SE.
Winner: ZN
Both sets of Coefficients overshoot the target temperature, which is largely driven by the GPU, with SE slightly performing better than ZN.
Winner: SE
While the CPU is not used heavily in GFXBench, it can be argued that higher is better. Further analysis should be provided to see what the impact of CPU frequency has on the performance with GFXBench.
Winner: SE
print("GeekBench Temperature")
print(get_average_geekbench_temperature("ZN")[2])
print(get_average_geekbench_temperature("SE")[2])
print(get_average_geekbench_temperature("SW")[2])
GeekBench Temperature 75649.10177095632 74740.85297292435 65193.36749574658
print("GeekBench CPU Freq")
print(get_average_geekbench_cpu_frequency("ZN")[2])
print(get_average_geekbench_cpu_frequency("SE")[2])
print(get_average_geekbench_cpu_frequency("SW")[2])
GeekBench CPU Freq 1456545.2518043953 1411671.7535723238 1330705.8275540834
print("GFXBench Temperature")
print(get_average_gfxbench_temperature("ZN")[2])
print(get_average_gfxbench_temperature("SE")[2])
print(get_average_gfxbench_temperature("SW")[2])
GFXBench Temperature 76772.77984693878 76113.54191118936 66897.63799801246
print("GFXBench CPU Freq")
print(get_average_gfxbench_cpu_frequency("ZN")[2])
print(get_average_gfxbench_cpu_frequency("SE")[2])
print(get_average_gfxbench_cpu_frequency("SW")[2])
GFXBench CPU Freq 1245030.9842552834 1355338.9051399915 1263972.0676866018
The lower the CDev state, the better, as that represents a higher operating frequency.
ZN runs on an average CDev state of 1.07 vs SE which is 1.85.
Winner: ZN
ZN runs on an average CDev State of 1.19 vs SE which is 2.39
Winner: ZN
ZN is the overall winner here as shown by the plots below, which shows that ZN alternates between middle Cdev states a lot more than SE which alternates between the Max and Min states more often than not.
print ("GeekBench CDev State")
print(get_average_cdev_state(get_wl_traces("ZN", geekbench))[2])
print(get_average_cdev_state(get_wl_traces("SE", geekbench))[2])
GeekBench CDev State 1.0682865210283845 1.854222535873912
print ("GFXBench CDev State")
print(get_average_cdev_state(get_wl_traces("ZN", gfxbench))[2])
print(get_average_cdev_state(get_wl_traces("SE", gfxbench))[2])
GFXBench CDev State 1.1883011985409067 2.39258337481334
plot_cooling_states(get_wl_traces("ZN", geekbench),0) + plot_cooling_states(get_wl_traces("SE",geekbench),0)
plot_cooling_states(get_wl_traces("ZN", geekbench),4) + plot_cooling_states(get_wl_traces("SE",geekbench),4)
Based on these results, it is fair to say that ZN is a better method of estimating PID coefficients. The main point being a 9% improvement on the GeekBench5 score, but also additional gains such as the PID controller operating in a more controlled fashion (smoother output curves).