#!/usr/bin/env python # coding: utf-8 # # Description # # The PID controller in the Linux Kernel currently makes a rough estimation for the PID coefficients (which we will call **SE**, for **Standard Estimation**). # We want to investigate whether or not these can be improved by using an alternative method to compute these coefficients (which we will call **ZN**, for **Ziegler-Nichols**). # # ## Ziegler-Nichols Heuristic Algorithm # # The Ziegler-Nichols Heuristic was chosen due to its inherent simplicity and low computational overhead. # It works by initially setting the I and D terms of the PID controller to 0 and steadily increasing the P-term. It simultaneously monitors the temperature with the end goal being a stable set of oscillations between over and under-shooting. # Once this has been achieved it calculates the I and D terms based on the final value of the P term and the period of oscillations, and the P term is scaled down. # More details on this can be found here: https://en.wikipedia.org/wiki/Ziegler%E2%80%93Nichols_method # # ## Investigations (Comparing with SE): # 1. Benchmark Performance # 2. Distribution of CPU Frequency Domains # 3. PID Output Curves # 4. Average Temperature and CPU Frequencies # 5. 
Average CDev State # # In[1]: from lisa.platforms.platinfo import PlatformInfo from lisa.trace import Trace import os import numpy as np import json import pandas as pd import holoviews as hv hv.extension('bokeh') ZN_geekbench = [] SE_geekbench = [] SW_geekbench = [] ZN_gfxbench = [] SE_gfxbench = [] SW_gfxbench = [] trace_home_dir = "/path/to/traces/" platinfo = trace_home_dir + "platinfo.yaml" SW = trace_home_dir + "SW/" SE = trace_home_dir + "SE/" ZN = trace_home_dir + "ZN/" geekbench = "wk1-geekbench-1" gfxbench = "wk2-gfxbench-corporate-1" events = [ 'thermal_power_allocator_pid', 'thermal_power_cpu_limit', 'cdev_update', 'thermal_temperature', 'cpu_frequency', ] hikey_folders = [] for i in range(0, 10): dir_name = "hikey_data.{}/".format(i) hikey_folders.append(dir_name) # In[2]: allocator_to_traces = { ("ZN", geekbench) : ZN_geekbench, ("SE", geekbench) : SE_geekbench, ("SW", geekbench) : SW_geekbench, ("ZN", gfxbench) : ZN_gfxbench, ("SE", gfxbench) : SE_gfxbench, ("SW", gfxbench) : SW_gfxbench } allocator_to_dir = {"ZN" : ZN, "SE" : SE, "SW": SW, } # In[4]: def get_score(results, score): with open(results) as json_file: data = json.load(json_file) for p in data["metrics"]: if p["name"] == score: return p["value"] def get_single_core_score(results): return get_score(results, "score") def get_multi_core_score(results): return get_score(results, "multicore_score") def get_time(results): return get_score(results, "execution_time") def get_1080_manhattan(results): return get_score(results, "1080p Manhattan 3.1 Offscreen") def get_1440_manhattan(results): return get_score(results, "1440p Manhattan 3.1.1 Offscreen") def get_tessellation(results): return get_score(results, "Tessellation") def get_1080_tessellation(results): return get_score(results, "1080p Tessellation Offscreen") def get_file(path, file): return path +"/"+ file def get_trace_file(allocator, run, workload): path = allocator + run + workload return Trace(get_file(path, "trace.dat"), 
PlatformInfo.from_yaml_map(platinfo), events = events, normalize_time=True) def get_results_file(allocator, run, workload): path = allocator + run + workload return get_file(path, "result.json") def get_average_geekbench_scores(allocator): average_single_core = 0 average_multi_core = 0 average_time = 0 for run in hikey_folders: results = get_results_file(allocator, run, geekbench) average_single_core += get_single_core_score(results) average_multi_core += get_multi_core_score(results) average_time += get_time(results) average_single_core /= len(hikey_folders) average_multi_core /= len(hikey_folders) average_time /= len(hikey_folders) return average_single_core, average_multi_core, average_time def get_average_gfxbench_scores(allocator): average_1080_manhattan = 0 average_1440_manhattan = 0 average_tessellation = 0 average_1080_tessellation = 0 average_time = 0 for run in hikey_folders: results = get_results_file(allocator, run, gfxbench) average_1080_manhattan += get_1080_manhattan(results) average_1440_manhattan += get_1440_manhattan(results) average_tessellation += get_tessellation(results) average_1080_tessellation += get_1080_tessellation(results) average_time += get_time(results) average_1080_manhattan /= len(hikey_folders) average_1440_manhattan /= len(hikey_folders) average_tessellation /= len(hikey_folders) average_1080_tessellation /= len(hikey_folders) return average_1080_manhattan, average_1440_manhattan, average_tessellation, average_1080_tessellation, average_time def return_averages(df): return df.min(), df.max(), df.mean(), df.std() def get_temperature(traces): dfs = [] for trace in traces: dfs.append(trace.ana.thermal.df_thermal_zones_temperature()) all_data = pd.concat(dfs) return all_data['temp'] def get_cpus_frequency(traces): dfs = [] for trace in traces: dfs.append(trace.ana.frequency.df_cpus_frequency()) all_data = pd.concat(dfs) return all_data['frequency'] def get_average_cpu_frequency(traces): freqs_0 = [] freqs_4 = [] for trace in traces: 
freqs_0.append(trace.ana.frequency.get_average_cpu_frequency(0)) freqs_4.append(trace.ana.frequency.get_average_cpu_frequency(4)) return np.mean(freqs_0), np.mean(freqs_4) def get_average_cdev_state(traces): dfs = [] for trace in traces: dfs.append(trace.ana.thermal.df_cpufreq_cooling_state()) all_data = pd.concat(dfs) cdev = all_data['cdev_state'] return (return_averages(cdev)) def plot_cooling_states(traces, cpu): dfs = [] for trace in traces: dfs.append(trace.ana.thermal.df_cpufreq_cooling_state()) return traces[0].ana.thermal.plot_cpu_cooling_states(cpu)#,output='ui', link_dataframes=dfs) def get_wl_traces(allocator, workload): traces = [] if allocator_to_traces[(allocator, workload)] != []: return allocator_to_traces[(allocator, workload)] for run in hikey_folders: trace = get_trace_file(allocator_to_dir[allocator], run, workload) traces.append(trace) allocator_to_traces[(allocator, workload)] = traces return allocator_to_traces[(allocator, workload)] def get_average_geekbench_temperature(allocator): return return_averages(get_temperature(get_wl_traces(allocator, geekbench))) def get_average_geekbench_cpu_frequency(allocator): return return_averages(get_cpus_frequency(get_wl_traces(allocator, geekbench))) def get_average_gfxbench_temperature(allocator): return return_averages(get_temperature(get_wl_traces(allocator, gfxbench))) def get_average_gfxbench_cpu_frequency(allocator): return return_averages(get_cpus_frequency(get_wl_traces(allocator, gfxbench))) def percentage_improvement(a, b): improvement = (np.divide(a, b)) print((improvement[0]+improvement[1] - 2)*100) return (improvement[0]+improvement[1] - 2)*100 # # Benchmark Performance # # **Results are averaged across 10 runs.** # # The following 3 code blocks show how the Hikey960 board performs when running the GeekBench5 and GFXBench Benchmarks. 
# # **Note:** We also draw comparison with the *Step-Wise (SW)* governor to see how the Power-Allocator governor performs in general, as well as to determine how much the CPU is used in the given benchmark. # # ## GeekBench5 # The value Tuple shows the following values: # 1. Single-Core Score # 2. Multi-Core Score # 3. Time Taken (*seconds*) # # # It is immediately clear that Ziegler Nichols PID coeffiecients outperform that of the original estimates. # There is a small improvement in the Single-Core Score and a significant improvement of the Multi-Core Score. Additionally we include the performance under the Step-Wise governor to provide a comparison. # # The results show that there is overall a 9.59% improvement in on the GFXBench5 performance with ZN over SE. # # Winner: **ZN** # # ## GFXBench # The value Tuple shows the following values: # 1. 1080p Manhattan 3.1 Offscreen # 2. 1440p Manhattan 3.1.1 Offscreen # 3. Tessellation # 4. 1080p Tessllation Offscreen # # The results here contribute very-little. Between all 3 scenarios the results show only marginal differences. While SE outperforms ZN by 0.1%, we can conclude that the benchmark itself does not use the CPU in any meaningful way and so we can disregard the results. # # Winner: **Draw** # In[5]: print(get_average_geekbench_scores(ZN)) print(get_average_geekbench_scores(SE)) print(get_average_geekbench_scores(SW)) geek = percentage_improvement(get_average_geekbench_scores(ZN), get_average_geekbench_scores(SE)) # In[6]: print(get_average_gfxbench_scores(ZN)) print(get_average_gfxbench_scores(SE)) print(get_average_gfxbench_scores(SW)) gfx = percentage_improvement(get_average_gfxbench_scores(ZN), get_average_gfxbench_scores(SE)) # In[7]: geek + gfx # # Distribution of CPU Frequency Domains # # A good PID system will try to find an optimal frequency to operate at over a sustained period of time. 
If the ideal frequency is somewhere between 2 OPP's, then we would expect it to fluctuate between the 2 adjacent OPP's. # # What the following 4 snippets show is the total frequency residency for each of the CPU clusters across both benchmarks. # # ## GeekBench5 # # Due to the scaling the plots look similar, however, upon closer inspection it is apparent that the ZN plots have a more even time-split with around 65% of time being spent at the lowest frequency for CPU0 vs 80+% for SE. Due to the better thermal management, this means that ZN is also able to get a greater time-share in the highest frequency. # For CPU4, SE does manage to get more cycles at the highest frequency, however it spends most of its time at the lowest frequency. Conversely, ZN manages a better distribution of the middle OPP's which suggests that the optimal frequency is somewhere in the middle. # These 2 results combined suggest that ZN does a better job than SE at moving between OPPs to regulate temperature, as SE tends to flick primarily between max and min states. # # Winner: **ZN** # # ## GFXBench # # We previously established that the CPU isn't used very much in GFXBench. However, that means that we want to be as efficient as possible when it *is* used. # The plots for CPU4 (Big Core) look more or less identical between the 2 PID coefficients, but for CPU0 (Little Core) there is some significant variance. # For SE, 90+% of the time is using the lowest frequency, compared with around 70% for ZN, and consequently ZN can get more cycles out at higher frequencies. # These results again show that ZN has a better distribution of Operating Frequencies than SE. 
# # Winner: **ZN** # In[8]: domain = 0 index = 0 pct = True dev = "ZN" alt = "SE" benchmark = geekbench dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] for i in range(1, 10): dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] dev_plots + sw_plots # In[9]: domain = 4 index = 0 pct = True dev = "ZN" alt = "SE" benchmark = geekbench dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] for i in range(1, 10): dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] dev_plots + sw_plots # In[10]: domain = 0 index = 0 pct = True dev = "ZN" alt = "SE" benchmark = gfxbench dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots = get_wl_traces(alt, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] for i in range(1, 10): dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] dev_plots + sw_plots # In[12]: domain = 4 index = 0 pct = True dev = "ZN" alt = "SE" benchmark = gfxbench dev_plots = get_wl_traces(dev, benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots = get_wl_traces(alt, 
benchmark)[0].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] for i in range(1, 10): dev_plots *= get_wl_traces(dev, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] sw_plots *= get_wl_traces(alt, benchmark)[i].ana.frequency.plot_cpu_frequency_residency(domain, pct=pct)[index] dev_plots + sw_plots # # PID Output Curves # # The following snippets show how the PID output varies across the benchmarks across the 10 runs. # # ## GeekBench # # While neither plots are "smooth", it is clear that SE has a much wilder set of PID output fluctuating between max and min output rapidly. # ZN does fluctate but no where near to the same extent. # # Winner: **ZN** # # ## GFXBench # # The difference between these 2 plots are night-and-day. ZN clearly has a smooth downwards curve as it approaches the target temperature whereas SE has much more significant drop between max and min output. # # Winner: **ZN** # In[14]: bench = geekbench traces = get_wl_traces("ZN", bench) se = get_wl_traces("SE", bench) index = 0 for i in range(0,10): dev = [(traces[i].df_event("thermal_power_allocator_pid"))] b = [(se[i].df_event("thermal_power_allocator_pid"))] if i == 0: plt = hv.Curve(b[index]["output"]) output = hv.Curve(dev[index]["output"]) else: output *= hv.Curve(dev[index]["output"]) plt *= hv.Curve(b[index]["output"]) output + plt # In[15]: bench = gfxbench traces = get_wl_traces("ZN", bench) se = get_wl_traces("SE", bench) index = 0 for i in range(0,10): dev = [(traces[i].df_event("thermal_power_allocator_pid"))] b = [(se[i].df_event("thermal_power_allocator_pid"))] if i == 0: plt = hv.Curve(b[index]["output"]) output = hv.Curve(dev[index]["output"]) else: output *= hv.Curve(dev[index]["output"]) plt *= hv.Curve(b[index]["output"]) output + plt # # Average Temperature and CPU Frequencies # # The following snippets show the average temperature and CPU frequencies for across the 2 benchmarks. 
# # ## GeekBench5 # # ### Temperature # # Given that the target temperature is 75000mC, we see that ZN overshoots by 650mC and SE undershoots by 260mC. It is open to interpretation on what constitutes "better" but since SE is closer, it wins by a close margin. # # Winner: **SE** # # ### CPU Frequencies # # We see that ZN operates at around 45000Hz faster than SE. # # Winner: **ZN** # # ## GFXBench # ### Temperature # # Both sets of Coefficients overshoot the target temperature, which is largely driven by the GPU, with SE slightly performing better than ZN. # # Winner: **SE** # # ### CPU Frequencies # # While the CPU is not used heavily in GFXBench, it can be argued that higher is better. Further analysis should be provided to see what the impact of CPU frequency has on the performance with GFXBench. # # Winner: **SE** # In[24]: print("GeekBench Temperature") print(get_average_geekbench_temperature("ZN")[2]) print(get_average_geekbench_temperature("SE")[2]) print(get_average_geekbench_temperature("SW")[2]) # In[25]: print("GeekBench CPU Freq") print(get_average_geekbench_cpu_frequency("ZN")[2]) print(get_average_geekbench_cpu_frequency("SE")[2]) print(get_average_geekbench_cpu_frequency("SW")[2]) # In[26]: print("GFXBench Temperature") print(get_average_gfxbench_temperature("ZN")[2]) print(get_average_gfxbench_temperature("SE")[2]) print(get_average_gfxbench_temperature("SW")[2]) # In[27]: print("GFXBench CPU Freq") print(get_average_gfxbench_cpu_frequency("ZN")[2]) print(get_average_gfxbench_cpu_frequency("SE")[2]) print(get_average_gfxbench_cpu_frequency("SW")[2]) # # Average CDev State # # The lower the CDev state, the better, as that represents a higher operating frequency. # # ## GeekBench # # ZN runs on an average CDev state of 1.07 vs SE which is 1.85. 
# # Winner: **ZN** # # ## GFXBench # ZN runs on an average CDev State of 1.19 vs SE which is 2.39 # # Winner: **ZN** # # ZN is the overall winner here as shown by the plots below, which shows that ZN alternates between middle Cdev states a lot more than SE which alternates between the Max and Min states more often than not. # In[28]: print ("GeekBench CDev State") print(get_average_cdev_state(get_wl_traces("ZN", geekbench))[2]) print(get_average_cdev_state(get_wl_traces("SE", geekbench))[2]) # In[29]: print ("GFXBench CDev State") print(get_average_cdev_state(get_wl_traces("ZN", gfxbench))[2]) print(get_average_cdev_state(get_wl_traces("SE", gfxbench))[2]) # In[30]: plot_cooling_states(get_wl_traces("ZN", geekbench),0) + plot_cooling_states(get_wl_traces("SE",geekbench),0) # In[31]: plot_cooling_states(get_wl_traces("ZN", geekbench),4) + plot_cooling_states(get_wl_traces("SE",geekbench),4) # # Conclusions # # Based on these results, it is fair to say that ZN is a better method of estimating PID coefficients. The main point being a 9% improvement on the GeekBench5 score, but also additional gains such as the PID controller operating in a more controlled fashion (smoother output curves).