This colab provides several example queries and graphs using Altair for the 2019 Google cluster trace. Further examples will be added over time.
Important: in order to be able to run the queries you will need to authenticate with your Google account and provide the id of a Google Cloud project that can run BigQuery jobs, as set up in the next cell.
#@title Please input your project id
import pandas as pd
import numpy as np
import altair as alt
from google.cloud import bigquery
# Provide credentials to the runtime
from google.colab import auth
from google.cloud.bigquery import magics
auth.authenticate_user()
print('Authenticated')
project_id = '' #@param {type: "string"}
# Set the default project id for %bigquery magic
magics.context.project = project_id
# Use a client object to run queries constructed by the helper functions below.
client = bigquery.Client(project=project_id)
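As a quick, optional check (not part of the trace analysis itself), a trivial query confirms the client is wired up correctly:
# Optional sanity check: run a trivial query to verify that authentication
# and the project id are configured correctly.
for row in client.query('SELECT 1 AS ok').result():
  print(row.ok)  # prints 1 on success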
This section shows the most basic way of querying the trace: the %%bigquery cell magic.
%%bigquery
SELECT capacity.cpus AS cpu_cap,
       capacity.memory AS memory_cap,
       COUNT(DISTINCT machine_id) AS num_machines
FROM `google.com:google-cluster-data`.clusterdata_2019_a.machine_events
GROUP BY 1, 2
%%bigquery
SELECT COUNT(DISTINCT collection_id) AS collections
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events;
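The magic can also hand its result to Python: naming a variable after %%bigquery stores the result set as a pandas DataFrame (the variable name num_collections below is our choice):
%%bigquery num_collections
SELECT COUNT(DISTINCT collection_id) AS collections
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events;
After the cell runs, num_collections is a regular pandas DataFrame in the notebook's namespace.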
This query takes a cell as input and plots a resource usage time series for every hour of the trace, broken down by tier.
#@title Select a cell and a resource to plot the cell level usage series
# Total CPU and memory capacity of a cell: each machine contributes the
# largest capacity it ever reports in machine_events.
def query_cell_capacity(cell):
  return '''
  SELECT SUM(cpu_cap) AS cpu_capacity,
         SUM(memory_cap) AS memory_capacity
  FROM (
    SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,
           MAX(capacity.memory) AS memory_cap
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events
    GROUP BY 1
  )
  '''.format(cell=cell)
# Per-instance 5-minute usage samples of top-level instances, joined with
# each instance's (highest) priority and the scheduling tier derived from it.
def query_per_instance_usage_priority(cell):
  return '''
  SELECT u.time AS time,
         u.collection_id AS collection_id,
         u.instance_index AS instance_index,
         e.priority AS priority,
         CASE
           WHEN e.priority BETWEEN 0 AND 99 THEN '1_free'
           WHEN e.priority BETWEEN 100 AND 115 THEN '2_beb'
           WHEN e.priority BETWEEN 116 AND 119 THEN '3_mid'
           ELSE '4_prod'
         END AS tier,
         u.cpu_usage AS cpu_usage,
         u.memory_usage AS memory_usage
  FROM (
    -- Usage records of top-level instances (not running inside an alloc),
    -- restricted to full-length 5-minute measurement windows.
    SELECT start_time AS time,
           collection_id,
           instance_index,
           machine_id,
           average_usage.cpus AS cpu_usage,
           average_usage.memory AS memory_usage
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage
    WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
      AND (end_time - start_time) >= (5 * 60 * 1e6)
  ) AS u JOIN (
    SELECT collection_id, instance_index, machine_id,
           MAX(priority) AS priority
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_events
    WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
    GROUP BY 1, 2, 3
  ) AS e ON u.collection_id = e.collection_id
        AND u.instance_index = e.instance_index
        AND u.machine_id = e.machine_id
  '''.format(cell=cell)
# Hourly utilization per tier. Each hour contains twelve 5-minute windows, so
# summed usage is divided by 12 (and by the cell's capacity) to obtain the
# average fraction of the cell used in that hour.
def query_per_tier_utilization_time_series(cell, cpu_capacity, memory_capacity):
  return '''
  SELECT CAST(FLOOR(time/(1e6 * 60 * 60)) AS INT64) AS hour_index,
         tier,
         SUM(cpu_usage) / (12 * {cpu_capacity}) AS avg_cpu_usage,
         SUM(memory_usage) / (12 * {memory_capacity}) AS avg_memory_usage
  FROM ({table})
  GROUP BY 1, 2 ORDER BY hour_index
  '''.format(table=query_per_instance_usage_priority(cell),
             cpu_capacity=cpu_capacity, memory_capacity=memory_capacity)
def run_query_utilization_per_time_time_series(cell):
  cell_cap = client.query(query_cell_capacity(cell)).to_dataframe()
  query = query_per_tier_utilization_time_series(
      cell,
      cell_cap['cpu_capacity'][0],
      cell_cap['memory_capacity'][0])
  time_series = client.query(query).to_dataframe()
  return time_series
cell = 'c' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
hourly_usage = run_query_utilization_per_time_time_series(cell)
# CPU graph
cpu = alt.Chart(hourly_usage).mark_area().encode(
    alt.X('hour_index:N'),
    alt.Y('avg_cpu_usage:Q'),
    color=alt.Color('tier', legend=alt.Legend(orient="left")),
    order=alt.Order('tier', sort='descending'),
    tooltip=['hour_index', 'tier', 'avg_cpu_usage']
)
cpu.encoding.x.title = "Hour"
cpu.encoding.y.title = "Average CPU usage"
cpu.display()
# Memory graph
memory = alt.Chart(hourly_usage).mark_area().encode(
    alt.X('hour_index:N'),
    alt.Y('avg_memory_usage:Q'),
    color=alt.Color('tier', legend=alt.Legend(orient="left")),
    order=alt.Order('tier', sort='descending'),
    tooltip=['hour_index', 'tier', 'avg_memory_usage']
)
memory.encoding.x.title = "Hour"
memory.encoding.y.title = "Average memory usage"
memory.display()
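The two charts can also be combined into one figure; Altair's vconcat stacks charts vertically (an optional sketch, not part of the original colab):
# Optional: display both charts as a single, vertically stacked figure.
alt.vconcat(cpu, memory)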
This query takes a cell as input and plots per-machine resource utilization CDFs, one for CPU and one for memory.
#@title Select a cell and plot its per-machine resource utilization CDFs
# Functions to plot CDFs using Altair
# - data: a dataframe with one row per data set, where column qcol holds an
#   array of quantiles (e.g. as returned by APPROX_QUANTILES with GROUP BY)
# - name: optional column of data whose values become the output column names
def pick_quantiles_from_tall_dataframe(data, qcol, name=""):
  quantiles = pd.DataFrame([x for x in data[qcol]]).transpose()
  if name != "":
    quantiles.columns = data[name]
  return quantiles
# - data: a dataframe with one row and one or more columns of quantile arrays
#   (results returned from APPROX_QUANTILES)
# - qcols: a list of names of the quantile columns
# - names: optional output column names, one per entry in qcols
def pick_quantiles_from_wide_dataframe(data, qcols, names=[]):
  quantiles = {}
  for i, qcol in enumerate(qcols):
    col_name = names[i] if i < len(names) else qcol
    quantiles[col_name] = data[qcol][0]
  return pd.DataFrame(quantiles)
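As a quick illustration (synthetic data, not from the trace), this is the shape the helper expects: one row, with each quantile column holding an array.
# Hypothetical example: a one-row dataframe mimicking APPROX_QUANTILES output.
demo = pd.DataFrame({'cpu_util_dist': [np.linspace(0, 1, 101)],
                     'memory_util_dist': [np.linspace(0, 0.8, 101)]})
q = pick_quantiles_from_wide_dataframe(
    demo, ['cpu_util_dist', 'memory_util_dist'], ['CPU', 'Memory'])
# q now has 101 rows and two columns named 'CPU' and 'Memory'.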
# - quantiles: a dataframe where each column contains the quantiles of one
#   data set. The index (i.e. row names) of the dataframe is the quantile. The
#   column names are the names of the data sets.
def plot_cdfs(quantiles, xlab="Value", ylab="CDF",
              legend_title="dataset", labels=[],
              interactive=False,
              title=''):
  dfs = []
  label = legend_title
  # Evenly spaced y-values in [0, 1], one per quantile row.
  yval = range(quantiles.shape[0])
  esp = 1.0 / (len(quantiles) - 1)
  yval = [y * esp for y in yval]
  # Make sure the legend column name does not clash with the axis names.
  while label == xlab or label == ylab:
    label += '_'
  for col_idx, col in enumerate(quantiles.columns):
    col_label = col
    if col_idx < len(labels):
      col_label = labels[col_idx]
    dfs.append(pd.DataFrame({
        label: col_label,
        xlab: quantiles[col],
        ylab: yval
    }))
  cdfs = pd.concat(dfs)
  lines = alt.Chart(cdfs).mark_line().encode(
      # A CDF is drawn over a continuous, real-valued domain.
      x=xlab + ":Q",
      y=ylab + ":Q",
      color=label + ":N"
  ).properties(
      title=title
  )
  if not interactive:
    return lines
  # Create a selection that chooses the nearest point and selects based on
  # the y-value.
  nearest = alt.selection(type='single', nearest=True, on='mouseover',
                          fields=[ylab], empty='none')
  # Transparent selectors across the chart. This is what tells us
  # the y-value of the cursor.
  selectors = alt.Chart(cdfs).mark_point().encode(
      y=ylab + ":Q",
      opacity=alt.value(0),
  ).properties(
      selection=nearest
  )
  # Draw text labels near the points, and highlight based on selection.
  text = lines.mark_text(align='left', dx=5, dy=-5).encode(
      text=alt.condition(nearest,
                         alt.Text(xlab + ":Q", format=".2f"),
                         alt.value(' '))
  )
  # Draw a rule at the location of the selection.
  rules = alt.Chart(cdfs).mark_rule(color='gray').encode(
      y=ylab + ":Q",
  ).transform_filter(
      nearest.ref()
  )
  # Draw points on the lines, and highlight based on selection.
  points = lines.mark_point().encode(
      opacity=alt.condition(nearest, alt.value(1), alt.value(0))
  )
  # Put the five layers into a chart and bind the data.
  return alt.layer(lines, selectors, rules, text, points).interactive(
      bind_y=False)
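For example, plot_cdfs can be exercised without BigQuery at all (synthetic data, for illustration only):
# Hypothetical usage with synthetic quantiles: a normal and a uniform sample.
rng = np.random.default_rng(0)
demo_q = pd.DataFrame({
    'normal': np.quantile(rng.normal(0.5, 0.1, 10000), np.linspace(0, 1, 101)),
    'uniform': np.quantile(rng.uniform(0, 1, 10000), np.linspace(0, 1, 101)),
})
plot_cdfs(demo_q, xlab='x', ylab='P(X < x)')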
# Functions to create the query
# Per-machine capacity: the largest CPU and memory capacity the machine ever
# reports in machine_events.
def query_machine_capacity(cell):
  return '''
  SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,
         MAX(capacity.memory) AS memory_cap
  FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events
  GROUP BY 1
  '''.format(cell=cell)
# 5-minute usage samples of top-level instances, with start_time aligned to
# 5-minute boundaries (timestamps are in microseconds).
def query_top_level_instance_usage(cell):
  return '''
  SELECT CAST(FLOOR(start_time/(1e6 * 300)) * (1e6 * 300) AS INT64) AS time,
         collection_id,
         instance_index,
         machine_id,
         average_usage.cpus AS cpu_usage,
         average_usage.memory AS memory_usage
  FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage
  WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
    AND (end_time - start_time) >= (5 * 60 * 1e6)
  '''.format(cell=cell)
# Total usage per machine and 5-minute window, joined with machine capacity.
def query_machine_usage(cell):
  return '''
  SELECT u.time AS time,
         u.machine_id AS machine_id,
         SUM(u.cpu_usage) AS cpu_usage,
         SUM(u.memory_usage) AS memory_usage,
         MAX(m.cpu_cap) AS cpu_capacity,
         MAX(m.memory_cap) AS memory_capacity
  FROM ({instance_usage}) AS u JOIN
       ({machine_capacity}) AS m
       ON u.machine_id = m.machine_id
  GROUP BY 1, 2
  '''.format(instance_usage=query_top_level_instance_usage(cell),
             machine_capacity=query_machine_capacity(cell))
# Percentiles (0..100) of per-machine utilization, with utilization capped at
# 1.0 when measured usage exceeds the machine's capacity.
def query_machine_utilization_distribution(cell):
  return '''
  SELECT APPROX_QUANTILES(
           IF(cpu_usage > cpu_capacity, 1.0, cpu_usage / cpu_capacity),
           100) AS cpu_util_dist,
         APPROX_QUANTILES(
           IF(memory_usage > memory_capacity, 1.0, memory_usage / memory_capacity),
           100) AS memory_util_dist
  FROM ({table})
  '''.format(table=query_machine_usage(cell))
cell = 'd' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
query = query_machine_utilization_distribution(cell)
machine_util_dist = client.query(query).to_dataframe()
chart = plot_cdfs(
    pick_quantiles_from_wide_dataframe(
        machine_util_dist, ['cpu_util_dist', 'memory_util_dist'],
        ['CPU', 'Memory']),
    xlab='x - resource utilization (fraction of capacity)',
    ylab='Probability (resource utilization < x)',
    interactive=True)
chart
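The chart object can also be exported; saving to a standalone HTML file works with stock Altair (optional):
# Optional: save the interactive chart for sharing outside the notebook.
chart.save('machine_utilization_cdfs.html')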