This colab provides several example queries and graphs using Altair for the 2019 Google cluster trace. Further examples will be added over time.
Important: in order to be able to run the queries you will need to authenticate with your Google account and provide the id of a Google Cloud project that can run BigQuery jobs, as set up in the next cell.
#@title Please input your project id
import pandas as pd
import numpy as np
import altair as alt
from google.cloud import bigquery
# Provide credentials to the runtime
from google.colab import auth
from google.cloud.bigquery import magics
auth.authenticate_user()
print('Authenticated')
project_id = '' #@param {type: "string"}
# Set the default project id for %bigquery magic
magics.context.project = project_id
# Use a client object to run queries constructed by the helper functions below.
client = bigquery.Client(project=project_id)
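As a quick, optional check (not part of the trace analysis itself), a trivial query confirms the client is wired up correctly:
# Optional sanity check: run a trivial query to verify that authentication
# and the project id are configured correctly.
for row in client.query('SELECT 1 AS ok').result():
  print(row.ok)  # prints 1 on success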
This section shows the most basic way of querying the trace: the %%bigquery cell magic.
%%bigquery
SELECT capacity.cpus AS cpu_cap,
       capacity.memory AS memory_cap,
       COUNT(DISTINCT machine_id) AS num_machines
FROM `google.com:google-cluster-data`.clusterdata_2019_a.machine_events
GROUP BY 1, 2
%%bigquery
SELECT COUNT(DISTINCT collection_id) AS collections
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events;
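The magic can also hand its result to Python: naming a variable after %%bigquery stores the result set as a pandas DataFrame (the variable name num_collections below is our choice):
%%bigquery num_collections
SELECT COUNT(DISTINCT collection_id) AS collections
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events;
After the cell runs, num_collections is a regular pandas DataFrame in the notebook's namespace.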
This query takes a cell as input and plots a resource usage time series for every hour of the trace, broken down by tier.
#@title Select a cell and a resource to plot the cell level usage series
# Total CPU and memory capacity of a cell: each machine contributes the
# largest capacity it ever reports in machine_events.
def query_cell_capacity(cell):
  return '''
  SELECT SUM(cpu_cap) AS cpu_capacity,
         SUM(memory_cap) AS memory_capacity
  FROM (
    SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,
           MAX(capacity.memory) AS memory_cap
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events
    GROUP BY 1
  )
  '''.format(cell=cell)
# Per-instance 5-minute usage samples of top-level instances, joined with
# each instance's (highest) priority and the scheduling tier derived from it.
def query_per_instance_usage_priority(cell):
  return '''
  SELECT u.time AS time,
         u.collection_id AS collection_id,
         u.instance_index AS instance_index,
         e.priority AS priority,
         CASE
           WHEN e.priority BETWEEN 0 AND 99 THEN '1_free'
           WHEN e.priority BETWEEN 100 AND 115 THEN '2_beb'
           WHEN e.priority BETWEEN 116 AND 119 THEN '3_mid'
           ELSE '4_prod'
         END AS tier,
         u.cpu_usage AS cpu_usage,
         u.memory_usage AS memory_usage
  FROM (
    -- Usage records of top-level instances (not running inside an alloc),
    -- restricted to full-length 5-minute measurement windows.
    SELECT start_time AS time,
           collection_id,
           instance_index,
           machine_id,
           average_usage.cpus AS cpu_usage,
           average_usage.memory AS memory_usage
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage
    WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
      AND (end_time - start_time) >= (5 * 60 * 1e6)
  ) AS u JOIN (
    SELECT collection_id, instance_index, machine_id,
           MAX(priority) AS priority
    FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_events
    WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
    GROUP BY 1, 2, 3
  ) AS e ON u.collection_id = e.collection_id
        AND u.instance_index = e.instance_index
        AND u.machine_id = e.machine_id
  '''.format(cell=cell)
# Hourly utilization per tier. Each hour contains twelve 5-minute windows, so
# summed usage is divided by 12 (and by the cell's capacity) to obtain the
# average fraction of the cell used in that hour.
def query_per_tier_utilization_time_series(cell, cpu_capacity, memory_capacity):
  return '''
  SELECT CAST(FLOOR(time/(1e6 * 60 * 60)) AS INT64) AS hour_index,
         tier,
         SUM(cpu_usage) / (12 * {cpu_capacity}) AS avg_cpu_usage,
         SUM(memory_usage) / (12 * {memory_capacity}) AS avg_memory_usage
  FROM ({table})
  GROUP BY 1, 2 ORDER BY hour_index
  '''.format(table=query_per_instance_usage_priority(cell),
             cpu_capacity=cpu_capacity, memory_capacity=memory_capacity)
def run_query_utilization_per_time_time_series(cell):
  cell_cap = client.query(query_cell_capacity(cell)).to_dataframe()
  query = query_per_tier_utilization_time_series(
      cell,
      cell_cap['cpu_capacity'][0],
      cell_cap['memory_capacity'][0])
  time_series = client.query(query).to_dataframe()
  return time_series
cell = 'c' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
hourly_usage = run_query_utilization_per_time_time_series(cell)
# CPU graph
cpu = alt.Chart(hourly_usage).mark_area().encode(
    alt.X('hour_index:N'),
    alt.Y('avg_cpu_usage:Q'),
    color=alt.Color('tier', legend=alt.Legend(orient="left")),
    order=alt.Order('tier', sort='descending'),
    tooltip=['hour_index', 'tier', 'avg_cpu_usage']
)
cpu.encoding.x.title = "Hour"
cpu.encoding.y.title = "Average CPU usage"
cpu.display()
# Memory graph
memory = alt.Chart(hourly_usage).mark_area().encode(
    alt.X('hour_index:N'),
    alt.Y('avg_memory_usage:Q'),
    color=alt.Color('tier', legend=alt.Legend(orient="left")),
    order=alt.Order('tier', sort='descending'),
    tooltip=['hour_index', 'tier', 'avg_memory_usage']
)
memory.encoding.x.title = "Hour"
memory.encoding.y.title = "Average memory usage"
memory.display()
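The two charts can also be combined into one figure; Altair's vconcat stacks charts vertically (an optional sketch, not part of the original colab):
# Optional: display both charts as a single, vertically stacked figure.
alt.vconcat(cpu, memory)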
This query takes a cell as input and plots per-machine resource utilization CDFs, one for CPU and one for memory.
#@title Select a cell and plot its per-machine resource utilization CDFs
# Functions to plot CDFs using Altair
# - data: a dataframe with one row per data set, where column qcol holds an
#   array of quantiles (e.g. as returned by APPROX_QUANTILES with GROUP BY)
# - name: optional column of data whose values become the output column names
def pick_quantiles_from_tall_dataframe(data, qcol, name=""):
  quantiles = pd.DataFrame([x for x in data[qcol]]).transpose()
  if name != "":
    quantiles.columns = data[name]
  return quantiles
# - data: a dataframe with one row and one or more columns of quantile arrays
#   (results returned from APPROX_QUANTILES)
# - qcols: a list of names of the quantile columns
# - names: optional output column names, one per entry in qcols
def pick_quantiles_from_wide_dataframe(data, qcols, names=[]):
  quantiles = {}
  for i, qcol in enumerate(qcols):
    col_name = names[i] if i < len(names) else qcol
    quantiles[col_name] = data[qcol][0]
  return pd.DataFrame(quantiles)
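As a quick illustration (synthetic data, not from the trace), this is the shape the helper expects: one row, with each quantile column holding an array.
# Hypothetical example: a one-row dataframe mimicking APPROX_QUANTILES output.
demo = pd.DataFrame({'cpu_util_dist': [np.linspace(0, 1, 101)],
                     'memory_util_dist': [np.linspace(0, 0.8, 101)]})
q = pick_quantiles_from_wide_dataframe(
    demo, ['cpu_util_dist', 'memory_util_dist'], ['CPU', 'Memory'])
# q now has 101 rows and two columns named 'CPU' and 'Memory'.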
# - quantiles: a dataframe where each column contains the quantiles of one
#   data set. The index (i.e. row names) of the dataframe is the quantile. The
#   column names are the names of the data sets.
def plot_cdfs(quantiles, xlab="Value", ylab="CDF",
              legend_title="dataset", labels=[],
              interactive=False,
              title=''):
  dfs = []
  label = legend_title
  # Evenly spaced y-values in [0, 1], one per quantile row.
  yval = range(quantiles.shape[0])
  esp = 1.0 / (len(quantiles) - 1)
  yval = [y * esp for y in yval]
  # Make sure the legend column name does not clash with the axis names.
  while label == xlab or label == ylab:
    label += '_'
  for col_idx, col in enumerate(quantiles.columns):
    col_label = col
    if col_idx < len(labels):
      col_label = labels[col_idx]
    dfs.append(pd.DataFrame({
        label: col_label,
        xlab: quantiles[col],
        ylab: yval
    }))
  cdfs = pd.concat(dfs)
  lines = alt.Chart(cdfs).mark_line().encode(
      # A CDF is drawn over a continuous, real-valued domain.
      x=xlab + ":Q",
      y=ylab + ":Q",
      color=label + ":N"
  ).properties(
      title=title
  )
  if not interactive:
    return lines
  # Create a selection that chooses the nearest point and selects based on
  # the y-value.
  nearest = alt.selection(type='single', nearest=True, on='mouseover',
                          fields=[ylab], empty='none')
  # Transparent selectors across the chart. This is what tells us
  # the y-value of the cursor.
  selectors = alt.Chart(cdfs).mark_point().encode(
      y=ylab + ":Q",
      opacity=alt.value(0),
  ).properties(
      selection=nearest
  )
  # Draw text labels near the points, and highlight based on selection.
  text = lines.mark_text(align='left', dx=5, dy=-5).encode(
      text=alt.condition(nearest,
                         alt.Text(xlab + ":Q", format=".2f"),
                         alt.value(' '))
  )
  # Draw a rule at the location of the selection.
  rules = alt.Chart(cdfs).mark_rule(color='gray').encode(
      y=ylab + ":Q",
  ).transform_filter(
      nearest.ref()
  )
  # Draw points on the lines, and highlight based on selection.
  points = lines.mark_point().encode(
      opacity=alt.condition(nearest, alt.value(1), alt.value(0))
  )
  # Put the five layers into a chart and bind the data.
  return alt.layer(lines, selectors, rules, text, points).interactive(
      bind_y=False)
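For example, plot_cdfs can be exercised without BigQuery at all (synthetic data, for illustration only):
# Hypothetical usage with synthetic quantiles: a normal and a uniform sample.
rng = np.random.default_rng(0)
demo_q = pd.DataFrame({
    'normal': np.quantile(rng.normal(0.5, 0.1, 10000), np.linspace(0, 1, 101)),
    'uniform': np.quantile(rng.uniform(0, 1, 10000), np.linspace(0, 1, 101)),
})
plot_cdfs(demo_q, xlab='x', ylab='P(X < x)')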
# Functions to create the query
# Per-machine capacity: the largest CPU and memory capacity the machine ever
# reports in machine_events.
def query_machine_capacity(cell):
  return '''
  SELECT machine_id, MAX(capacity.cpus) AS cpu_cap,
         MAX(capacity.memory) AS memory_cap
  FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.machine_events
  GROUP BY 1
  '''.format(cell=cell)
# 5-minute usage samples of top-level instances, with start_time aligned to
# 5-minute boundaries (timestamps are in microseconds).
def query_top_level_instance_usage(cell):
  return '''
  SELECT CAST(FLOOR(start_time/(1e6 * 300)) * (1e6 * 300) AS INT64) AS time,
         collection_id,
         instance_index,
         machine_id,
         average_usage.cpus AS cpu_usage,
         average_usage.memory AS memory_usage
  FROM `google.com:google-cluster-data`.clusterdata_2019_{cell}.instance_usage
  WHERE (alloc_collection_id IS NULL OR alloc_collection_id = 0)
    AND (end_time - start_time) >= (5 * 60 * 1e6)
  '''.format(cell=cell)
# Total usage per machine and 5-minute window, joined with machine capacity.
def query_machine_usage(cell):
  return '''
  SELECT u.time AS time,
         u.machine_id AS machine_id,
         SUM(u.cpu_usage) AS cpu_usage,
         SUM(u.memory_usage) AS memory_usage,
         MAX(m.cpu_cap) AS cpu_capacity,
         MAX(m.memory_cap) AS memory_capacity
  FROM ({instance_usage}) AS u JOIN
       ({machine_capacity}) AS m
       ON u.machine_id = m.machine_id
  GROUP BY 1, 2
  '''.format(instance_usage=query_top_level_instance_usage(cell),
             machine_capacity=query_machine_capacity(cell))
# Percentiles (0..100) of per-machine utilization, with utilization capped at
# 1.0 when measured usage exceeds the machine's capacity.
def query_machine_utilization_distribution(cell):
  return '''
  SELECT APPROX_QUANTILES(
           IF(cpu_usage > cpu_capacity, 1.0, cpu_usage / cpu_capacity),
           100) AS cpu_util_dist,
         APPROX_QUANTILES(
           IF(memory_usage > memory_capacity, 1.0, memory_usage / memory_capacity),
           100) AS memory_util_dist
  FROM ({table})
  '''.format(table=query_machine_usage(cell))
cell = 'd' #@param ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
query = query_machine_utilization_distribution(cell)
machine_util_dist = client.query(query).to_dataframe()
chart = plot_cdfs(
    pick_quantiles_from_wide_dataframe(
        machine_util_dist, ['cpu_util_dist', 'memory_util_dist'],
        ['CPU', 'Memory']),
    xlab='x - resource utilization (fraction of capacity)',
    ylab='Probability (resource utilization < x)',
    interactive=True)
chart
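The chart object can also be exported; saving to a standalone HTML file works with stock Altair (optional):
# Optional: save the interactive chart for sharing outside the notebook.
chart.save('machine_utilization_cdfs.html')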