Step 1: connect to the cluster
import kubernetes.config
import kubernetes.client
context = 'prod-a'
namespace = 'prod'
kubernetes.config.load_kube_config(context=context)
kube = kubernetes.client.CoreV1Api()
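If your kubeconfig defines several clusters, you can double-check which contexts are available before picking one (a small sketch using the same `kubernetes.config` API; the names returned are whatever your kubeconfig defines):
contexts, active = kubernetes.config.list_kube_config_contexts()
[c['name'] for c in contexts]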
Get a list of our nodes
all_nodes = kube.list_node().items
nodes = [ node.metadata.name for node in all_nodes ]
node_dict = {
    node.metadata.name.rsplit('-', 1)[-1]: node
    for node in all_nodes
}
nodes
['gke-prod-a-default-pool-6ab3d8ec-jsv7', 'gke-prod-a-default-pool-6ab3d8ec-mhw8', 'gke-prod-a-default-pool-6ab3d8ec-mzfb', 'gke-prod-a-default-pool-6ab3d8ec-rq6b', 'gke-prod-a-default-pool-6ab3d8ec-xq51']
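The full GKE node names share a long common prefix, so we key everything by the final suffix. A quick sanity check (the expected keys follow directly from the node list above):
sorted(node_dict)
['jsv7', 'mhw8', 'mzfb', 'rq6b', 'xq51']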
Utilities for running commands on all of our nodes via `gcloud compute ssh`.
from subprocess import check_output
def ssh(node, cmd, parse=None):
    """Run cmd on a node via `gcloud compute ssh`; optionally parse the output."""
    out = check_output([
        'gcloud',
        'compute',
        'ssh',
        node,
        '--',
        cmd,
    ]).decode('utf8', 'replace').strip()
    if parse:
        out = parse(out)
    return out
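For example, a one-off call looks like this (`uptime` here is just an illustrative command, not part of the analysis):
ssh(nodes[0], 'uptime')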
from concurrent.futures import ThreadPoolExecutor
pool = ThreadPoolExecutor(len(nodes))
def ssh_all(cmd, parse=None):
    """Run cmd on every node in parallel; return {node suffix: output}."""
    results = {}
    for node in nodes:
        key = node.rsplit('-', 1)[-1]
        results[key] = pool.submit(ssh, node, cmd, parse=parse)
    # wait for results
    for key, future in list(results.items()):
        results[key] = future.result()
    return results
Collect some data from the nodes
# count docker images
images = ssh_all('docker images | wc -l', parse=int)
all_images = ssh_all('docker images -a | wc -l', parse=int)
mounts = ssh_all('mount | wc -l', parse=int)
def _parse_inode_usage(lines):
    """Return the IUse% of the stateful partition from `df -i` output."""
    for line in lines.splitlines():
        if line.endswith('stateful_partition'):
            return int(line.split()[4].rstrip('%'))
inode_usage = ssh_all('df -i', parse=_parse_inode_usage)
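For reference, a quick check against a made-up `df -i` row (the numbers are illustrative; what matters is that IUse% is field 4 and the mount point ends with `stateful_partition`):
_parse_inode_usage(
    "Filesystem      Inodes  IUsed   IFree IUse% Mounted on\n"
    "/dev/sda1      1253376 162934 1090442   13% /mnt/stateful_partition"
)
# -> 13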
# no space messages in the last day
no_space = ssh_all('sudo journalctl --since=yesterday | grep -i "no space left" | wc -l', parse=int)
no_space
{'jsv7': 10671, 'mhw8': 0, 'mzfb': 0, 'rq6b': 0, 'xq51': 0}
Now do some plotting
import datetime
import matplotlib.pyplot as plt
import pandas as pd
now = datetime.datetime.now(tz=datetime.timezone.utc)
plt.ion()
Turn our data into a nice dataframe. We can see that `jsv7` is the only node seeing 'no space left on device' messages.
data = {}
data['node'] = keys = list(images.keys())
data['mounts'] = [ mounts[key] for key in keys ]
data['tagged images'] = [ images[key] for key in keys ]
data['all images'] = [ all_images[key] for key in keys ]
# node age in days
data['age'] = [
    (now - node_dict[key].metadata.creation_timestamp).total_seconds() / (24 * 3600)
    for key in keys
]
data['no space msgs'] = [ no_space[key] for key in keys ]
data['inode usage'] = [ inode_usage[key] for key in keys ]
df = pd.DataFrame.from_dict(data)
# index by node suffix
df = df.set_index('node')
df
| node | age | all images | inode usage | mounts | no space msgs | tagged images |
|---|---|---|---|---|---|---|
| jsv7 | 25.415210 | 1280 | 13 | 3191 | 10671 | 493 |
| mhw8 | 7.680499 | 547 | 6 | 806 | 0 | 186 |
| mzfb | 19.698566 | 1500 | 13 | 2565 | 0 | 547 |
| rq6b | 7.670430 | 1824 | 13 | 650 | 0 | 572 |
| xq51 | 18.570950 | 1455 | 12 | 633 | 0 | 487 |
Plot our metrics:
df.plot(kind='bar', subplots=True, y=['all images', 'tagged images', 'inode usage', 'mounts']);
# plt.ylim(0, 4000);
df.age.plot(kind='bar', title="Node age (days)")
Compute the correlation matrix to see what might be the best predictor of out-of-space errors.
import seaborn as sns
corr = df.corr()
corr['no space msgs']
age               0.683030
all images       -0.048453
inode usage       0.293294
mounts            0.744964
no space msgs     1.000000
tagged images     0.129258
Name: no space msgs, dtype: float64
Looks like the number of mounts has the highest correlation, followed by the age of the node.
Interestingly, the total number of Docker images is slightly negatively correlated, suggesting that raw image count isn't the right metric.
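As a spot check, the same number can be recomputed pairwise (a minimal sketch; `Series.corr` defaults to Pearson, matching `df.corr()`):
# should equal corr['no space msgs']['mounts']
df['mounts'].corr(df['no space msgs'])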
Correlation heatmap!
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)