#!/usr/bin/env python
# coding: utf-8

# # Kubernetes node diagnostics!
#
# Step 1: connect to the cluster

# In[1]:

import kubernetes.config
import kubernetes.client

# In[2]:

context = 'prod-a'
namespace = 'prod'

kubernetes.config.load_kube_config(context=context)
kube = kubernetes.client.CoreV1Api()

# Get a list of our nodes

# In[3]:

all_nodes = kube.list_node().items
nodes = [node.metadata.name for node in all_nodes]
# key each node by the short suffix of its name (e.g. 'jsv7')
node_dict = {node.metadata.name.rsplit('-', 1)[-1]: node for node in all_nodes}
nodes

# Utilities for running commands on all of our nodes via
#
# ```bash
# gcloud compute ssh
# ```

# In[4]:

from concurrent.futures import ThreadPoolExecutor
from subprocess import check_output

def ssh(node, cmd, parse=None):
    """Run a shell command on a node via `gcloud compute ssh`.

    If `parse` is given, it is called on the decoded output.
    """
    out = check_output([
        'gcloud', 'compute', 'ssh', node,
        '--', cmd,
    ]).decode('utf8', 'replace').strip()
    if parse:
        out = parse(out)
    return out

pool = ThreadPoolExecutor(len(nodes))

def ssh_all(cmd, parse=None):
    """Run `cmd` on every node concurrently; return {short name: result}."""
    results = {}
    for node in nodes:
        key = node.rsplit('-', 1)[-1]
        results[key] = pool.submit(ssh, node, cmd, parse=parse)
    # wait for results
    for key, future in list(results.items()):
        results[key] = future.result()
    return results

# Collect some data from the nodes

# In[5]:

# count docker images (note: `wc -l` counts include one header line)
images = ssh_all('docker images | wc -l', parse=int)
all_images = ssh_all('docker images -a | wc -l', parse=int)
mounts = ssh_all('mount | wc -l', parse=int)

# In[6]:

def _parse_inode_usage(lines):
    """Pull the IUse% column for the stateful partition out of `df -i` output."""
    for line in lines.splitlines():
        if line.endswith('stateful_partition'):
            return int(line.split()[4].rstrip('%'))

inode_usage = ssh_all('df -i', parse=_parse_inode_usage)

# In[13]:

# count 'no space left' messages in the last day
no_space = ssh_all('sudo journalctl --since=yesterday | grep -i "no space left" | wc -l',
                   parse=int)
no_space

# Now do some plotting

# In[9]:

import datetime

import matplotlib.pyplot as plt
import pandas as pd

now = datetime.datetime.now(tz=datetime.timezone.utc)
plt.ion()

# Turn our data into a nice dataframe.
#
# We can see that `jsv7` is the only node seeing 'no space left on device' messages.

# In[14]:

data = {}
data['node'] = keys = list(images.keys())
data['mounts'] = [mounts[key] for key in keys]
data['tagged images'] = [images[key] for key in keys]
data['all images'] = [all_images[key] for key in keys]
# node age in days
data['age'] = [
    (now - node_dict[key].metadata.creation_timestamp).total_seconds() / (24 * 3600)
    for key in keys
]
data['no space msgs'] = [no_space[key] for key in keys]
data['inode usage'] = [inode_usage[key] for key in keys]

df = pd.DataFrame.from_dict(data)
# reindex by node
df.index = df.pop('node')
df

# Plot our metrics:

# In[17]:

df.plot(kind='bar', subplots=True,
        y=['all images', 'tagged images', 'inode usage', 'mounts']);
# plt.ylim(0, 4000);

# In[18]:

df.age.plot(kind='bar', title="Node age (days)")

# # Correlating metrics
#
# Compute the correlation matrix to see which metric might be the best predictor
# of out-of-space errors.

# In[19]:

import seaborn as sns

corr = df.corr()
corr['no space msgs']

# It looks like the number of mounts has the highest correlation,
# followed by the age of the node.
#
# Interestingly, the total number of docker images is *negatively* correlated,
# suggesting that pure image count isn't the right metric.
#
# Correlation heatmap!

# In[20]:

sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
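
# With this few nodes the correlation is only suggestive, so a quick sanity check
# (a follow-up sketch, not part of the original run) is to plot the top predictor
# directly against the error counts:

# In[ ]:

# scatter the strongest predictor against the error count
df.plot(kind='scatter', x='mounts', y='no space msgs');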
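
# If these are Container-Optimized OS nodes (an assumption here, suggested by the
# `stateful_partition` mount), leaked container `overlay` mounts are one plausible
# source of mount growth. A hedged sketch of how we might count just those — the
# trailing `wc -l` keeps the pipeline's exit status zero even when grep matches
# nothing, so `check_output` won't raise:

# In[ ]:

overlay_mounts = ssh_all('mount | grep overlay | wc -l', parse=int)
overlay_mounts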
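
# We could also drill into the affected node itself. A sketch (assuming `jsv7`,
# the node seeing the errors above, is the one we want) that tallies its mounts
# by filesystem type, reusing the `ssh` helper:

# In[ ]:

def _parse_mount_types(out):
    # `mount` lines look like: <src> on <dst> type <fstype> (<opts>)
    counts = {}
    for line in out.splitlines():
        parts = line.split()
        if 'type' in parts:
            fstype = parts[parts.index('type') + 1]
            counts[fstype] = counts.get(fstype, 0) + 1
    return counts

ssh(node_dict['jsv7'].metadata.name, 'mount', parse=_parse_mount_types)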