#!/usr/bin/env python
# coding: utf-8

# # Kubernetes node diagnostics!
#
# Step 1: connect to the cluster

# In[1]:

import kubernetes.config
import kubernetes.client

# In[2]:

context = 'prod-a'
namespace = 'prod'

kubernetes.config.load_kube_config(context=context)
kube = kubernetes.client.CoreV1Api()

# Get a list of our nodes

# In[3]:

all_nodes = kube.list_node().items
nodes = [node.metadata.name for node in all_nodes]
# key each node by the short suffix of its name (e.g. 'jsv7')
node_dict = {node.metadata.name.rsplit('-', 1)[-1]: node for node in all_nodes}
nodes

# Utilities for running commands on all of our nodes via
#
# ```bash
# gcloud compute ssh
# ```

# In[4]:

from concurrent.futures import ThreadPoolExecutor
from subprocess import check_output

def ssh(node, cmd, parse=None):
    """Run a shell command on a node via `gcloud compute ssh`.

    If `parse` is given, it is called on the decoded output.
    """
    out = check_output([
        'gcloud', 'compute', 'ssh', node,
        '--', cmd,
    ]).decode('utf8', 'replace').strip()
    if parse:
        out = parse(out)
    return out

pool = ThreadPoolExecutor(len(nodes))

def ssh_all(cmd, parse=None):
    """Run `cmd` on every node concurrently; return {short name: result}."""
    results = {}
    for node in nodes:
        key = node.rsplit('-', 1)[-1]
        results[key] = pool.submit(ssh, node, cmd, parse=parse)
    # wait for results
    for key, future in list(results.items()):
        results[key] = future.result()
    return results

# Collect some data from the nodes

# In[5]:

# count docker images (note: `wc -l` counts include one header line)
images = ssh_all('docker images | wc -l', parse=int)
all_images = ssh_all('docker images -a | wc -l', parse=int)
mounts = ssh_all('mount | wc -l', parse=int)

# In[6]:

def _parse_inode_usage(lines):
    """Pull the IUse% column for the stateful partition out of `df -i` output."""
    for line in lines.splitlines():
        if line.endswith('stateful_partition'):
            return int(line.split()[4].rstrip('%'))

inode_usage = ssh_all('df -i', parse=_parse_inode_usage)

# In[13]:

# count 'no space left' messages in the last day
no_space = ssh_all('sudo journalctl --since=yesterday | grep -i "no space left" | wc -l',
                   parse=int)
no_space

# Now do some plotting

# In[9]:

import datetime

import matplotlib.pyplot as plt
import pandas as pd

now = datetime.datetime.now(tz=datetime.timezone.utc)
plt.ion()

# Turn our data into a nice dataframe.
#
# We can see that `jsv7` is the only node seeing 'no space left on device' messages.

# In[14]:

data = {}
data['node'] = keys = list(images.keys())
data['mounts'] = [mounts[key] for key in keys]
data['tagged images'] = [images[key] for key in keys]
data['all images'] = [all_images[key] for key in keys]
# node age in days
data['age'] = [
    (now - node_dict[key].metadata.creation_timestamp).total_seconds() / (24 * 3600)
    for key in keys
]
data['no space msgs'] = [no_space[key] for key in keys]
data['inode usage'] = [inode_usage[key] for key in keys]

df = pd.DataFrame.from_dict(data)
# reindex by node
df.index = df.pop('node')
df

# Plot our metrics:

# In[17]:

df.plot(kind='bar', subplots=True,
        y=['all images', 'tagged images', 'inode usage', 'mounts']);
# plt.ylim(0, 4000);

# In[18]:

df.age.plot(kind='bar', title="Node age (days)")

# # Correlating metrics
#
# Compute the correlation matrix to see which metric might be the best predictor
# of out-of-space errors.

# In[19]:

import seaborn as sns

corr = df.corr()
corr['no space msgs']

# It looks like the number of mounts has the highest correlation,
# followed by the age of the node.
#
# Interestingly, the total number of docker images is *negatively* correlated,
# suggesting that pure image count isn't the right metric.
#
# Correlation heatmap!

# In[20]:

sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
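
# With this few nodes the correlation is only suggestive, so a quick sanity check
# (a follow-up sketch, not part of the original run) is to plot the top predictor
# directly against the error counts:

# In[ ]:

# scatter the strongest predictor against the error count
df.plot(kind='scatter', x='mounts', y='no space msgs');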
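
# If these are Container-Optimized OS nodes (an assumption here, suggested by the
# `stateful_partition` mount), leaked container `overlay` mounts are one plausible
# source of mount growth. A hedged sketch of how we might count just those — the
# trailing `wc -l` keeps the pipeline's exit status zero even when grep matches
# nothing, so `check_output` won't raise:

# In[ ]:

overlay_mounts = ssh_all('mount | grep overlay | wc -l', parse=int)
overlay_mounts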
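
# We could also drill into the affected node itself. A sketch (assuming `jsv7`,
# the node seeing the errors above, is the one we want) that tallies its mounts
# by filesystem type, reusing the `ssh` helper:

# In[ ]:

def _parse_mount_types(out):
    # `mount` lines look like: <src> on <dst> type <fstype> (<opts>)
    counts = {}
    for line in out.splitlines():
        parts = line.split()
        if 'type' in parts:
            fstype = parts[parts.index('type') + 1]
            counts[fstype] = counts.get(fstype, 0) + 1
    return counts

ssh(node_dict['jsv7'].metadata.name, 'mount', parse=_parse_mount_types)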