#!/usr/bin/env python
# coding: utf-8

# # Introduction to the MIT Supercloud Dataset
# 
# This notebook is an introduction to working with the MIT Supercloud Dataset. It introduces the types of data collected and ways to load, process, and plot the data.
# 
# Details of the dataset can be found in [The MIT Supercloud Dataset](https://arxiv.org/abs/2108.02037).

# In[1]:


import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# # Functions

# In[2]:


def plot_time_series(df=None, columns=None, downsample=1, samples_per_second=1, title=None):
    """
    Plot CPU or GPU time series data.

    Inputs:
        df: time series pandas dataframe
        columns: columns from the time series to plot
        downsample: step size; every downsample-th sample is plotted
        samples_per_second: number of samples collected per second
        title: string for the plot title
    """

    # time index. CPU time series are sampled every 10 seconds, GPU every tenth of a second
    t = np.linspace(0, df.shape[0]*(samples_per_second**-1), df.shape[0])[::downsample]

    # one color per plotted column
    cm = plt.get_cmap('tab10')
    num_colors = len(columns)
    colors = [cm(1.*i/num_colors) for i in range(num_colors)]

    # figure
    fig, axs = plt.subplots(3, 3, figsize=(16, 16))
    plt.suptitle(title, fontsize=14)

    # loop over columns to plot
    for ax, column, color in zip(axs.ravel(), columns, colors):
        plot_data = df[column].values[::downsample]
        ax.plot(t, plot_data, color=color)
        ax.tick_params(axis='x', rotation=-45)
        ax.set_xlabel('Time (s)')
        ax.set_ylabel(column)
        ax.grid()

    plt.show()
    plt.close()


# # Paths

# In[3]:


# This path points to the root directory where the data was extracted
ROOT_PATH = 'PATH/TO/DATASET/LOCATION'

# The paths below point to specific files or directories
SCHEDULER_LOG_PATH = os.path.join(ROOT_PATH, 'scheduler-log.csv')  # slurm log csv
NODE_DATA_PATH = os.path.join(ROOT_PATH, 'node-data.csv')          # node data csv
CPU_DATA_PATH = os.path.join(ROOT_PATH, 'cpu')                     # cpu time series directory
GPU_DATA_PATH = os.path.join(ROOT_PATH, 'gpu')                     # gpu time series directory


# # Slurm Log

# In[4]:


# slurm log dataframe
scheduler_log_df = pd.read_csv(SCHEDULER_LOG_PATH)


# In[5]:


# columns in slurm log dataframe
print('Columns for Scheduler log dataframe:\n')
print("\n".join([str(i) for i in scheduler_log_df.columns]))


# In[6]:


# job IDs in the slurm log
scheduler_log_job_ids = scheduler_log_df.id_job.unique()

# indices of gpu jobs: requested GPU resources appear as TRES IDs 1001 and 1002 in tres_req
gpu_idx = scheduler_log_df.tres_req.apply(lambda x: '1001' in str(x) or '1002' in str(x))
scheduler_log_job_ids_gpu = np.unique(scheduler_log_df[gpu_idx].id_job.values)

print('There are {} jobs in the scheduler log, of which {} requested GPUs.'.format(scheduler_log_job_ids.shape[0],
                                                                                   scheduler_log_job_ids_gpu.shape[0]))


# # Node Data
# Explore the data collected from each compute node on the system.

# In[7]:


# node data dataframe
node_data_df = pd.read_csv(NODE_DATA_PATH)


# In[8]:


# columns in node data dataframe
print('Columns for Node data dataframe:\n')
print("\n".join([str(i) for i in node_data_df.columns]))


# In[9]:


node_data_df.head()
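# # Example: job wait and run times from the Slurm log
# 
# A minimal sketch of deriving per-job queue wait and runtime from the scheduler log. It
# assumes the log carries Slurm's standard accounting fields `time_submit`, `time_start`,
# and `time_end` as Unix timestamps; check the column listing printed above before
# relying on these names.

# In[ ]:


# assumed Slurm accounting columns; verify against scheduler_log_df.columns
timing_cols = ['time_submit', 'time_start', 'time_end']

# drop jobs with missing or zero timestamps before computing intervals
ts = scheduler_log_df[timing_cols].dropna()
ts = ts[(ts > 0).all(axis=1)]

wait_s = ts.time_start - ts.time_submit  # seconds spent queued
run_s = ts.time_end - ts.time_start      # seconds spent running

print('Median queue wait: {:.0f} s'.format(wait_s.median()))
print('Median runtime:    {:.0f} s'.format(run_s.median()))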
# # CPU Data (for a single job)
# Explore the CPU time series data.

# In[10]:


# Load CPU data for a specific job
cpu_job_id = scheduler_log_job_ids[0]

# Find the summary and time series csv files for the specific job.
# Note that this matches the job ID as a substring of the file name.
for root, _, files in os.walk(CPU_DATA_PATH, topdown=False):
    for csv_file in files:
        if str(cpu_job_id) in csv_file:
            if 'summary' in csv_file:
                cpu_summary_path = os.path.join(root, csv_file)
            else:
                cpu_timeseries_path = os.path.join(root, csv_file)

# read csv into dataframe
cpu_summary_df = pd.read_csv(cpu_summary_path)
cpu_df = pd.read_csv(cpu_timeseries_path)

# columns for summary dataframe
print('###################################')
print('# Columns for CPU summary dataframe')
print('###################################\n')
print("\n".join([str(i) for i in cpu_summary_df.columns]))

# columns for timeseries dataframe
print('\n######################################')
print('# Columns for CPU timeseries dataframe')
print('######################################\n')
print("\n".join([str(i) for i in cpu_df.columns]))

print('\nThe CPU time series for job_id {} has {} samples.'.format(cpu_job_id, cpu_df.shape[0]))


# ## Plot CPU time series

# In[11]:


# Be sure to only select Step values equal to 'batch'
cpu_summary_df = cpu_summary_df[cpu_summary_df.Step=='batch']
cpu_df = cpu_df[cpu_df.Step=='batch']


# In[12]:


cpu_plot_columns = [
    'CPUFrequency',
    'CPUTime',
    'CPUUtilization',
    'RSS',
    'VMSize',
    'Pages',
    'ReadMB',
    'WriteMB'
]

# downsample step size for plots
cpu_downsample = 10

# CPU time series are sampled once every 10 seconds, i.e. 0.1 samples per second
plot_time_series(df=cpu_df,
                 columns=cpu_plot_columns,
                 downsample=cpu_downsample,
                 samples_per_second=0.1,
                 title='CPU Time Series Plots for Job ID {}'.format(cpu_job_id))


# # GPU Data (for a single job)
# Explore the GPU time series data.

# In[13]:


# Load GPU data for a specific job
gpu_job_id = scheduler_log_job_ids_gpu[0]

# Find the time series csv file for the specific job
for root, _, files in os.walk(GPU_DATA_PATH, topdown=False):
    for csv_file in files:
        if str(gpu_job_id) in csv_file:
            gpu_timeseries_path = os.path.join(root, csv_file)

# read csv into dataframe
gpu_df = pd.read_csv(gpu_timeseries_path)

# columns for timeseries dataframe
print('\n######################################')
print('# Columns for GPU timeseries dataframe')
print('######################################\n')
print("\n".join([str(i) for i in gpu_df.columns]))

print('\nThe GPU time series for job_id {} has {} samples.'.format(gpu_job_id, gpu_df.shape[0]))


# ## Extract GPU time series by GPU index

# In[14]:


# Since a single job can request multiple GPUs, we need to separate the GPU dataframe
# loaded above by gpu_index. Only a single GPU may have been requested, but for
# generality we create a dictionary here with each GPU index as the key and the
# corresponding GPU's time series as the value.

# number of GPUs in the time series for the given job
print('There were {} GPU(s) requested for the current job'.format(gpu_df.gpu_index.unique().shape[0]))

# separate GPU time series
gpu_df_dict = dict()
for gpu_index in gpu_df.gpu_index.unique():
    gpu_df_dict[gpu_index] = gpu_df[gpu_df.gpu_index==gpu_index]


# ## Plot GPU time series

# In[15]:


gpu_plot_columns = [
    'utilization_gpu_pct',
    'utilization_memory_pct',
    'memory_free_MiB',
    'memory_used_MiB',
    'temperature_gpu',
    'temperature_memory',
    'power_draw_W',
]

# downsample step size for plots
gpu_downsample = 100

# plot the first GPU index present for this job
gpu_index = sorted(gpu_df_dict.keys())[0]

# GPU time series are sampled every tenth of a second, i.e. 10 samples per second
plot_time_series(df=gpu_df_dict[gpu_index],
                 columns=gpu_plot_columns,
                 downsample=gpu_downsample,
                 samples_per_second=10,
                 title='GPU Time Series Plots for Job ID {}, GPU index {}'.format(gpu_job_id, gpu_index))
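# ## Summarize utilization by GPU (example)
# 
# A minimal sketch building on `gpu_df_dict`: mean and peak utilization plus mean power
# draw per GPU index, using the same columns plotted above.

# In[ ]:


for idx, df_gpu in gpu_df_dict.items():
    print('GPU {}: mean util {:.1f}%, max util {:.1f}%, mean power {:.1f} W'.format(
        idx,
        df_gpu.utilization_gpu_pct.mean(),
        df_gpu.utilization_gpu_pct.max(),
        df_gpu.power_draw_W.mean()))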
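# ## Smoothing instead of downsampling (example)
# 
# Plain striding (`[::downsample]`) can alias fast-changing signals such as GPU
# utilization. A hedged alternative is to apply a pandas rolling mean before plotting;
# the 100-sample window (about 10 s at 10 samples per second) is an arbitrary choice.

# In[ ]:


window = 100  # samples; roughly 10 s of GPU data at 10 samples per second

gpu_util_smooth = (gpu_df_dict[gpu_index]['utilization_gpu_pct']
                   .rolling(window=window, min_periods=1)
                   .mean())

# time axis in seconds, assuming 10 samples per second
t = np.arange(gpu_util_smooth.shape[0]) / 10.0

plt.figure(figsize=(8, 4))
plt.plot(t, gpu_util_smooth.values)
plt.xlabel('Time (s)')
plt.ylabel('utilization_gpu_pct (rolling mean)')
plt.title('Smoothed GPU utilization for Job ID {}'.format(gpu_job_id))
plt.grid()
plt.show()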