#!/usr/bin/env python
# coding: utf-8

# # Process SCADA and downtime data
#
# This script (exported from a notebook) merges four CSV files containing
# SCADA and downtime data into single dataframes. Two files are older
# datasets, and the other two are newer datasets. Both old and new datasets
# have most of their timestamps in common.
#
# The older SCADA datasets were found to have some errors in the rotor speed
# readings. This merging replaces the old errored data points with the new
# ones, and removes incomplete rows.
#
# This merging ensures the downtime data has the same range as the SCADA
# data, and removes incomplete rows.

# import libraries
import os
import glob
import itertools
import pandas as pd
import numpy as np

# create directory to store processed data
os.makedirs("data/processed/", exist_ok=True)

# ## Downtime categories

# read the downtime category definitions spreadsheet
data = pd.read_excel("data/Melogale Downtime Categories.xlsx")

# drop first row (spreadsheet layout artefact, not real data)
data = data.drop([0])

# accumulator for the per-type category frames built below
catData = {}


def categorise_data(cat, number):
    """Extract one category type from the raw spreadsheet frame.

    Pulls the two columns belonging to *cat* (the named "<cat> Categories"
    column and its unnamed neighbour "Unnamed: <number>"), renames them to
    the common "Category"/"Name" schema, tags each row with the type, and
    drops incomplete rows. The result is stored in ``catData[cat]``.

    Parameters
    ----------
    cat : str
        Category type, e.g. "Turbine" or "Grid".
    number : int
        Index of the unnamed companion column in the spreadsheet.
    """
    catData[cat] = data.filter(
        items=[cat + " Categories", "Unnamed: " + str(number)]
    )
    catData[cat].rename(
        columns={
            cat + " Categories": "Category",
            "Unnamed: " + str(number): "Name",
        },
        inplace=True,
    )
    catData[cat]["Type"] = cat
    catData[cat].dropna(inplace=True)


# filter each category type out of the spreadsheet
categorise_data("Turbine", 1)
categorise_data("Environmental", 3)
categorise_data("Grid", 5)
categorise_data("Infrastructure", 7)
categorise_data("Availability", 9)

# concatenate all category types into one frame
data = pd.concat(catData.values(), ignore_index=True)

# ## Downtime time series

# read every downtime CSV; key each frame by its enumeration index
dt = {}
dtList = glob.glob("data/*downtime*.csv")
for num, df in enumerate(dtList):
    dt[num] = pd.read_csv(df)
    print(num, df, dt[num].shape)

# drop duplicate columns in the second file. pandas suffixes duplicated
# column names with ".1"; the original pattern was ".1", where the
# unescaped "." matches ANY character (so e.g. "ws_1" would also be
# dropped). Escape the dot and anchor at the end so only true ".1"
# duplicate suffixes match.
dt[1].drop(columns=list(dt[1].filter(regex=r"\.1$")), inplace=True)

# convert timestamp columns to datetime dtype in every frame
for key in dt.keys():
    for col in list(dt[key].filter(regex="timestamp")):
        dt[key][col] = pd.to_datetime(dt[key][col])

# concatenate downtime data, keeping the union of columns
data = pd.concat(dt.values(), join="outer")

# ## SCADA time series

# old SCADA data
scada = {}
scadaList = glob.glob("data/*SCADA.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)

# rename ap_min in scada[1] to ap_max (the column was mislabelled in
# that file; its values correspond to ap_max in the other datasets)
scada[1].rename(columns={"ap_min": "ap_max"}, inplace=True)


def fix_rs(c):
    """Return the rotor speed reading for row *c*, or None if suspect.

    Rows for turbines numbered above 20 fall through and return None,
    which becomes NaN in the resulting column — presumably those turbines
    carried the erroneous rotor speed readings mentioned in the header
    (TODO: confirm against the data source).
    """
    if c["turbine"] <= 20:
        return c["rs_av"]


# move the (partially invalidated) old rotor speed readings into a
# separate rs_av_old column and drop the original rs_av column
for df in scada.keys():
    scada[df]["rs_av_old"] = scada[df].apply(fix_rs, axis=1)
    scada[df] = scada[df].drop("rs_av", axis=1)

# concatenate old datasets
scadaOld = pd.concat(scada.values())

# new SCADA data
scada = {}
scadaList = glob.glob("data/NS_SCADA*.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)

# concatenate new datasets
scadaNew = pd.concat(scada.values())