#!/usr/bin/env python
# coding: utf-8

# # Process SCADA and downtime data
#
# This script (exported from a notebook) merges four CSV files containing
# SCADA and downtime data into single dataframes. Two files are older
# datasets, and the other two are newer datasets. Both old and new datasets
# have most of their timestamps in common.
#
# The older SCADA datasets were found to have some errors in the rotor speed
# readings. This merging replaces the old errored data points with the new
# ones, and removes incomplete rows.
#
# This merging ensures the downtime data has the same range as the SCADA
# data, and removes incomplete rows.

# import libraries
import os
import glob
import itertools
import pandas as pd
import numpy as np

# create directory to store processed data
os.makedirs("data/processed/", exist_ok=True)

# ## Downtime categories

# read the downtime category definitions spreadsheet
data = pd.read_excel("data/Melogale Downtime Categories.xlsx")

# drop first row (spreadsheet layout artefact, not real data)
data = data.drop([0])

# accumulator for the per-type category frames built below
catData = {}


def categorise_data(cat, number):
    """Extract one category type from the raw spreadsheet frame.

    Pulls the two columns belonging to *cat* (the named "<cat> Categories"
    column and its unnamed neighbour "Unnamed: <number>"), renames them to
    the common "Category"/"Name" schema, tags each row with the type, and
    drops incomplete rows. The result is stored in ``catData[cat]``.

    Parameters
    ----------
    cat : str
        Category type, e.g. "Turbine" or "Grid".
    number : int
        Index of the unnamed companion column in the spreadsheet.
    """
    catData[cat] = data.filter(
        items=[cat + " Categories", "Unnamed: " + str(number)]
    )
    catData[cat].rename(
        columns={
            cat + " Categories": "Category",
            "Unnamed: " + str(number): "Name",
        },
        inplace=True,
    )
    catData[cat]["Type"] = cat
    catData[cat].dropna(inplace=True)


# filter each category type out of the spreadsheet
categorise_data("Turbine", 1)
categorise_data("Environmental", 3)
categorise_data("Grid", 5)
categorise_data("Infrastructure", 7)
categorise_data("Availability", 9)

# concatenate all category types into one frame
data = pd.concat(catData.values(), ignore_index=True)

# ## Downtime time series

# read every downtime CSV; key each frame by its enumeration index
dt = {}
dtList = glob.glob("data/*downtime*.csv")
for num, df in enumerate(dtList):
    dt[num] = pd.read_csv(df)
    print(num, df, dt[num].shape)

# drop duplicate columns in the second file. pandas suffixes duplicated
# column names with ".1"; the original pattern was ".1", where the
# unescaped "." matches ANY character (so e.g. "ws_1" would also be
# dropped). Escape the dot and anchor at the end so only true ".1"
# duplicate suffixes match.
dt[1].drop(columns=list(dt[1].filter(regex=r"\.1$")), inplace=True)

# convert timestamp columns to datetime dtype in every frame
for key in dt.keys():
    for col in list(dt[key].filter(regex="timestamp")):
        dt[key][col] = pd.to_datetime(dt[key][col])

# concatenate downtime data, keeping the union of columns
data = pd.concat(dt.values(), join="outer")

# ## SCADA time series

# old SCADA data
scada = {}
scadaList = glob.glob("data/*SCADA.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)

# rename ap_min in scada[1] to ap_max (the column was mislabelled in
# that file; its values correspond to ap_max in the other datasets)
scada[1].rename(columns={"ap_min": "ap_max"}, inplace=True)


def fix_rs(c):
    """Return the rotor speed reading for row *c*, or None if suspect.

    Rows for turbines numbered above 20 fall through and return None,
    which becomes NaN in the resulting column — presumably those turbines
    carried the erroneous rotor speed readings mentioned in the header
    (TODO: confirm against the data source).
    """
    if c["turbine"] <= 20:
        return c["rs_av"]


# move the (partially invalidated) old rotor speed readings into a
# separate rs_av_old column and drop the original rs_av column
for df in scada.keys():
    scada[df]["rs_av_old"] = scada[df].apply(fix_rs, axis=1)
    scada[df] = scada[df].drop("rs_av", axis=1)

# concatenate old datasets
scadaOld = pd.concat(scada.values())

# new SCADA data
scada = {}
scadaList = glob.glob("data/NS_SCADA*.csv")
for num, df in enumerate(scadaList):
    scada[num] = pd.read_csv(df)
    print(num, df, scada[num].shape)

# concatenate new datasets
scadaNew = pd.concat(scada.values())