import pandas as pd #import libraries
import numpy as np
import itertools
#import data
dfa=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_SCADA.csv',skip_blank_lines=True)
dfa1=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_SCADA.csv',skip_blank_lines=True)
dfa2=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_v2.csv',skip_blank_lines=True)
dfa3=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_2017_v2.csv',skip_blank_lines=True)
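#optional sanity check (a sketch, not part of the original pipeline): print the
#shape of each import to catch path or parsing problems early
for name,frame in [('dfa',dfa),('dfa1',dfa1),('dfa2',dfa2),('dfa3',dfa3)]:
    print(name,frame.shape)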
def f(c): #keep old rotor speed readings - only turbines up to 20 have them
    if c['turbine']<=20:
        return c['rs_av']
    return None #other turbines get NaN
dfa['rs_av_old']=dfa.apply(f,axis=1)
dfa1['rs_av_old']=dfa1.apply(f,axis=1)
dfa=dfa.drop('rs_av',axis=1) #delete original columns
dfa1=dfa1.drop('rs_av',axis=1)
#concatenate the paired files into single dataframes
df=pd.concat([dfa,dfa1]) #old SCADA
df2=pd.concat([dfa2,dfa3]) #new SCADA
del dfa,dfa1,dfa2,dfa3
df2['timestamp']=pd.to_datetime(df2['timestamp']) #convert timestamp to datetime dtype
df['timestamp']=pd.to_datetime(df['timestamp'],dayfirst=True) #old SCADA timestamps are day-first
#filter data so that the latest timestamp is the same for both old and new datasets
df2=df2[df2.timestamp<='2017-04-30 23:50:00.000']
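#optional check (sketch): confirm the cut-off was applied as intended
print(df2['timestamp'].max()) #expect 2017-04-30 23:50:00 at the latest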
df2.rename(columns={'turbine_id':'turbine','rs_av':'rs_av_new'},inplace=True) #rename columns to match old SCADA
df2=df2.sort_values(['timestamp','turbine']) #sort values and drop duplicates
df=df.sort_values(['timestamp','turbine'])
df2=df2.drop_duplicates(['timestamp','turbine'],keep='first')
df=df.drop_duplicates(['timestamp','turbine'],keep='first')
list1=list(pd.date_range('2014-11-01 00:00:00','2017-04-30 23:50:00',freq='10min')) #full 10-minute index to fill missing rows in the time series
list2=range(1,26) #turbine numbers 1 to 25
list3=list(itertools.product(list1,list2))
df3=pd.DataFrame(list3)
df3.columns=['timestamp','turbine']
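#sanity check (a sketch): the grid should hold one row per turbine per
#10-minute interval, i.e. len(list1)*len(list2) rows in total
assert len(df3)==len(list1)*len(list2)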
df2['ap_max']=df2['ap_max'].astype(np.float64) #cast to float so dtypes match for the merge on ap_max below
dfx=pd.merge(df,df3,on=['timestamp','turbine'],how='outer')
del df,df3
#merge old and new data by these columns
dfx1=pd.merge(dfx,df2,on=['timestamp','turbine','ws_av','wd_av','ws_1','ws_2','wd_1','wd_2','gen_sp','pitch',
                          'reactive_power','ap_max','ap_dev','ap_av','nac_pos'],how='outer')
del dfx,df2
dfx1=dfx1.sort_values(['timestamp','turbine']) #sort and drop duplicates again
dfx1=dfx1.drop_duplicates(['timestamp','turbine'],keep='first')
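#optional coverage check (sketch): after de-duplication the merged frame should
#contain at least one row for every timestamp-turbine pair in the full grid
print(dfx1.shape[0],'rows vs',len(list1)*len(list2),'grid rows')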
def f2(c): #merge rotor speed readings - prefer the new reading, fall back to the old one
    if c['rs_av_new']>=0:
        return c['rs_av_new']
    else:
        return c['rs_av_old']
dfx1['rs_av']=dfx1.apply(f2,axis=1)
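#the same merge can be done without a row-wise apply (a sketch of a vectorised
#alternative, not used here): keep rs_av_new where it is a valid reading (>=0)
#and fall back to rs_av_old otherwise
#dfx1['rs_av']=dfx1['rs_av_new'].where(dfx1['rs_av_new']>=0,dfx1['rs_av_old'])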
dfx1=dfx1.drop(['rs_av_old','rs_av_new'],axis=1) #drop the interim columns and reset index
dfx1.reset_index(drop=True,inplace=True)
dfx1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv',index=False) #write to new csv file
del dfx1
import pandas as pd #import libraries
#import downtime data
dfb=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv',skip_blank_lines=True)
dfb1=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv',skip_blank_lines=True)
df1=pd.concat([dfb,dfb1]) #concatenate
del dfb,dfb1
df1['timestamp_start']=pd.to_datetime(df1['timestamp_start']) #convert dtype object to datetime
df1['timestamp_end']=pd.to_datetime(df1['timestamp_end'])
df1['timestamp_start']=df1['timestamp_start'].dt.round('10min') #round to nearest 10 min
df1['timestamp_end']=df1['timestamp_end'].dt.round('10min')
df1['period']=df1['timestamp_end']-df1['timestamp_start'] #period
#expand each downtime event into one row per ten-minute interval between its start and end timestamps
df1=pd.concat([pd.DataFrame({'timestamp':pd.date_range(row.timestamp_start,row.timestamp_end,freq='10min'),
                             'turbine_id':row.turbine_id,'period':row.period,
                             'TurbineCategory_id':row.TurbineCategory_id,
                             'EnvironmentalCategory_id':row.EnvironmentalCategory_id,
                             'InfrastructureCategory_id':row.InfrastructureCategory_id,
                             'GridCategory_id':row.GridCategory_id,
                             'AvailabilityCategory_id':row.AvailabilityCategory_id,
                             'alarm_id':row.alarm_id,'workorder_id':row.workorder_id,
                             'comment':row.comment},
                            columns=['timestamp','turbine_id','period','TurbineCategory_id',
                                     'EnvironmentalCategory_id','InfrastructureCategory_id',
                                     'GridCategory_id','AvailabilityCategory_id','alarm_id',
                                     'workorder_id','comment'])
               for i,row in df1.iterrows()],ignore_index=True)
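#on pandas >= 0.25 the same expansion can be written without iterrows
#(a sketch, assuming the same column names; it keeps the timestamp_start and
#timestamp_end columns, which would need dropping afterwards):
#df1=(df1.assign(timestamp=[pd.date_range(s,e,freq='10min')
#                           for s,e in zip(df1.timestamp_start,df1.timestamp_end)])
#     .explode('timestamp'))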
df1=df1.sort_values(['timestamp','turbine_id','period']) #sort and drop duplicates for same timestamp and turbine
df1=df1.drop_duplicates(['timestamp','turbine_id'],keep='first')
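#sanity check (optional): no timestamp-turbine pair should appear twice now
assert not df1.duplicated(['timestamp','turbine_id']).any()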
df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv',skip_blank_lines=True) #import SCADA
df=df.drop(['ws_1','ws_2','wd_1','wd_2'],axis=1) #drop unnecessary columns
df['timestamp']=pd.to_datetime(df['timestamp'],dayfirst=True) #convert timestamp to datetime
df['turbine_id']=df['turbine'] #copy turbine number to a new column so it matches the downtime data's turbine_id
df2=pd.merge(df,df1,how='outer') #merge SCADA and downtime
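#optional check (sketch): an outer merge keeps every SCADA row and every
#downtime row, so the result should be at least as long as the SCADA frame
print(df2.shape[0],'>=',df.shape[0])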
del df,df1
#drop downtime entries that have no matching SCADA readings
df2=df2.drop(df2[(df2['turbine_id'].notnull())&(df2['turbine'].isnull())].index)
df2=df2.drop('turbine',axis=1) #drop old turbine id column
#write final dataframe to csv
df2.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv',index=False)
#re-import the raw downtime data to build a cleaned standalone downtime file
dfb=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv',skip_blank_lines=True)
dfb1=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv',skip_blank_lines=True)
#drop duplicated columns (suffixed '.1') present in the prior-two-years file
dfb1=dfb1.drop(['AvailabilityCategory_id.1','EnvironmentalCategory_id.1','GridCategory_id.1',
                'InfrastructureCategory_id.1','TurbineCategory_id.1','alarm_id.1','comment.1',
                'id.1','workorder_id.1','turbine_id.1','timestamp_end.1','timestamp_start.1'],axis=1)
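#sanity check (a sketch, assuming the base columns of the two files match):
#the two downtime files should now share the same column set
print(set(dfb.columns)^set(dfb1.columns)) #expect an empty set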
dfb['timestamp_start']=pd.to_datetime(dfb['timestamp_start']) #convert timestamps to datetime
dfb['timestamp_end']=pd.to_datetime(dfb['timestamp_end'])
dfb1['timestamp_start']=pd.to_datetime(dfb1['timestamp_start'])
dfb1['timestamp_end']=pd.to_datetime(dfb1['timestamp_end'])
df1=pd.concat([dfb,dfb1])
del dfb,dfb1
df1=df1.sort_values(['timestamp_start','timestamp_end','turbine_id'])
df1.reset_index(drop=True,inplace=True)
df1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/downtime.csv',index=False)
import pandas as pd #reload the merged dataset to verify it
df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv',encoding="ISO-8859-1")
print(df.shape) #check the dimensions of the final dataset
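#the row count should be roughly the number of 10-minute intervals between
#2014-11-01 and 2017-04-30 times 25 turbines, since downtime-only rows were
#dropped above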