#!/usr/bin/env python
# coding: utf-8

# # Combining multiple csv files containing data into a single file

# ## SCADA

# In[ ]:

# import libraries
import itertools

import numpy as np
import pandas as pd

# import data
dfa = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_SCADA.csv',
                  skip_blank_lines=True)
dfa1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_SCADA.csv',
                   skip_blank_lines=True)
dfa2 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_v2.csv',
                   skip_blank_lines=True)
dfa3 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_2017_v2.csv',
                   skip_blank_lines=True)


def f(c):
    """Copy the old rotor speed readings (turbines 1 to 20 only) so they can
    later be merged with the new data; all other rows get NaN."""
    if c['turbine'] <= 20:
        return c['rs_av']


dfa['rs_av_old'] = dfa.apply(f, axis=1)
dfa1['rs_av_old'] = dfa1.apply(f, axis=1)

# delete the original rotor speed columns
dfa = dfa.drop('rs_av', axis=1)
dfa1 = dfa1.drop('rs_av', axis=1)

# concatenate the dataframes
df = pd.concat([dfa, dfa1])    # old SCADA
df2 = pd.concat([dfa2, dfa3])  # new SCADA
del dfa, dfa1, dfa2, dfa3

# convert timestamps to datetime dtype
df2['timestamp'] = pd.to_datetime(df2['timestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)

# filter data so that the latest timestamp is the same for both the old and
# new datasets
df2 = df2[df2.timestamp <= '2017-04-30 23:50:00.000']

# rename columns to match the old dataset
df2.rename(columns={'turbine_id': 'turbine'}, inplace=True)
df2.rename(columns={'rs_av': 'rs_av_new'}, inplace=True)

# sort values and drop duplicates
df2 = df2.sort_values(['timestamp', 'turbine'])
df = df.sort_values(['timestamp', 'turbine'])
df2 = df2.drop_duplicates(['timestamp', 'turbine'], keep='first')
df = df.drop_duplicates(['timestamp', 'turbine'], keep='first')

# fill missing rows in the time series: build every possible
# (timestamp, turbine) combination over the full measurement period
list1 = list(pd.date_range('2014-11-01 00:00:00', '2017-04-30 23:50:00', freq='10min'))
list2 = range(1, 26)
list3 = list(itertools.product(list1, list2))
df3 = pd.DataFrame(list3)
df3.columns = ['timestamp', 'turbine']

df2['ap_max'] = df2['ap_max'].astype(np.float64)

# outer merge with the full grid adds the missing rows as NaNs
dfx = pd.merge(df, df3, on=['timestamp', 'turbine'], how='outer')
del df, df3

# merge old and new data on their common columns
dfx1 = pd.merge(dfx, df2,
                on=['timestamp', 'turbine', 'ws_av', 'wd_av', 'ws_1', 'ws_2', 'wd_1', 'wd_2',
                    'gen_sp', 'pitch', 'reactive_power', 'ap_max', 'ap_dev', 'ap_av', 'nac_pos'],
                how='outer')
del dfx, df2

# sort and drop duplicates again
dfx1 = dfx1.sort_values(['timestamp', 'turbine'])
dfx1 = dfx1.drop_duplicates(['timestamp', 'turbine'], keep='first')


def f2(c):
    """Merge the old and new rotor speed readings into a single column,
    preferring the new reading where one exists."""
    if c['rs_av_new'] >= 0:
        return c['rs_av_new']
    else:
        return c['rs_av_old']


dfx1['rs_av'] = dfx1.apply(f2, axis=1)

# drop the old columns and reset the index
dfx1 = dfx1.drop('rs_av_old', axis=1)
dfx1 = dfx1.drop('rs_av_new', axis=1)
dfx1.reset_index(drop=True, inplace=True)

# write to a new csv file
dfx1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv', index=False)
del dfx1
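# The gap-filling step above works by outer-merging the SCADA data with a full
# grid of every (timestamp, turbine) combination, so that missing ten-minute
# intervals appear as rows of NaNs. Below is a minimal sketch of the same idea
# on a small scale; the toy dataframe and its values are made up for
# illustration and are not part of the original pipeline.

# In[ ]:

import itertools

import pandas as pd

# a toy SCADA extract where turbine 2 is missing its 00:10 reading
toy = pd.DataFrame({'timestamp': pd.to_datetime(['2014-11-01 00:00:00',
                                                 '2014-11-01 00:10:00',
                                                 '2014-11-01 00:00:00']),
                    'turbine': [1, 1, 2],
                    'ws_av': [7.1, 7.4, 6.9]})

# full grid of every (timestamp, turbine) combination in the toy period
grid = pd.DataFrame(list(itertools.product(pd.date_range('2014-11-01 00:00:00',
                                                         '2014-11-01 00:10:00',
                                                         freq='10min'),
                                           [1, 2])),
                    columns=['timestamp', 'turbine'])

# the outer merge keeps all grid rows; the missing reading appears as NaN
filled = pd.merge(toy, grid, on=['timestamp', 'turbine'], how='outer')
filled.sort_values(['timestamp', 'turbine'])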
# ## Merging SCADA and downtime

# In[ ]:

import pandas as pd

# import downtime data
dfb = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv',
                  skip_blank_lines=True)
dfb1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv',
                   skip_blank_lines=True)

# concatenate
df1 = pd.concat([dfb, dfb1])
del dfb, dfb1

# convert dtype object to datetime and round to the nearest 10 minutes
df1['timestamp_start'] = pd.to_datetime(df1['timestamp_start'])
df1['timestamp_end'] = pd.to_datetime(df1['timestamp_end'])
df1['timestamp_start'] = df1['timestamp_start'].dt.round('10min')
df1['timestamp_end'] = df1['timestamp_end'].dt.round('10min')

# length of each downtime event
df1['period'] = df1['timestamp_end'] - df1['timestamp_start']

# expand each downtime event into one row for every ten minutes between its
# start and end timestamps
df1 = pd.concat([pd.DataFrame({'timestamp': pd.date_range(row.timestamp_start, row.timestamp_end,
                                                          freq='10min'),
                               'turbine_id': row.turbine_id,
                               'period': row.period,
                               'TurbineCategory_id': row.TurbineCategory_id,
                               'EnvironmentalCategory_id': row.EnvironmentalCategory_id,
                               'InfrastructureCategory_id': row.InfrastructureCategory_id,
                               'GridCategory_id': row.GridCategory_id,
                               'AvailabilityCategory_id': row.AvailabilityCategory_id,
                               'alarm_id': row.alarm_id,
                               'workorder_id': row.workorder_id,
                               'comment': row.comment},
                              columns=['timestamp', 'turbine_id', 'period', 'TurbineCategory_id',
                                       'EnvironmentalCategory_id', 'InfrastructureCategory_id',
                                       'GridCategory_id', 'AvailabilityCategory_id', 'alarm_id',
                                       'workorder_id', 'comment'])
                 for i, row in df1.iterrows()], ignore_index=True)

# sort, then drop duplicates sharing the same timestamp and turbine
df1 = df1.sort_values(['timestamp', 'turbine_id', 'period'])
df1 = df1.drop_duplicates(['timestamp', 'turbine_id'], keep='first')

# import SCADA
df = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv', skip_blank_lines=True)

# drop unnecessary columns
df = df.drop(['ws_1', 'ws_2', 'wd_1', 'wd_2'], axis=1)

# convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)

# copy the turbine id to a new column matching the downtime data
df['turbine_id'] = df['turbine']

# merge SCADA and downtime
df2 = pd.merge(df, df1, how='outer')
del df, df1

# drop downtime entries with no SCADA readings - in case of duplicates
df2 = df2.drop(df2[(df2['turbine_id'].notnull()) & (df2['turbine'].isnull())].index)
df2 = df2.drop('turbine', axis=1)  # drop the old turbine id column

# write the final dataframe to csv
df2.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv', index=False)

# ## Downtime

# In[ ]:

import pandas as pd

dfb = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv',
                  skip_blank_lines=True)
dfb1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv',
                   skip_blank_lines=True)

# drop the duplicated (".1") columns in the prior two years file
dfb1 = dfb1.drop(['AvailabilityCategory_id.1', 'EnvironmentalCategory_id.1', 'GridCategory_id.1',
                  'InfrastructureCategory_id.1', 'TurbineCategory_id.1', 'alarm_id.1', 'comment.1',
                  'id.1', 'workorder_id.1', 'turbine_id.1', 'timestamp_end.1', 'timestamp_start.1'],
                 axis=1)

# convert timestamps to datetime dtype
dfb['timestamp_start'] = pd.to_datetime(dfb['timestamp_start'])
dfb['timestamp_end'] = pd.to_datetime(dfb['timestamp_end'])
dfb1['timestamp_start'] = pd.to_datetime(dfb1['timestamp_start'])
dfb1['timestamp_end'] = pd.to_datetime(dfb1['timestamp_end'])

# concatenate, sort and write to csv
df1 = pd.concat([dfb, dfb1])
del dfb, dfb1
df1 = df1.sort_values(['timestamp_start', 'timestamp_end', 'turbine_id'])
df1.reset_index(drop=True, inplace=True)
df1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/downtime.csv', index=False)

# In[ ]:

import pandas as pd

df = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv',
                 encoding='ISO-8859-1')

# In[ ]:

# dimensions of the final merged dataframe
df.shape
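# As a side note, the downtime expansion in the merging section above can be
# hard to read as a one-liner. Below is a minimal sketch of the same idea on a
# single made-up downtime event; the timestamps, turbine id and category value
# are hypothetical and only two of the real columns are carried along.

# In[ ]:

import pandas as pd

# one hypothetical downtime event, already rounded to 10 minutes
event = pd.Series({'timestamp_start': pd.Timestamp('2015-06-01 08:00:00'),
                   'timestamp_end': pd.Timestamp('2015-06-01 08:30:00'),
                   'turbine_id': 3,
                   'TurbineCategory_id': 4})

# one row per 10-minute timestamp between start and end (inclusive);
# the scalar columns are broadcast across all rows
expanded = pd.DataFrame({'timestamp': pd.date_range(event.timestamp_start,
                                                    event.timestamp_end,
                                                    freq='10min'),
                         'turbine_id': event.turbine_id,
                         'TurbineCategory_id': event.TurbineCategory_id})
expanded  # 4 rows: 08:00, 08:10, 08:20, 08:30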