#!/usr/bin/env python
# coding: utf-8

# ## Using MDH Command Line to Get Data

# First, we'll install the Machine Data Hub package
# - type `pip install mdh` into your terminal
# - type `mdh list` to view the datasets and their IDs
# - type `mdh download 9 1` to download the first file of the 9th dataset

# In[1]:

import pandas as pd
import numpy as np

# In[1]:

with open('CMAPSSData/readme.txt', 'r', errors='ignore') as f:
    file_contents = f.read()
print(file_contents)

# ## Training Data

# In[1]:

train = pd.read_csv('CMAPSSData/train_FD001.txt', sep=" ", header=None)
train.columns = ['unit', 'cycle', 'op1', 'op2', 'op3',
                 'sm1', 'sm2', 'sm3', 'sm4', 'sm5', 'sm6', 'sm7', 'sm8', 'sm9', 'sm10',
                 'sm11', 'sm12', 'sm13', 'sm14', 'sm15', 'sm16', 'sm17', 'sm18', 'sm19',
                 'sm20', 'sm21', 'sm22', 'sm23']
train = train.iloc[:, :26]  # drop the two empty trailing columns created by trailing spaces
units = train['unit']

# ## Remaining useful life for each unit

# In[2]:

RUL = pd.read_csv('CMAPSSData/RUL_FD001.txt', header=None)
RUL.columns = ['RUL']

# ## Test Data

# In[49]:

test = pd.read_csv('CMAPSSData/test_FD001.txt', sep=" ", header=None)
cols = pd.DataFrame(test.columns)
test = test.iloc[:, :26]  # keep the 26 real columns
test.columns = ['unit', 'cycle', 'op1', 'op2', 'op3',
                'sm1', 'sm2', 'sm3', 'sm4', 'sm5', 'sm6', 'sm7', 'sm8', 'sm9', 'sm10',
                'sm11', 'sm12', 'sm13', 'sm14', 'sm15', 'sm16', 'sm17', 'sm18', 'sm19',
                'sm20', 'sm21']

# # Data Processing

# In[50]:

max_life = max(train.groupby("unit")["cycle"].max())
min_life = min(train.groupby("unit")["cycle"].max())
mean_life = np.mean(train.groupby("unit")["cycle"].max())
max_life, min_life, mean_life

# ## Setting up X and y in training and testing data

# #### Setting up Remaining Useful Life variable (y)

# In[51]:

# reversing the cycle column within each unit so that it counts down to failure;
# this countdown is used as y (the remaining-useful-life target) for training
grp = train["cycle"].groupby(train["unit"])
rul_lst = [j for i in train["unit"].unique() for j in np.array(grp.get_group(i)[::-1])]

# #### Setting up sensor measurements (X)

# In[52]:

# dropping the cycle column: the RUL target is derived from it,
# so keeping it as a feature would leak the answer
train_x = train.drop("cycle", axis=1)
test_x = test.drop("cycle", axis=1)

# ## Applying Principal Component Analysis

# In[53]:

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score

# In[54]:

# fitting PCA on the training data, then transforming both training and testing data
# setting the index to be unit so unit is not used as a column in PCA
train_data = train_x.set_index("unit")
test_data = test_x.set_index("unit")

# scaling the data
gen = MinMaxScaler(feature_range=(0, 1))
train_x_rescaled = gen.fit_transform(train_data)
test_x_rescaled = gen.transform(test_data)

# PCA
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
train_data_reduced = pca.fit_transform(train_x_rescaled)
test_data_reduced = pca.transform(test_x_rescaled)

# save results as dataframes
train_df = pd.DataFrame(train_data_reduced)
test_df = pd.DataFrame(test_data_reduced)
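# As a quick sanity check (not part of the original analysis), we can look at how many
# principal components PCA kept and how much variance they explain. This is a minimal
# sketch that assumes the `pca` object fitted above; the printout is purely diagnostic.

print("Components kept:", pca.n_components_)
print("Cumulative explained variance:", np.cumsum(pca.explained_variance_ratio_))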
# ## Making Complete Dataframe

# In[55]:

# making a dataframe with unit, RUL counting down instead of up, and the PCA features
train_df.insert(0, "Unit", units, True)
train_df.insert(1, "RUL", rul_lst, True)
train_df.head()

# ## Splitting into Train and Validate Sets

# In[56]:

import random

# In[57]:

unique_units = train_df["Unit"].unique().tolist()
random.seed(200)  # seed the random module, since random.shuffle is used below
random.shuffle(unique_units)
train_units = unique_units[:80]
val_units = unique_units[80:]

# In[58]:

X = train_df.iloc[:, train_df.columns != 'RUL']
y = train_df[["Unit", "RUL"]]

# splitting into train and validation data based on units rather than a random row split
X_train = X[X['Unit'].isin(train_units)]          # first 80 shuffled units
X_validate = X[X['Unit'].isin(val_units)]         # last 20 shuffled units
y_train = y[y['Unit'].isin(train_units)]["RUL"]   # first 80 shuffled units
y_validate = y[y['Unit'].isin(val_units)]["RUL"]  # last 20 shuffled units

X_train = X_train.set_index('Unit')
X_validate = X_validate.set_index('Unit')

# # Making Predictions

# ## Linear Regression Model

# In[59]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# ### Fitting the model

# In[60]:

reg = LinearRegression()
reg.fit(X_train, y_train)
y_hat = reg.predict(X_validate)

# ### Assessing Performance

# In[62]:

print('Training Cross Validation Score: ', cross_val_score(reg, X_train, y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(reg, X_validate, y_validate, cv=5))
print('Validation R^2: ', r2_score(y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(y_validate, y_hat))

# ### Plotting Predicted vs True Values

# In[63]:

_ = plt.scatter(y_hat, y_validate)
_ = plt.plot([0, 350], [0, 350], "r")
_ = plt.xlabel("Predicted Remaining Useful Life")
_ = plt.ylabel("True Remaining Useful Life")

# In[64]:

# refitting the linear model on the log of RUL
log_y_train = np.log(y_train)
log_y_validate = np.log(y_validate)

reg.fit(X_train, log_y_train)
y_hat = reg.predict(X_validate)
#log_y_train.isna().sum()

# In[65]:

print('Training Cross Validation Score: ', cross_val_score(reg, X_train, log_y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(reg, X_validate, log_y_validate, cv=5))
print('Validation R^2: ', r2_score(log_y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(log_y_validate, y_hat))

# In[66]:

_ = plt.scatter(y_hat, log_y_validate)
_ = plt.plot([0, 6], [0, 6], "r")
_ = plt.xlabel("Predicted Log of Remaining Useful Life")
_ = plt.ylabel("True Log of Remaining Useful Life")

# ## Decision Tree Regressor

# ### Fitting the model

# In[67]:

# trying a decision tree regressor
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)
y_hat = dt_reg.predict(X_validate)

# ### Assessing Performance

# In[68]:

print('Training Cross Validation Score: ', cross_val_score(dt_reg, X_train, y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(dt_reg, X_validate, y_validate, cv=5))
print('Validation R^2: ', r2_score(y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(y_validate, y_hat))

# ### Plotting Predicted vs True Values

# In[69]:

_ = plt.scatter(y_hat, y_validate)
_ = plt.plot([0, 350], [0, 350], "r")
_ = plt.xlabel("Predicted Remaining Useful Life")
_ = plt.ylabel("True Remaining Useful Life")

# In[70]:

# refitting the decision tree on the log of RUL
dt_reg.fit(X_train, log_y_train)
y_hat = dt_reg.predict(X_validate)

print('Training Cross Validation Score: ', cross_val_score(dt_reg, X_train, log_y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(dt_reg, X_validate, log_y_validate, cv=5))
print('Validation R^2: ', r2_score(log_y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(log_y_validate, y_hat))

_ = plt.scatter(y_hat, log_y_validate)
_ = plt.plot([0, 6], [0, 6], "r")
_ = plt.xlabel("Predicted Log of Remaining Useful Life")
_ = plt.ylabel("True Log of Remaining Useful Life")
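# An unconstrained decision tree tends to memorize the training units. As an optional
# follow-up (not in the original notebook), here is a minimal sketch of tuning the tree
# depth with sklearn's GridSearchCV; the parameter grid below is an illustrative assumption.

from sklearn.model_selection import GridSearchCV

param_grid = {"max_depth": [3, 5, 10, 20, None]}  # hypothetical grid, adjust as needed
dt_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5, scoring="r2")
dt_search.fit(X_train, y_train)
print("Best max_depth:", dt_search.best_params_)
print("Best CV R^2:", dt_search.best_score_)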
# ## K-Nearest Neighbors Regressor

# ### Fitting the Model

# In[71]:

from sklearn.neighbors import KNeighborsRegressor

# In[72]:

kn_reg = KNeighborsRegressor()
kn_reg.fit(X_train, y_train)
y_hat = kn_reg.predict(X_validate)

# ### Assessing Performance

# In[73]:

print('Training Cross Validation Score: ', cross_val_score(kn_reg, X_train, y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(kn_reg, X_validate, y_validate, cv=5))
print('Validation R^2: ', r2_score(y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(y_validate, y_hat))

# ### Plotting Predicted vs True Values

# In[74]:

_ = plt.scatter(y_hat, y_validate)
_ = plt.plot([0, 350], [0, 350], "r")
_ = plt.xlabel("Predicted Remaining Useful Life")
_ = plt.ylabel("True Remaining Useful Life")

# In[75]:

# refitting the k-nearest neighbors regressor on the log of RUL
kn_reg.fit(X_train, log_y_train)
y_hat = kn_reg.predict(X_validate)

print('Training Cross Validation Score: ', cross_val_score(kn_reg, X_train, log_y_train, cv=5))
print('Validation Cross Validation Score: ', cross_val_score(kn_reg, X_validate, log_y_validate, cv=5))
print('Validation R^2: ', r2_score(log_y_validate, y_hat))
print('Validation MSE: ', mean_squared_error(log_y_validate, y_hat))

_ = plt.scatter(y_hat, log_y_validate)
_ = plt.plot([0, 6], [0, 6], "r")
_ = plt.xlabel("Predicted Log of Remaining Useful Life")
_ = plt.ylabel("True Log of Remaining Useful Life")

# # Trying Our Model On Test Data

# In[86]:

test.head()

# In[77]:

# taking the log of the true remaining-useful-life values to match the log-scale model
y_test = np.log(RUL)

# In[78]:

#test_grp = train["cycle"].groupby(test["unit"])
#test_rul_lst = [j for i in test["unit"].unique() for j in np.array(test_grp.get_group(i)[::-1])]

# In[90]:

# Test Data with PCA (fit on training data) applied
test_df.head()

# In[80]:

# predict remaining useful life (the linear model was last fit on log RUL,
# so these predictions are on the log scale)
y_hat_test = reg.predict(test_df)

# make predictions into a dataframe
y_hat_test = pd.DataFrame(y_hat_test)

# add the unit column back in so that we know which predictions go with which unit
y_hat_test.insert(0, "unit", test["unit"])

# In[81]:

_ = plt.plot(y_hat_test[0][0:400])

# In[82]:

rul_pred = []
# loop through all units
for each in range(1, 101):
    # get data for one unit at a time
    unit_data = y_hat_test[y_hat_test["unit"] == each]
    # get the last prediction for that unit and add it to the list of predictions
    rul_pred.append(unit_data.tail(1).iloc[0][0])
rul_pred = pd.DataFrame(rul_pred)

# In[83]:

rul_pred.head()

# In[84]:

print('Test Cross Validation Score: ', cross_val_score(reg, rul_pred, y_test, cv=5))
print('Test R^2: ', r2_score(y_test, rul_pred))
print('Test MSE: ', mean_squared_error(y_test, rul_pred))

# In[85]:

_ = plt.scatter(rul_pred, y_test)
_ = plt.plot([0, 6], [0, 6], "r")
_ = plt.xlabel("Predicted Log of Remaining Useful Life")
_ = plt.ylabel("True Log of Remaining Useful Life")
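# Because the model was fit on log(RUL), the test predictions above are on the log scale.
# As a closing sketch (not part of the original notebook), we can exponentiate them back
# to cycles so the error can be read in engine cycles; variable names are illustrative.

from sklearn.metrics import mean_absolute_error

rul_pred_cycles = np.exp(rul_pred)  # predicted remaining cycles per unit
print('Test MAE (cycles): ', mean_absolute_error(RUL, rul_pred_cycles))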