#!/usr/bin/env python
# coding: utf-8

# # Using Yellowbrick for Machine Learning Visualizations on Facebook Data
#
# Paul Witt
#
# The dataset below was provided to the UCI Machine Learning Repository by researchers who used neural networks and decision trees to predict how many comments a given Facebook post would generate.
#
# There are five variants of the dataset. This notebook only uses the first.
#
# The full paper can be found here:
#
# http://uksim.info/uksim2015/data/8713a015.pdf
#
# ### The primary purpose of this notebook is to test Yellowbrick.
#
# Attribute Information:
#
# All features are integer or float values.
#
# 1. Page Popularity/likes (Decimal Encoding, Page feature): Defines the popularity of, or support for, the source of the document.
# 2. Page Checkins (Decimal Encoding, Page feature): Describes how many individuals have visited this place so far. This feature is only associated with places, e.g. an institution, venue, or theater.
# 3. Page talking about (Decimal Encoding, Page feature): Defines the daily interest of individuals in the source of the document/post, i.e. people who come back to the page after liking it. This includes activities such as comments, likes on a post, and shares by visitors to the page.
# 4. Page Category (Value Encoding, Page feature): Defines the category of the source of the document, e.g. place, institution, brand.
# 5-29. Derived (Decimal Encoding, Derived feature): These features are aggregated by page, by calculating the min, max, average, median, and standard deviation of the essential features.
# 30. CC1 (Decimal Encoding, Essential feature): The total number of comments before the selected base date/time.
# 31. CC2 (Decimal Encoding, Essential feature): The number of comments in the last 24 hours, relative to the base date/time.
# 32. CC3 (Decimal Encoding, Essential feature): The number of comments between the last 48 and last 24 hours, relative to the base date/time.
# 33. CC4 (Decimal Encoding, Essential feature): The number of comments in the first 24 hours after publication of the post, but before the base date/time.
# 34. CC5 (Decimal Encoding, Essential feature): The difference between CC2 and CC3.
# 35. Base time (Decimal (0-71) Encoding, Other feature): Selected time in order to simulate the scenario.
# 36. Post length (Decimal Encoding, Other feature): Character count of the post.
# 37. Post Share Count (Decimal Encoding, Other feature): Counts the number of shares of the post, i.e. how many people have shared it on their timelines.
# 38. Post Promotion Status (Binary Encoding, Other feature): To reach more people in the News Feed, individuals can promote their posts; this feature tells whether the post was promoted (1) or not (0).
# 39. H Local (Decimal (0-23) Encoding, Other feature): Describes the H hours for which we have the target variable (comments received).
# 40-46. Post published weekday (Binary Encoding, Weekdays feature): Represents the day (Sunday...Saturday) on which the post was published.
# 47-53. Base DateTime weekday (Binary Encoding, Weekdays feature): Represents the day (Sunday...Saturday) of the selected base date/time.
# 54. Target Variable (Decimal, Target): The number of comments in the next H hours (H is given in feature 39).
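# The derived features (attributes 5-29) are described above as page-level aggregates of the essential features. The cell below is purely illustrative and is not part of the original notebook or the paper's pipeline; it sketches how such min/max/mean/median/std aggregates could be produced with pandas, using a hypothetical `posts` frame with a `page_id` column and one essential feature (`cc1`).

# In[ ]:

# Illustrative only: hypothetical data showing how the "Derived" attributes
# (per-page min, max, mean, median, std of an essential feature) could be built.
import pandas as pd

posts = pd.DataFrame({
    "page_id": [1, 1, 1, 2, 2],
    "cc1": [10, 40, 25, 3, 7],
})

derived = posts.groupby("page_id")["cc1"].agg(["min", "max", "mean", "median", "std"])
print(derived)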
# ## Data Exploration

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import os
import json
import time
import pickle
import requests

import numpy as np
import pandas as pd
import yellowbrick as yb
import matplotlib.pyplot as plt


# In[2]:

# Load the data (the CSV file has no header row)
df = pd.read_csv("/Users/pwitt/Documents/machine-learning/examples/pbwitt/Dataset/Training/Features_Variant_1.csv", header=None)
DATA = df

print('Data Shape ' + str(df.shape))
print(df.dtypes)


# In[3]:

FEATURES = [
    "Page Popularity/likes", "Page Checkins", "Page talking about", "Page Category",
    "Derived5", "Derived6", "Derived7", "Derived8", "Derived9", "Derived10",
    "Derived11", "Derived12", "Derived13", "Derived14", "Derived15", "Derived16",
    "Derived17", "Derived18", "Derived19", "Derived20", "Derived21", "Derived22",
    "Derived23", "Derived24", "Derived25", "Derived26", "Derived27", "Derived28",
    "Derived29", "CC1", "CC2", "CC3", "CC4", "CC5",
    "Base time", "Post length", "Post Share Count", "Post Promotion Status", "H Local",
    "Post published weekday-Sun", "Post published weekday-Mon", "Post published weekday-Tues",
    "Post published weekday-Weds", "Post published weekday-Thurs", "Post published weekday-Fri",
    "Post published weekday-Sat",
    "Base DateTime weekday-Sun", "Base DateTime weekday-Mon", "Base DateTime weekday-Tues",
    "Base DateTime weekday-Wed", "Base DateTime weekday-Thurs", "Base DateTime weekday-Fri",
    "Base DateTime weekday-Sat",
    "Target_Variable"
]

# Assign readable column names to the DataFrame
df.columns = FEATURES
df.head()

# Note: the dataset is sorted. There is variation in the distributions.


# In[4]:

# Determine the shape of the data
print("{} instances with {} columns\n".format(*df.shape))


# ## Test Yellowbrick Covariance Ranking

# In[5]:

from yellowbrick.features.rankd import Rank2D
from yellowbrick.features.radviz import RadViz
from yellowbrick.features.pcoords import ParallelCoordinates


# In[6]:

# Specify the features of interest.
# All columns (including the target) are used here for testing purposes.
features = FEATURES

# Extract the numpy arrays from the data frame
X = df[features].values
y = df["Base time"].values


# In[7]:

# Instantiate the visualizer with the covariance ranking algorithm
visualizer = Rank2D(features=features, algorithm='covariance')

visualizer.fit(X, y)       # Fit the data to the visualizer
visualizer.transform(X)    # Transform the data
visualizer.poof()          # Draw/show/poof the data


# In[8]:

# Instantiate the visualizer with the Pearson ranking algorithm
visualizer = Rank2D(features=features, algorithm='pearson')

visualizer.fit(X, y)       # Fit the data to the visualizer
visualizer.transform(X)    # Transform the data
visualizer.poof()          # Draw/show/poof the data
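# Since every column, including `Target_Variable`, was passed to `Rank2D` above, the cell below is a small variation (not in the original notebook) showing how the ranking could be rerun on the predictor columns only, assuming `df` and `FEATURES` are defined as above.

# In[ ]:

# Minimal sketch: drop the target column from the feature matrix before ranking.
feature_cols = [f for f in FEATURES if f != "Target_Variable"]

X_feats = df[feature_cols].values
y_target = df["Target_Variable"].values

visualizer = Rank2D(features=feature_cols, algorithm='pearson')
visualizer.fit(X_feats, y_target)
visualizer.transform(X_feats)
visualizer.poof()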
# ## Data Extraction
#
# Create a bunch object to store data on disk.
#
# - **data**: array of shape `n_samples` * `n_features`
# - **target**: array of length `n_samples`
# - **feature_names**: names of the features
# - **filenames**: names of the files that were loaded
# - **DESCR**: contents of the readme

# In[9]:

from sklearn.datasets.base import Bunch

DATA_DIR = os.path.abspath(os.path.join(".", "..", "pbwitt", "data"))

# Show the contents of the data directory
for name in os.listdir(DATA_DIR):
    if name.startswith("."):
        continue
    print("- {}".format(name))


def load_data(root=DATA_DIR):
    filenames = {
        'meta': os.path.join(root, 'meta.json'),
        'rdme': os.path.join(root, 'README.md'),
        'data': os.path.join(root, 'Features_Variant_1.csv'),
    }

    # Load the meta data from the meta json
    with open(filenames['meta'], 'r') as f:
        meta = json.load(f)
        feature_names = meta['feature_names']

    # Load the description from the README
    with open(filenames['rdme'], 'r') as f:
        DESCR = f.read()

    # Load the dataset from the data file
    dataset = pd.read_csv(filenames['data'], header=None)

    # Extract the target (last column) and transform to numpy arrays
    data = dataset.iloc[:, 0:53]
    target = dataset.iloc[:, -1]

    data = np.array(data)
    target = np.array(target)

    # Create the bunch object
    return Bunch(
        data=data,
        target=target,
        filenames=filenames,
        feature_names=feature_names,
        DESCR=DESCR
    )


# Save the dataset as a variable we can use.
dataset = load_data()

print(dataset.data.shape)
print(dataset.target.shape)
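# `load_data()` above expects a `meta.json` file with a `feature_names` key alongside the CSV and README. The cell below is a sketch, not part of the original notebook, of how that file could be written from the `FEATURES` list, assuming `feature_names` is the only key required.

# In[ ]:

# Hedged helper: writes the meta.json that load_data() reads, assuming the
# only required key is 'feature_names'.
meta_path = os.path.join(DATA_DIR, "meta.json")
with open(meta_path, "w") as f:
    json.dump({"feature_names": FEATURES}, f, indent=2)
print("Wrote {}".format(meta_path))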
# In[10]:

from yellowbrick.regressor import PredictionError, ResidualsPlot


# In[11]:

from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, Lasso, Ridge


# # Build and Score Regression Models
#
# * Create a function -- add parameters for the Yellowbrick target visualizations
# * Score models using Mean Absolute Error, Mean Squared Error, Median Absolute Error, R2

# In[12]:

def fit_and_evaluate(dataset, model, label, vis, **kwargs):
    """
    Because of the Scikit-Learn API, we can create a function to do all
    of the fit and evaluate work on our behalf!
    """
    start = time.time()  # Start the clock!

    scores = {'Mean Absolute Error': [], 'Mean Squared Error': [],
              'Median Absolute Error': [], 'R2': []}

    for train, test in KFold(n_splits=12, shuffle=True).split(dataset.data):
        X_train, X_test = dataset.data[train], dataset.data[test]
        y_train, y_test = dataset.target[train], dataset.target[test]

        estimator = model(**kwargs)
        estimator.fit(X_train, y_train)

        expected = y_test
        predicted = estimator.predict(X_test)

        # For the visualizers below: return the first fold's split instead of scoring
        if vis in ('Ridge_vis', 'Lasso_vis'):
            return [X_train, y_train, X_test, y_test]

        scores['Mean Absolute Error'].append(metrics.mean_absolute_error(expected, predicted))
        scores['Mean Squared Error'].append(metrics.mean_squared_error(expected, predicted))
        scores['Median Absolute Error'].append(metrics.median_absolute_error(expected, predicted))
        scores['R2'].append(metrics.r2_score(expected, predicted))

    # Report
    print("Build and Validation of {} took {:0.3f} seconds".format(label, time.time() - start))
    print("Validation scores are as follows:\n")
    print(pd.DataFrame(scores).mean())

    # Write official estimator to disk
    estimator = model(**kwargs)
    estimator.fit(dataset.data, dataset.target)

    outpath = label.lower().replace(" ", "-") + ".pickle"
    with open(outpath, 'wb') as f:
        pickle.dump(estimator, f)

    print("\nFitted model written to:\n{}".format(os.path.abspath(outpath)))


# In[13]:

print("Lasso Scores and Visualization Below: \n")
fit_and_evaluate(dataset, Lasso, "Facebook Lasso", 'NA')

# Instantiate the linear model and visualizer
lasso = Lasso()
visualizer = PredictionError(lasso)

# Get a single train/test split from fit_and_evaluate so X and y stay aligned
X_train, y_train, X_test, y_test = fit_and_evaluate(dataset, Lasso, "Facebook Lasso", 'Lasso_vis')

visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
g = visualizer.poof()               # Draw/show/poof the data


# In[14]:

# Instantiate the linear model and visualizer
print("Ridge Scores and Target Visualization Below:\n")
fit_and_evaluate(dataset, Ridge, "Facebook Ridge", 'NA')

ridge = Ridge()
visualizer = ResidualsPlot(ridge)

# Get a single train/test split from fit_and_evaluate so X and y stay aligned
X_train, y_train, X_test, y_test = fit_and_evaluate(dataset, Ridge, "Facebook Ridge", 'Ridge_vis')

visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
visualizer.score(X_test, y_test)    # Evaluate the model on the test data
g = visualizer.poof()               # Draw/show/poof the data


# In[15]:

fit_and_evaluate(dataset, ElasticNet, "Facebook ElasticNet", 'NA')
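# `fit_and_evaluate()` writes each fitted estimator to disk as a pickle. The cell below is a sketch, not in the original notebook, of reloading one of those files and producing predictions; it assumes the `facebook-lasso.pickle` file written by the Lasso run above exists in the working directory.

# In[ ]:

# Illustrative only: reload the pickled Lasso model written by fit_and_evaluate
# (label "Facebook Lasso" -> "facebook-lasso.pickle") and predict on a few rows.
with open("facebook-lasso.pickle", "rb") as f:
    fitted_lasso = pickle.load(f)

print(fitted_lasso.predict(dataset.data[:5]))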