# import Pkg; Pkg.add(["MLJ", "MLJDecisionTreeInterface", "MLJClusteringInterface",
#                      "MLJLinearModels", "DataFramesMeta", "Plots"])
using MLJ, DataFramesMeta, Plots

# MLJ has some datasets built in
iris = load_iris();
typeof(iris)

# but it comes as a NamedTuple of columns (with scientific types attached),
# so let's convert it into a DataFrame
iris = DataFrame(iris)

# and we can `pretty`-print the DataFrame
first(iris, 3) |> pretty

# There's a univariate time series dataset here too
sunspots = MLJ.load_sunspots() |> DataFrame;

# Model metadata from the registry can also be shown as a DataFrame
example = info("KMeans", pkg="Clustering") |> pairs |> collect |> DataFrame
example[1, ["name", "package_name", "human_name", "hyperparameters", "is_pure_julia"]] |> DataFrame |> pretty

KMeans = @load KMeans pkg=Clustering verbosity=0

# Split the DataFrame into y and X, where y is the column `target` and X is everything else
y, X = unpack(iris, ==(:target))

model = KMeans(k=3)
mach = machine(model, X) |> fit!
yhat = predict(mach, X)

# Check cluster assignments
compare = zip(yhat, y) |> collect
compare[1:8] # clusters align with classes

fitted_params(mach).centers
report(mach)

# Load in the Boston housing market data
boston = load_boston() |> DataFrame
first(boston, 3)

# Train/test split with `partition`
train, test = partition(boston, 0.8, rng=42);

# As before, unpack to horizontally split into y and X
y_train, X_train = unpack(train, ==(:MedV))
y_test, X_test = unpack(test, ==(:MedV));

# Load in our model and instantiate it
Tree = @load RandomForestRegressor pkg=DecisionTree verbosity=0;
tree = Tree()

mach = machine(tree, X_train, y_train) |> fit!
results = evaluate!(mach, resampling=CV(nfolds=5, rng=42), measure=rms, operation=predict)
report(mach)

# Check test-set RMSE
y_pred = predict(mach, X_test)
rms(y_pred, y_test)

LinearRegressor = @load LinearRegressor pkg=MLJLinearModels verbosity=0
ols = LinearRegressor()

# `:Chas` is not Continuous by default, but the linear model expects Continuous inputs
schema(X_train)
X_train = coerce(X_train, :Chas => Continuous)
X_test = coerce(X_test, :Chas => Continuous)
schema(X_train)

ols_mach = machine(ols, X_train, y_train) |> fit!
# See fitted parameters
fitted_params(ols_mach)

# Check cross-validated RMSE
evaluate!(ols_mach, resampling=CV(nfolds=5, rng=42), measure=rms, operation=predict)

# Calculate test-set RMSE
y_pred = predict(ols_mach, X_test);
ols_test_error = rms(y_pred, y_test)
println("OLS Test RMSE: $ols_test_error")

Standardizer = @load Standardizer pkg=MLJModels verbosity=0
# Just like the other models, we instantiate it and bind data to it with a machine
standardizer = Standardizer()
standardizer_machine = machine(standardizer, X_train) |> fit!
X_train_standardized = MLJ.transform(standardizer_machine, X_train)
X_test_standardized = MLJ.transform(standardizer_machine, X_test)
first(X_train_standardized, 5)

LassoRegressor = @load LassoRegressor pkg=MLJLinearModels verbosity=0
lasso = LassoRegressor() # default λ = 1.0
lasso_mach = machine(lasso, X_train_standardized, y_train) |> fit!
evaluate!(lasso_mach, resampling=CV(nfolds=5, rng=42), measure=rms, operation=predict)
fitted_params(lasso_mach)

# XGBoostRegressor = @load XGBoostRegressor pkg=XGBoost verbosity=0
# xgb = XGBoostRegressor()
# xgb_mach = machine(xgb, X_train_standardized, y_train) |> fit!

lasso # inspect the model's current hyperparameters
parameter_range = range(lasso, :lambda, lower=0.001, upper=10.0, scale=:log)
tuned_lasso = TunedModel(lasso,
                         resampling=CV(nfolds=5, rng=42),
                         # Search over 10 values between `lower` and `upper` in `parameter_range`
                         tuning=Grid(resolution=10),
                         range=parameter_range,
                         measure=rms)

# As before, we take this model and bind it to data as a machine
tuned_lasso_machine = machine(tuned_lasso, X_train_standardized, y_train) |> fit!
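# (Aside: a minimal sketch of MLJ's `learning_curve`, which gives another view of the
# same search: one cross-validated score per value of `parameter_range`. Field names
# on the returned tuple follow the MLJ docs; treat this as illustrative rather than
# part of the main workflow.)
curve = learning_curve(lasso_mach;
                       range=parameter_range,
                       resampling=CV(nfolds=5, rng=42),
                       measure=rms)
Plots.plot(curve.parameter_values, curve.measurements,
           xscale=:log10, xlabel="lambda", ylabel="CV RMSE", legend=false)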
fitted_params(tuned_lasso_machine)
best_lasso_model = fitted_params(tuned_lasso_machine).best_model

# Visualize the grid search and inspect its history
Plots.plot(tuned_lasso_machine)
report(tuned_lasso_machine)
report(tuned_lasso_machine).plotting.measurements
report(tuned_lasso_machine).best_history_entry

y_pred = predict(tuned_lasso_machine, X_test_standardized)
best_lasso_rmse = rms(y_pred, y_test)
println("Best Lasso RMSE: $best_lasso_rmse")

# Putting it all together: load in the Boston housing market data
boston = load_boston() |> DataFrame

# Train/test split with `partition`
train, test = partition(boston, 0.8, rng=42);

# As before, unpack to horizontally split into y and X
y_train, X_train = unpack(train, ==(:MedV))
y_test, X_test = unpack(test, ==(:MedV));

# 1. Coerce datatypes
X_train = coerce(X_train, :Chas => Continuous)
X_test = coerce(X_test, :Chas => Continuous)

# 2. Standardize
Standardizer = @load Standardizer pkg=MLJModels verbosity=0
# Just like the other models, we instantiate it and bind data to it with a machine
standardizer = Standardizer()
standardizer_machine = machine(standardizer, X_train) |> fit!
# And transform
X_train_standardized = MLJ.transform(standardizer_machine, X_train)
X_test_standardized = MLJ.transform(standardizer_machine, X_test)

# 3. Load and tune the model
LassoRegressor = @load LassoRegressor pkg=MLJLinearModels verbosity=0
lasso = LassoRegressor()
parameter_range = range(lasso, :lambda, lower=0.001, upper=10.0, scale=:log)
tuned_lasso = TunedModel(lasso,
                         resampling=CV(nfolds=5, rng=42),
                         # Search over 10 values between `lower` and `upper` in `parameter_range`
                         tuning=Grid(resolution=10),
                         range=parameter_range,
                         measure=rms)

# As before, we bind this model to data as a machine. Note that this time we
# don't pipe it into `fit!` with `|>` ...
tuned_lasso_machine = machine(tuned_lasso, X_train_standardized, y_train)
# ... because calling `fit!` separately lets us set verbosity to 0
fit!(tuned_lasso_machine, verbosity=0)

# 4. Predict
y_pred = predict(tuned_lasso_machine, X_test_standardized)
best_lasso_rmse = rms(y_pred, y_test)
println("Best Lasso RMSE: $best_lasso_rmse")

# The same workflow as a pipeline: reload the Boston housing market data
boston = load_boston() |> DataFrame

# Train/test split with `partition`
train, test = partition(boston, 0.8, rng=42);

# As before, unpack to horizontally split into y and X
y_train, X_train = unpack(train, ==(:MedV))
y_test, X_test = unpack(test, ==(:MedV));

# Specify the tuned_lasso model
parameter_range = range(lasso, :lambda, lower=0.001, upper=10.0, scale=:log)
tuned_lasso = TunedModel(lasso,
                         resampling=CV(nfolds=5, rng=42),
                         tuning=Grid(resolution=10),
                         range=parameter_range,
                         measure=rms)

# Composing with `|>` returns a `DeterministicPipeline` model: coerce, then
# standardize, then fit the tuned lasso
pipe = (X -> coerce(X, :Chas => Continuous)) |> Standardizer() |> tuned_lasso
pipe_machine = machine(pipe, X_train, y_train)
fit!(pipe_machine, verbosity=0)
y_pred_pipeline = predict(pipe_machine, X_test)
rms(y_pred_pipeline, y_test)

# Equivalently, use the `Pipeline` constructor instead of the base `|>` pipes,
# which lets us name each component
pipe = Pipeline(
    transformer = (X -> coerce(X, :Chas => Continuous)),
    standardizer = Standardizer(),
    tuner = tuned_lasso
);
pipe_machine = machine(pipe, X_train, y_train)
fit!(pipe_machine, verbosity=0)
y_pred_pipeline = predict(pipe_machine, X_test)
rms(y_pred_pipeline, y_test)
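# (Aside: a minimal sketch of inspecting the fitted pipeline. The `tuner` field
# below is just the keyword name we gave `Pipeline` above; the exact nesting of the
# `fitted_params` output may differ across MLJ versions.)
pipe_fp = fitted_params(pipe_machine)
pipe_fp.tuner.best_model # the LassoRegressor with the lambda the grid search chose

# The whole pipeline can also be cross-validated as a single model, just like the
# individual models above
evaluate!(pipe_machine, resampling=CV(nfolds=5, rng=42), measure=rms, operation=predict)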