diff --git a/DESCRIPTION b/DESCRIPTION index 918d0a4..225ed2b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: HMDA Type: Package Title: Holistic Multimodel Domain Analysis for Exploratory Machine Learning -Version: 0.1 +Version: 0.2.0 Authors@R: person("E. F. Haghish", role = c("aut", "cre", "cph"), @@ -24,5 +24,5 @@ License: MIT + file LICENSE Encoding: UTF-8 RoxygenNote: 7.3.2 LazyData: true -URL: http://dx.doi.org/10.13140/RG.2.2.32473.63846, https://github.com/haghish/HMDA, https://www.sv.uio.no/psi/english/people/academic/haghish/ +URL: http://dx.doi.org/10.13140/RG.2.2.32473.63846, https://github.com/haghish/HMDA BugReports: https://github.com/haghish/HMDA/issues diff --git a/NAMESPACE b/NAMESPACE index 95c9479..7806bcf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(dictionary) export(hmda.adjust.params) export(hmda.autoEnsemble) export(hmda.best.models) +export(hmda.compare.shap.plot) export(hmda.domain) export(hmda.efa) export(hmda.feature.selection) @@ -12,8 +13,11 @@ export(hmda.grid) export(hmda.grid.analysis) export(hmda.init) export(hmda.partition) +export(hmda.plot) +export(hmda.plot.metrics) export(hmda.search.param) export(hmda.suggest.param) +export(hmda.test) export(hmda.wmshap) export(hmda.wmshap.table) export(suggest_mtries) @@ -70,6 +74,9 @@ importFrom(psych,factor.scores) importFrom(psych,omega) importFrom(shapley,shapley) importFrom(shapley,shapley.domain) +importFrom(shapley,shapley.domain.test) +importFrom(shapley,shapley.feature.test) +importFrom(shapley,shapley.plot) importFrom(stats,aggregate) importFrom(stats,cor) importFrom(stats,formula) diff --git a/R/hmda.best.models.R b/R/hmda.best.models.R index 137c792..ad171fc 100644 --- a/R/hmda.best.models.R +++ b/R/hmda.best.models.R @@ -73,17 +73,30 @@ #' @importFrom utils head #' @export #' @author E. F. Haghish -hmda.best.models <- function(df, n_models = 1) { +hmda.best.models <- function(df, + n_models = NULL, + metrics = c("logloss", "mae", "mse", "rmse", "rmsle", + "mean_per_class_error", "auc", "aucpr", + "r2", "accuracy", "f1", "mcc", "f2"), + distance_percentage = NULL, + hyperparam = FALSE +) { if (!inherits(df, "hmda.grid.analysis")) { warning("'df' is not of class 'hmda.grid.analysis'... watch out!") } - # Define known performance metrics and their optimization directions - known_metrics <- c("logloss", "mae", "mse", "rmse", "rmsle", - "mean_per_class_error", "auc", "aucpr", - "r2", "accuracy", "f1", "mcc", "f2") + if (!is.null(n_models) & !is.null(distance_percentage)) { + stop("either specify 'n_models' or 'distance_percentage'") + } + else if (is.null(n_models) & is.null(distance_percentage)) { + n_models <- 1 + } + result <- NULL + sort_direction_list <- NULL + + # Define known performance metrics and their optimization directions directions <- c( logloss = "minimize", mae = "minimize", @@ -100,6 +113,24 @@ hmda.best.models <- function(df, n_models = 1) { f2 = "maximize" ) + sort_direction <- c( + logloss = FALSE, + mae = FALSE, + mse = FALSE, + rmse = FALSE, + rmsle = FALSE, + mean_per_class_error = FALSE, + auc = TRUE, + aucpr = TRUE, + r2 = TRUE, + accuracy = TRUE, + f1 = TRUE, + mcc = TRUE, + f2 = TRUE + ) + + #directions <- directions[directions %in% metrics] + # Check for the existence of the model_ids column if (!"model_ids" %in% names(df)) { stop("The data frame must contain a 'model_ids' column.") @@ -109,35 +140,75 @@ hmda.best.models <- function(df, n_models = 1) { best_model_ids <- c() # Loop over each known metric and determine the top n_models best model IDs - for (metric in known_metrics) { - if (metric %in% names(df)) { - vals <- df[[metric]] + for (met in metrics) { + if (met %in% names(df)) { + vals <- df[[met]] if (all(is.na(vals))) next # Skip metric if all values are NA - dir <- directions[[metric]] + dir <- directions[[met]] if (is.null(dir)) dir <- "minimize" # Order indices according to performance (top n_models) if (dir == "maximize") { ordered_idx <- order(vals, decreasing = TRUE) + sort_direction_list <- c(sort_direction_list, TRUE) } else { ordered_idx <- order(vals, decreasing = FALSE) + sort_direction_list <- c(sort_direction_list, FALSE) } - top_idx <- head(ordered_idx, n_models) - best_model_ids <- c(best_model_ids, df$model_ids[top_idx]) + # Select the models for each metric + if (!is.null(n_models)) { + top_idx <- head(ordered_idx, n_models) + best_model_ids <- c(best_model_ids, df$model_ids[top_idx]) + } + else if (!is.null(distance_percentage)) { + if (dir == "maximize") { + best_value <- max(df[[met]], na.rm = TRUE) + best_model_ids <- c(best_model_ids, df$model_ids[which(df[[met]] >= best_value * distance_percentage)]) + } + else { + best_value <- min(df[[met]], na.rm = TRUE) + best_model_ids <- c(best_model_ids, df$model_ids[which(df[[met]] <= best_value * (1-distance_percentage))]) + } + } } } - # Get the unique union of best model IDs - best_model_ids <- unique(best_model_ids) + # # Get the unique union of best model IDs + # best_model_ids <- unique(best_model_ids) # Determine which known metric columns exist in df - existing_metrics <- intersect(known_metrics, names(df)) + existing_metrics <- intersect(metrics, names(df)) + + # print(df[df$model_ids %in% best_model_ids, c("model_ids", existing_metrics), drop = FALSE]) # Subset the original data frame for the best models and only include # the model_ids and the performance metric columns. - result <- df[df$model_ids %in% best_model_ids, c("model_ids", existing_metrics), drop = FALSE] + if (hyperparam) result <- df[df$model_ids %in% best_model_ids, , drop = FALSE] + else result <- df[df$model_ids %in% best_model_ids, c("model_ids", existing_metrics), drop = FALSE] + + + # sort the columns for all the existing_metrics columns in order + # ========================================================== + # result <- result[with(result, order(existing_metrics)), ] + order_by_cols <- function(df, cols, decreasing = FALSE, na.last = TRUE) { + stopifnot(all(cols %in% names(df))) + if (length(decreasing) == 1L) decreasing <- rep(decreasing, length(cols)) + stopifnot(length(decreasing) == length(cols)) + + keys <- Map(function(col, dec) { + k <- xtfrm(df[[col]]) # numeric ranking for any type + if (dec) -k else k + }, cols, decreasing) + + ord <- do.call(order, c(keys, list(na.last = na.last))) + df[ord, , drop = FALSE] + } + result <- order_by_cols(result, existing_metrics, decreasing = sort_direction_list) + # ord <- do.call(order, c(as.list(result[existing_metrics]), list(na.last = TRUE))) + # result <- result[ord, , drop = FALSE] + # Drop metric columns that are entirely NA in the resulting subset for (col in existing_metrics) { @@ -148,3 +219,6 @@ hmda.best.models <- function(df, n_models = 1) { return(result) } + + + diff --git a/R/hmda.compare.shap.plot.R b/R/hmda.compare.shap.plot.R new file mode 100644 index 0000000..b903024 --- /dev/null +++ b/R/hmda.compare.shap.plot.R @@ -0,0 +1,124 @@ + + +#' @importFrom shapley shapley shapley.plot +#' +#' @export + +hmda.compare.shap.plot <- function(hmda.grid.analysis, + newdata = train.hex, + model_id = NULL, + metrics = c("aucpr", "mcc", "f2"), + plot = "shap", + top_n_features = 4) { + + # store the plots + plot_list <- list() + MODEL_IDS <- NULL + MODEL_NAMES <- NULL + n = 0 + + # get the model ids + if (is.null(model_id)) { + for (m in metrics) { + MODEL_IDS <- c(MODEL_IDS, hmda.best.models(hmda.grid.analysis, n_models = 1, metrics = m)$model_ids) + MODEL_NAMES<-c(MODEL_NAMES, paste("\nHighest", toupper(m))) + } + } + else { + MODEL_IDS <- model_id + for (m in seq(length(MODEL_IDS))) { + MODEL_NAMES<-c(MODEL_NAMES, paste("\nModel", m)) + } + } + + for (id in MODEL_IDS) { + n <- n + 1 + + # get the model + # ============================================================ + model <- h2o.getModel(id) + + if (plot == "shap") { + p <- h2o.shap_summary_plot(model, + newdata, + top_n = top_n_features) + + + scale_y_continuous(limits = c(-1, 1), + breaks = seq(-1, 1, by = 0.5)) + + + ggtitle(MODEL_NAMES[n]) + + xlab("") + + ylab("") + + theme_classic() + + labs(colour = "Normalized values\n") + + theme( + legend.position="none", + legend.justification = "right", + legend.title.align = 0.5, + legend.direction = "horizontal", + legend.text=element_text(colour="black", size=6, face="bold"), + plot.title = element_text(size = 10), + legend.key.height = grid::unit(0.3, "cm"), + legend.key.width = grid::unit(1, "cm"), + #legend.margin=margin(grid::unit(0,"cm")), + legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "cm"), + #plot.margin = margin(t = -0.3, r = .25, b = .25, l = .25, unit = "cm") # Reduce top plot margin + ) + } + + if (plot == "bar") { + resultsnow <- shapley( + n_models = 1, + standardize_performance_metric = TRUE, #for a single model + models = id, + newdata = newdata, + top_n_features = top_n_features, + plot = FALSE) + + p <- shapley.plot(resultsnow, top_n_features = top_n_features) + + p <- p + + ggtitle(MODEL_NAMES[n]) + + xlab("") + + ylab("") + # theme_classic() + + theme( + # legend.position="none", + # legend.justification = "right", + # legend.title.align = 0.5, + # legend.direction = "horizontal", + # legend.text=element_text(colour="black", size=6, face="bold"), + plot.title = element_text(size = 10), + # legend.key.height = grid::unit(0.3, "cm"), + # legend.key.width = grid::unit(1, "cm"), + # #legend.margin=margin(grid::unit(0,"cm")), + # legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "cm"), + # #plot.margin = margin(t = -0.3, r = .25, b = .25, l = .25, unit = "cm") # Reduce top plot margin + ) + } + + # save the plot in the list + plot_list[[n]] <- p + } + + + # COMPARE PLOTS + # ============================================================ + library(gridExtra) + combined <- grid.arrange(grobs = plot_list, + nrow = 1, + left = "Top 5 features", + bottom = "Comparison of SHAP contributions of top 5 features across three GBM models" + ) + + print(combined) + return(combined) +} + +# print(compare.shap.plot(hmda.grid.analysis = grid_analysis, +# newdata = splits$hmda.test.hex, +# metrics = c("aucpr", "mcc", "f2"), +# plot = "bar", +# top_n_features = 5)) + + diff --git a/R/hmda.domain.R b/R/hmda.domain.R index a0f8aa1..8854f53 100644 --- a/R/hmda.domain.R +++ b/R/hmda.domain.R @@ -1,6 +1,6 @@ #' @title compute and plot weighted mean SHAP contributions at group level (factors or domains) #' @description This function applies different criteria to visualize SHAP contributions -#' @param shapley object of class 'shapley', as returned by the 'shapley' function +#' @param wmshap object of class 'shapley', as returned by the 'shapley' function #' @param plot character, specifying the type of the plot, which can be either #' 'bar', 'waffle', or 'shap'. The default is 'bar'. #' @param domains character list, specifying the domains for grouping the features' @@ -93,7 +93,7 @@ #' Group2 = c("x25", "x23", "x6", "x27"), #' Group3 = c("x28", "x26")) #' -#' hmda.domain(shapley = wmshap, +#' hmda.domain(wmshap = wmshap, #' plot = "bar", #' domains = domains, #' print = TRUE) @@ -102,21 +102,23 @@ #' @author E. F. Haghish -hmda.domain <- function(shapley, +hmda.domain <- function(wmshap, domains, plot = "bar", legendstyle = "continuous", scale_colour_gradient = NULL, #this is a BUG because it is not implemented # COLORCODE IS MISSING :( - print = FALSE) { + print = FALSE, + xlab = "Factors") { return( - shapley.domain(shapley = shapley, + shapley.domain(shapley = wmshap, domains = domains, plot = plot, legendstyle = legendstyle, scale_colour_gradient = scale_colour_gradient, #this is a BUG because it is not implemented # COLORCODE IS MISSING :( - print = print) + print = print, + xlab = xlab) ) } diff --git a/R/hmda.efa.R b/R/hmda.efa.R index 02316fa..5b16ca0 100644 --- a/R/hmda.efa.R +++ b/R/hmda.efa.R @@ -159,7 +159,7 @@ hmda.efa <- function(df, # message(paste(pa$value$nfact, "factors are recommended, but there might be better solutions (for example, up to", sum(pa$value$fa.values > 0), "factors)")) # } # else message(paste(pa$value$nfact, "factors are recommended")) - message(paste(pa$value$nfact, "factors are recommended")) + message(paste(pa$value$nfact, "factors are recommended\n\n==============================\n\n")) } # Run the exploratory factor analysis @@ -169,10 +169,6 @@ hmda.efa <- function(df, fm = algorithm, rotate = rotation) - - - loadings <- EFAresults$loadings - # # EFAresults <- factanal(~ ., # data = df[, features], # factors = if (!is.null(nfactors)) nfactors else pa$value$nfact, diff --git a/R/hmda.grid.R b/R/hmda.grid.R index 19dfa98..4721665 100644 --- a/R/hmda.grid.R +++ b/R/hmda.grid.R @@ -86,7 +86,7 @@ #' nfolds = 10, #' ntrees = 100, #' seed = 1, -#' hyper_params = gbm_params1) +#' hyper_params = params) #' #' # Assess the performances of the models #' grid_performance <- hmda.grid.analysis(hmda_grid1) diff --git a/R/hmda.grid.analysis.R b/R/hmda.grid.analysis.R index cfd4bae..2c56378 100644 --- a/R/hmda.grid.analysis.R +++ b/R/hmda.grid.analysis.R @@ -87,7 +87,8 @@ hmda.grid.analysis <- function(grid, performance_metrics = c("logloss", "mse", "rmse", "rmsle", "auc", - "aucpr", "mean_per_class_error", "r2"), + "aucpr", "mean_per_class_error", "r2", + "f1","f2","mcc","kappa"), sort_by = "logloss") { # prepare the performance dataset @@ -118,12 +119,13 @@ hmda.grid.analysis <- function(grid, else if (j == "rmse") try(performance[i, j] <- h2o.rmse(MODEL, xval = TRUE), silent = TRUE) # for thresholds metrics... - else if (j %in% c("f1","f2","mcc","kappa")) { + else if (j %in% c("mean_per_class_error","f2","mcc","kappa")) { PERF <- h2o.performance(model = MODEL, xval= TRUE) - if (j == "f1") try(performance[i, j] <- h2o.F1(PERF), silent = TRUE) - else if (j == "f2") try(performance[i, j] <- h2o.F2(PERF), silent = TRUE) - else if (j == "mcc") try(performance[i, j] <- h2o.mcc(PERF), silent = TRUE) - else if (j == "kappa") try(performance[i, j] <- h2otools::kappa(PERF), silent = TRUE) + # if (j == "f1") try(performance[i, j] <- h2o.F1(PERF), silent = TRUE) + if (j == "mean_per_class_error") try(performance[i, j] <- PERF@metrics$mean_per_class_error, silent = TRUE) + else if (j == "f2") try(performance[i, j] <- PERF@metrics$max_criteria_and_metric_scores[2,3], silent = TRUE) + else if (j == "mcc") try(performance[i, j] <- PERF@metrics$max_criteria_and_metric_scores[8,3], silent = TRUE) + else if (j == "kappa") try(performance[i, j] <- h2otools::kappa(PER, max=TRUE), silent = TRUE) } } } diff --git a/R/hmda.plot.R b/R/hmda.plot.R new file mode 100644 index 0000000..1b4409e --- /dev/null +++ b/R/hmda.plot.R @@ -0,0 +1,104 @@ +#' @title Plot WMSHAP contributions +#' @description This function applies different criteria to visualize WMSHAP contributions +#' @param shapley object of class 'shapley', as returned by the 'shapley' function +#' @param plot character, specifying the type of the plot, which can be either +#' 'bar', 'waffle', or 'shap'. The default is 'bar'. +#' @param method Character. The column name in \code{summaryShaps} used +#' for feature selection. Default is \code{"mean"}, which +#' selects important features which have weighted mean shap +#' ratio (WMSHAP) higher than the specified cutoff. Other +#' alternative is "lowerCI", which selects features which +#' their lower bound of confidence interval is higher than +#' the cutoff. +#' @param cutoff numeric, specifying the cutoff for the method used for selecting +#' the top features. +#' @param top_n_features Integer. If specified, the top n features with the +#' highest weighted SHAP values will be selected, overrullung +#' the 'cutoff' and 'method' arguments. +#' @param features character vector, specifying the feature to be plotted. +#' @param legendstyle character, specifying the style of the plot legend, which +#' can be either 'continuous' (default) or 'discrete'. the +#' continuous legend is only applicable to 'shap' plots and +#' other plots only use 'discrete' legend. +#' @param scale_colour_gradient character vector for specifying the color gradients +#' for the plot. +#' @param labels provide feature labels for items in the dataset. If specified, +#' feature descriptions will be shown in the plot instead of the +#' feature names as included in the dataset. To specify the labels, +#' use the \code{c} function and list the labes, e.g., +#' \code{c(feature1 = label2, feature2 = label2, ...)}. +# @importFrom waffle waffle +#' @importFrom h2o h2o.shap_summary_plot h2o.getModel +#' @importFrom ggplot2 scale_colour_gradient2 theme guides guide_legend guide_colourbar +#' margin element_text theme_classic labs ylab xlab ggtitle +#' @author E. F. Haghish +#' @return ggplot object +#' @examples +#' +#' \dontrun{ +#' # load the required libraries for building the base-learners and the ensemble models +#' library(h2o) #shapley supports h2o models +#' library(shapley) +#' +#' # initiate the h2o server +#' h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE) +#' +#' # upload data to h2o cloud +#' prostate_path <- system.file("extdata", "prostate.csv", package = "h2o") +#' prostate <- h2o.importFile(path = prostate_path, header = TRUE) +#' +#' ### H2O provides 2 types of grid search for tuning the models, which are +#' ### AutoML and Grid. Below, I demonstrate how weighted mean shapley values +#' ### can be computed for both types. +#' +#' set.seed(10) +#' +#' ####################################################### +#' ### PREPARE AutoML Grid (takes a couple of minutes) +#' ####################################################### +#' # run AutoML to tune various models (GBM) for 60 seconds +#' y <- "CAPSULE" +#' prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification +#' aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 120, +#' include_algos=c("GBM"), +#' +#' # this setting ensures the models are comparable for building a meta learner +#' seed = 2023, nfolds = 10, +#' keep_cross_validation_predictions = TRUE) +#' +#' ### call 'shapley' function to compute the weighted mean and weighted confidence intervals +#' ### of SHAP values across all trained models. +#' ### Note that the 'newdata' should be the testing dataset! +#' result <- shapley(models = aml, newdata = prostate, plot = TRUE) +#' +#' ####################################################### +#' ### PLOT THE WEIGHTED MEAN SHAP VALUES +#' ####################################################### +#' +#' shapley.plot(result, plot = "bar") +# shapley.plot(result, plot = "waffle") +#' } +#' @export + +hmda.plot <- function(wmshap, + plot = "bar", + method = "mean", + cutoff = 0.01, + top_n_features = NULL, + features = NULL, + legendstyle = "continuous", + scale_colour_gradient = NULL, + labels = NULL) { + + shapley.plot(wmshap, + plot = plot, + method = method, + cutoff = cutoff, + top_n_features = top_n_features, + features = features, + legendstyle = legendstyle, + scale_colour_gradient = scale_colour_gradient, + labels = labels) +} + + diff --git a/R/hmda.plot.metrics.R b/R/hmda.plot.metrics.R new file mode 100644 index 0000000..8bcc1e2 --- /dev/null +++ b/R/hmda.plot.metrics.R @@ -0,0 +1,128 @@ + + + + +#' @export +#' @author E. F. Haghish + +# plot the AUCPR, MCC, and F2 values on a line with ggplot2 +hmda.plot.metrics <- function(df, + metrics, + criteria = "top_models", + top_models = 100, + distance_percentage = 0.99, + plot = TRUE) { + library(reshape2) + library(ggplot2) + IDS <- NULL + # range <- NULL + # min <- NULL + # max <- NULL + + # subset models + # ============================================================ + if (criteria == "top_models") { + df <- df[1:top_models, ] + # range <- c(min(df[, metrics]), max(df[, metrics])) + } + else if (criteria == "distance_percentage") { + + # for each metric get the best model performance + for (met in metrics) { + best_value <- max(df[[met]], na.rm = TRUE) + IDS <- c(IDS, df$model_ids[which(df[[met]] >= best_value * distance_percentage)]) + # range <- c(range, best_value, best_value * distance_percentage) + } + + IDS <- unique(IDS) + df <- df[df$model_ids %in% IDS, ] + } + + # # round range to go to closest .05 + # min <- floor(range[1] * 20) / 20 + # max <- ceiling(range[2] * 20) / 20 + + # change the IDs to index + grid_legnth <- nrow(df) + df$model_ids <- as.integer(seq(1, grid_legnth)) + + + melted <- melt(df, id.vars = "model_ids", measure.vars = metrics) + + + trends_plot <- ggplot(melted, aes(x = model_ids, y = value, color = variable)) + + scale_color_brewer(palette = "Set1") + + geom_line(size = 1, alpha = .3) + + geom_point(size = 2, alpha = .5) + + scale_x_continuous(breaks = seq(0, grid_legnth, by = 10)) + + scale_y_continuous(#limits = c(min, max) , breaks = seq(0.6, 0.8, by = 0.1), expand = c(0, 0) + ) + + labs(x = "\nIndex of models (sorted by AUCPR)", + y = "Performance metrics\n", + color = "") + + #ggtitle("Comparison of AUCPR, MCC, and F2 across top 100 GBM models") + + theme_classic() + + theme( + legend.position="top", + legend.justification = "right", + legend.title.align = 0.5, + legend.direction = "horizontal", + legend.text=element_text(colour="black", size=8, face="bold"), + plot.title = element_text(size = 12), + legend.key.height = grid::unit(0.4, "cm"), + legend.key.width = grid::unit(1.2, "cm"), + #legend.margin=margin(grid::unit(0,"cm")), + legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "cm"), + #plot.margin = margin(t = 0.3, r = .3, b = .25, l = .2, unit = "cm") # Reduce top plot margin + ) + + # get the colors used in the plot + #plot_colors <- scales::hue_pal()(length(metrics)) + + # make a loop for "metrics", get the best value of each metric, and plot the circle with a higher-alpha color + for (i in seq_along(metrics)) { + m <- metrics[i] + best_metric_index <- which.max(df[[m]]) + trends_plot <- trends_plot + + geom_point(data = melted[melted$model_ids == best_metric_index & melted$variable == m, ], size = 2, alpha = 1) + + # geom_point(data = melted[melted$model_ids == best_metric_index & melted$variable == m, ], + # aes(x = model_ids, y = value), + # #color = "black", + # size = 3, + # shape = 21, + # #fill = "yellow" + # ) + + geom_vline(xintercept = best_metric_index, + linetype="dashed", + # add color black with alpha set to 0.5 + #color = "red", + alpha = 0.15 + + ) #+ + # annotate("text", x = best_metric_index + 5, y = melted$value[melted$model_ids == best_metric_index & melted$variable == m], + # label = paste("Best", toupper(m)), + # angle = 90, vjust = -0.5, size = 3) + } + + # make a loop for "metrics" and print the vertical line for all of them + # for (i in seq_along(metrics)) { + # m <- metrics[i] + # best_metric_index <- which.max(df[[m]]) + # trends_plot <- trends_plot + + # geom_vline(xintercept = best_metric_index, + # linetype="dashed", + # #color = plot_colors[i] + # ) + + # annotate("text", x = best_metric_index + 5, y = 0.6 + (i - 1) * 0.05, + # label = paste("Best", toupper(m)), + # #color = plot_colors[i], + # angle = 90, vjust = -0.5, size = 3) + # } + + if (plot) print(trends_plot) + + return(plot) +} + + + diff --git a/R/hmda.test.R b/R/hmda.test.R new file mode 100644 index 0000000..3f4c77b --- /dev/null +++ b/R/hmda.test.R @@ -0,0 +1,81 @@ +#' @title Normalize a vector based on specified minimum and maximum values +#' @description This function normalizes a vector based on specified minimum +#' and maximum values. If the minimum and maximum values are not +#' specified, the function will use the minimum and maximum values +#' of the vector. +#' @param wmshap object of class 'shapley', as returned by the 'shapley' function +#' @param features character, name of two features to be compared with permutation test +#' @param domains list of two character vectors, each including a number of features. +#' @param n integer, number of permutations +#' @importFrom shapley shapley.feature.test shapley.domain.test +#' @author E. F. Haghish +#' @return normalized numeric vector +#' @examples +#' +#' \dontrun{ +#' # load the required libraries for building the base-learners and the ensemble models +#' library(h2o) #shapley supports h2o models +#' library(autoEnsemble) #autoEnsemble models, particularly useful under severe class imbalance +#' library(shapley) +#' +#' # initiate the h2o server +#' h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE) +#' +#' # upload data to h2o cloud +#' prostate_path <- system.file("extdata", "prostate.csv", package = "h2o") +#' prostate <- h2o.importFile(path = prostate_path, header = TRUE) +#' +#' ### H2O provides 2 types of grid search for tuning the models, which are +#' ### AutoML and Grid. Below, I demonstrate how weighted mean shapley values +#' ### can be computed for both types. +#' +#' set.seed(10) +#' +#' ####################################################### +#' ### PREPARE AutoML Grid (takes a couple of minutes) +#' ####################################################### +#' # run AutoML to tune various models (GBM) for 60 seconds +#' y <- "CAPSULE" +#' prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification +#' aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 120, +#' include_algos=c("GBM"), +#' +#' # this setting ensures the models are comparable for building a meta learner +#' seed = 2023, nfolds = 10, +#' keep_cross_validation_predictions = TRUE) +#' +#' ### call 'shapley' function to compute the weighted mean and weighted confidence intervals +#' ### of SHAP values across all trained models. +#' ### Note that the 'newdata' should be the testing dataset! +#' result <- shapley(models = aml, newdata = prostate, plot = TRUE) +#' +#' ####################################################### +#' ### Significance testing of contributions of two features +#' ####################################################### +#' # testing the WMSHAP contributions of two features +#' hmda.test(result, features = c("GLEASON", "PSA"), n = 5000) +#' +#' # testing the WMSHAP contributions of two domains (groups of features, latent factors, etc.) +#' hmda.test(result, +#' n = 5000, +#' domains = list(Demographic = c("RACE", "AGE"), +#' Cancer = c("VOL", "PSA", "GLEASON"))) +#' } +#' @export + +hmda.test <- function(wmshap, features = NULL, domains = NULL, n = 5000) { + + if (!is.null(features) & !is.null(domains)) + stop("either specify a vector of 'features' or a list of 'domains'") + else if (is.null(features) & is.null(domains)) + stop("either specify a vector of 'features' or a list of 'domains'") + + else if (!is.null(features)) { + shapley.feature.test(shapley = wmshap, features = features, n = n) + } + else if (!is.null(domains)) { + shapley.domain.test(shapley = wmshap, domains, n = n) + } +} + + diff --git a/R/hmda.wmshap.R b/R/hmda.wmshap.R index b4ee72d..28b1560 100644 --- a/R/hmda.wmshap.R +++ b/R/hmda.wmshap.R @@ -170,7 +170,7 @@ hmda.wmshap <- function(models, cutoff = 0.01, top_n_features = NULL, n_models = 10, - sample_size = nrow(newdata) + sample_size = NULL #normalize_to = "upperCI" ) { return( @@ -186,7 +186,7 @@ hmda.wmshap <- function(models, cutoff = cutoff, top_n_features = top_n_features, n_models = n_models, - sample_size = nrow(newdata) + sample_size = sample_size #normalize_to = "upperCI" ) ) diff --git a/man/figures/.DS_Store b/man/figures/.DS_Store index cb9523b..e58dca6 100644 Binary files a/man/figures/.DS_Store and b/man/figures/.DS_Store differ diff --git a/man/hmda.best.models.Rd b/man/hmda.best.models.Rd index 3f88b8b..408ece1 100644 --- a/man/hmda.best.models.Rd +++ b/man/hmda.best.models.Rd @@ -4,7 +4,14 @@ \alias{hmda.best.models} \title{Select Best Models Across All Models in HMDA Grid} \usage{ -hmda.best.models(df, n_models = 1) +hmda.best.models( + df, + n_models = NULL, + metrics = c("logloss", "mae", "mse", "rmse", "rmsle", "mean_per_class_error", "auc", + "aucpr", "r2", "accuracy", "f1", "mcc", "f2"), + distance_percentage = NULL, + hyperparam = FALSE +) } \arguments{ \item{df}{A data frame of class \code{"hmda.grid.analysis"} containing diff --git a/man/hmda.domain.Rd b/man/hmda.domain.Rd index a201f9e..0cc3c60 100644 --- a/man/hmda.domain.Rd +++ b/man/hmda.domain.Rd @@ -5,16 +5,17 @@ \title{compute and plot weighted mean SHAP contributions at group level (factors or domains)} \usage{ hmda.domain( - shapley, + wmshap, domains, plot = "bar", legendstyle = "continuous", scale_colour_gradient = NULL, - print = FALSE + print = FALSE, + xlab = "Factors" ) } \arguments{ -\item{shapley}{object of class 'shapley', as returned by the 'shapley' function} +\item{wmshap}{object of class 'shapley', as returned by the 'shapley' function} \item{domains}{character list, specifying the domains for grouping the features' contributions. Domains are clusters of features' names, that @@ -111,7 +112,7 @@ This function applies different criteria to visualize SHAP contributions Group2 = c("x25", "x23", "x6", "x27"), Group3 = c("x28", "x26")) - hmda.domain(shapley = wmshap, + hmda.domain(wmshap = wmshap, plot = "bar", domains = domains, print = TRUE) diff --git a/man/hmda.grid.Rd b/man/hmda.grid.Rd index c1c9171..0077ef2 100644 --- a/man/hmda.grid.Rd +++ b/man/hmda.grid.Rd @@ -122,7 +122,7 @@ The function executes the following steps: nfolds = 10, ntrees = 100, seed = 1, - hyper_params = gbm_params1) + hyper_params = params) # Assess the performances of the models grid_performance <- hmda.grid.analysis(hmda_grid1) diff --git a/man/hmda.grid.analysis.Rd b/man/hmda.grid.analysis.Rd index 8be7830..9089385 100644 --- a/man/hmda.grid.analysis.Rd +++ b/man/hmda.grid.analysis.Rd @@ -7,7 +7,7 @@ hmda.grid.analysis( grid, performance_metrics = c("logloss", "mse", "rmse", "rmsle", "auc", "aucpr", - "mean_per_class_error", "r2"), + "mean_per_class_error", "r2", "f1", "f2", "mcc", "kappa"), sort_by = "logloss" ) } diff --git a/man/hmda.plot.Rd b/man/hmda.plot.Rd new file mode 100644 index 0000000..81f36a9 --- /dev/null +++ b/man/hmda.plot.Rd @@ -0,0 +1,109 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hmda.plot.R +\name{hmda.plot} +\alias{hmda.plot} +\title{Plot WMSHAP contributions} +\usage{ +hmda.plot( + wmshap, + plot = "bar", + method = "mean", + cutoff = 0.01, + top_n_features = NULL, + features = NULL, + legendstyle = "continuous", + scale_colour_gradient = NULL, + labels = NULL +) +} +\arguments{ +\item{plot}{character, specifying the type of the plot, which can be either +'bar', 'waffle', or 'shap'. The default is 'bar'.} + +\item{method}{Character. The column name in \code{summaryShaps} used +for feature selection. Default is \code{"mean"}, which +selects important features which have weighted mean shap +ratio (WMSHAP) higher than the specified cutoff. Other +alternative is "lowerCI", which selects features which +their lower bound of confidence interval is higher than +the cutoff.} + +\item{cutoff}{numeric, specifying the cutoff for the method used for selecting +the top features.} + +\item{top_n_features}{Integer. If specified, the top n features with the +highest weighted SHAP values will be selected, overrullung +the 'cutoff' and 'method' arguments.} + +\item{features}{character vector, specifying the feature to be plotted.} + +\item{legendstyle}{character, specifying the style of the plot legend, which +can be either 'continuous' (default) or 'discrete'. the +continuous legend is only applicable to 'shap' plots and +other plots only use 'discrete' legend.} + +\item{scale_colour_gradient}{character vector for specifying the color gradients +for the plot.} + +\item{labels}{provide feature labels for items in the dataset. If specified, +feature descriptions will be shown in the plot instead of the +feature names as included in the dataset. To specify the labels, +use the \code{c} function and list the labes, e.g., +\code{c(feature1 = label2, feature2 = label2, ...)}.} + +\item{shapley}{object of class 'shapley', as returned by the 'shapley' function} +} +\value{ +ggplot object +} +\description{ +This function applies different criteria to visualize WMSHAP contributions +} +\examples{ + +\dontrun{ +# load the required libraries for building the base-learners and the ensemble models +library(h2o) #shapley supports h2o models +library(shapley) + +# initiate the h2o server +h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE) + +# upload data to h2o cloud +prostate_path <- system.file("extdata", "prostate.csv", package = "h2o") +prostate <- h2o.importFile(path = prostate_path, header = TRUE) + +### H2O provides 2 types of grid search for tuning the models, which are +### AutoML and Grid. Below, I demonstrate how weighted mean shapley values +### can be computed for both types. + +set.seed(10) + +####################################################### +### PREPARE AutoML Grid (takes a couple of minutes) +####################################################### +# run AutoML to tune various models (GBM) for 60 seconds +y <- "CAPSULE" +prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification +aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 120, + include_algos=c("GBM"), + + # this setting ensures the models are comparable for building a meta learner + seed = 2023, nfolds = 10, + keep_cross_validation_predictions = TRUE) + +### call 'shapley' function to compute the weighted mean and weighted confidence intervals +### of SHAP values across all trained models. +### Note that the 'newdata' should be the testing dataset! +result <- shapley(models = aml, newdata = prostate, plot = TRUE) + +####################################################### +### PLOT THE WEIGHTED MEAN SHAP VALUES +####################################################### + +shapley.plot(result, plot = "bar") +} +} +\author{ +E. F. Haghish +} diff --git a/man/hmda.test.Rd b/man/hmda.test.Rd new file mode 100644 index 0000000..0acf4f8 --- /dev/null +++ b/man/hmda.test.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hmda.test.R +\name{hmda.test} +\alias{hmda.test} +\title{Normalize a vector based on specified minimum and maximum values} +\usage{ +hmda.test(wmshap, features = NULL, domains = NULL, n = 5000) +} +\arguments{ +\item{wmshap}{object of class 'shapley', as returned by the 'shapley' function} + +\item{features}{character, name of two features to be compared with permutation test} + +\item{domains}{list of two character vectors, each including a number of features.} + +\item{n}{integer, number of permutations} +} +\value{ +normalized numeric vector +} +\description{ +This function normalizes a vector based on specified minimum + and maximum values. If the minimum and maximum values are not + specified, the function will use the minimum and maximum values + of the vector. +} +\examples{ + +\dontrun{ +# load the required libraries for building the base-learners and the ensemble models +library(h2o) #shapley supports h2o models +library(autoEnsemble) #autoEnsemble models, particularly useful under severe class imbalance +library(shapley) + +# initiate the h2o server +h2o.init(ignore_config = TRUE, nthreads = 2, bind_to_localhost = FALSE, insecure = TRUE) + +# upload data to h2o cloud +prostate_path <- system.file("extdata", "prostate.csv", package = "h2o") +prostate <- h2o.importFile(path = prostate_path, header = TRUE) + +### H2O provides 2 types of grid search for tuning the models, which are +### AutoML and Grid. Below, I demonstrate how weighted mean shapley values +### can be computed for both types. + +set.seed(10) + +####################################################### +### PREPARE AutoML Grid (takes a couple of minutes) +####################################################### +# run AutoML to tune various models (GBM) for 60 seconds +y <- "CAPSULE" +prostate[,y] <- as.factor(prostate[,y]) #convert to factor for classification +aml <- h2o.automl(y = y, training_frame = prostate, max_runtime_secs = 120, + include_algos=c("GBM"), + + # this setting ensures the models are comparable for building a meta learner + seed = 2023, nfolds = 10, + keep_cross_validation_predictions = TRUE) + +### call 'shapley' function to compute the weighted mean and weighted confidence intervals +### of SHAP values across all trained models. +### Note that the 'newdata' should be the testing dataset! +result <- shapley(models = aml, newdata = prostate, plot = TRUE) + +####################################################### +### Significance testing of contributions of two features +####################################################### +# testing the WMSHAP contributions of two features +hmda.test(result, features = c("GLEASON", "PSA"), n = 5000) + +# testing the WMSHAP contributions of two domains (groups of features, latent factors, etc.) +hmda.test(result, + n = 5000, + domains = list(Demographic = c("RACE", "AGE"), + Cancer = c("VOL", "PSA", "GLEASON"))) +} +} +\author{ +E. F. Haghish +} diff --git a/man/hmda.wmshap.Rd b/man/hmda.wmshap.Rd index 561812d..be77232 100644 --- a/man/hmda.wmshap.Rd +++ b/man/hmda.wmshap.Rd @@ -16,7 +16,7 @@ hmda.wmshap( cutoff = 0.01, top_n_features = NULL, n_models = 10, - sample_size = nrow(newdata) + sample_size = NULL ) } \arguments{