EpistasisLab · trangdata · Feb 27, 2025 · Feb 25, 2025 · Feb 26, 2025 · Feb 26, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,7 +5,7 @@ Description: Check available classification and regression data sets from the PM
     These data sets cover a range of applications, and include binary/multi-class classification problems and 
     regression problems, as well as combinations of categorical, ordinal, and continuous features.
     There are currently over 150 datasets included in the PMLB repository.
-Version: 0.2.3
+Version: 0.3.0
 Authors@R: c(
     person("Trang", "Le", email = "grixor@gmail.com", role = c("aut", "cre"), comment = "https://trang.page/"),
     person("makeyourownmaker", email = "makeyourownmaker@gmx.com", role = "aut", comment = "https://github.com/makeyourownmaker"),
@@ -22,3 +22,6 @@ URL: https://github.com/EpistasisLab/pmlbr
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.2
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,5 +3,10 @@
 S3method(nearest_datasets,character)
 S3method(nearest_datasets,data.frame)
 S3method(nearest_datasets,default)
+export(classification_datasets)
+export(dataset_names)
 export(fetch_data)
 export(nearest_datasets)
+export(pmlb_metadata)
+export(regression_datasets)
+export(summary_stats)
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+# pmlbr 0.3.0
+
 # pmlbr 0.2.3
 
 * Use interactive()

diff --git a/R/data.R b/R/data.R
diff --git a/R/globals.R b/R/globals.R
diff --git a/R/list_datasets.R b/R/list_datasets.R
@@ -0,0 +1,100 @@
+#' Get metadata for all datasets in PMLB.
+#'
+#' Metadata like summary statistics and names of available datasets
+#' on the PMLB repository.
+#'
+#' @return A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(pmlb_metadata()$dataset_names, 10)
+#' }
+pmlb_metadata <- function() {
+  if (!exists("summary_stats", envir = .pmlbr_env)) {
+    links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv'
+    summary_stats <- utils::read.csv(links_to_stats, sep = '\t')
+    colnames(summary_stats) <- tolower(gsub(
+      'X.',
+      'n_',
+      colnames(summary_stats)
+    ))
+    assign(
+      "summary_stats",
+      summary_stats,
+      envir = .pmlbr_env
+    )
+    assign(
+      "dataset_names",
+      summary_stats$dataset,
+      envir = .pmlbr_env
+    )
+    assign(
+      "regression_datasets",
+      sort(summary_stats[summary_stats$task == "regression", "dataset"]),
+      envir = .pmlbr_env
+    )
+    assign(
+      "classification_datasets",
+      sort(summary_stats[summary_stats$task == "classification", "dataset"]),
+      envir = .pmlbr_env
+    )
+  }
+
+  list(
+    summary_stats = .pmlbr_env$summary_stats,
+    dataset_names = .pmlbr_env$dataset_names,
+    classification_datasets = .pmlbr_env$classification_datasets,
+    regression_datasets = .pmlbr_env$regression_datasets
+  )
+}
+
+
+#' All available datasets
+#'
+#' @return A character vector of all dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(dataset_names(), 10)
+#' }
+dataset_names <- function() {
+  pmlb_metadata()$dataset_names
+}
+
+#' Classification datasets
+#'
+#' @return A character vector of classification dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(classification_datasets(), 10)
+#' }
+classification_datasets <- function() {
+  pmlb_metadata()$classification_datasets
+}
+
+#' Regression datasets
+#'
+#' @return A character vector of regression dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(regression_datasets(), 10)
+#' }
+regression_datasets <- function() {
+  pmlb_metadata()$regression_datasets
+}
+
+#' Summary statistics
+#'
+#' @return A dataframe of summary statistics of all available datasets,
+#' including number of instances/rows, number of columns/features, task, etc.
+#'
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   head(summary_stats())
+#' }
+summary_stats <- function() {
+  pmlb_metadata()$summary_stats
+}
diff --git a/R/nearest.R b/R/nearest.R
@@ -10,7 +10,7 @@
 #' @param n_neighbors Integer. The number of dataset names to return as neighbors.
 #' @param dimensions Character vector specifying dataset characteristics to include in similarity calculation.
 #' Dimensions must correspond to numeric columns of
-#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
+#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summarystats.tsv).
 #' If 'all' (default), uses all numeric columns.
 #' @param task Character string specifying classification or regression for summary stat generation.
 #' @param target_name Character string specifying column of target/dependent variable.
@@ -26,34 +26,45 @@
 #'   nearest_datasets('penguins')
 #'   nearest_datasets(fetch_data('penguins'))
 #' }
-nearest_datasets <- function(x, ...){
+nearest_datasets <- function(x, ...) {
   UseMethod('nearest_datasets', x)
 }
 
 
 #' @rdname nearest_datasets-methods
 #' @export
-nearest_datasets.default <- function(x, ...){
+nearest_datasets.default <- function(x, ...) {
   stop('`x` must be of class `data.frame` or `character`.')
 }
 
 
 #' @rdname nearest_datasets-methods
 #' @export
 nearest_datasets.character <- function(
-  x, n_neighbors = 5,
+  x,
+  n_neighbors = 5,
   dimensions = c('n_instances', 'n_features'),
-  target_name = 'target', ...) {
-
-  if (!(x %in% dataset_names))
-    stop("'dataset_name' ", x, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
-  dataset_stats <- summary_stats[summary_stats$dataset == x, ]
-
-  num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
-  summary_task <- summary_stats[summary_stats$task == dataset_stats$task, ] # restrict to same task
+  target_name = 'target',
+  ...
+) {
+  if (!(x %in% dataset_names()))
+    stop(
+      "'dataset_name' ",
+      x,
+      " not found in PMLB.\n * Check spelling, capitalisation etc.",
+      call. = FALSE
+    )
+  sum_stats <- summary_stats()
+  dataset_stats <- sum_stats[sum_stats$dataset == x, ]
+
+  num_cols <- unlist(lapply(
+    sum_stats,
+    function(x) is.numeric(x) || is.integer(x)
+  ))
+  summary_task <- sum_stats[sum_stats$task == dataset_stats$task, ] # restrict to same task
   summary_i <- summary_task[, num_cols]
 
-  if (length(dimensions) == 1 && dimensions == 'all'){
+  if (length(dimensions) == 1 && dimensions == 'all') {
     dimensions <- colnames(summary_i)
   } else {
     stopifnot(dimensions %in% colnames(summary_i))
@@ -70,28 +81,36 @@ nearest_datasets.character <- function(
 #' @rdname nearest_datasets-methods
 #' @export
 nearest_datasets.data.frame <- function(
-  x, y = NULL, n_neighbors = 5,
+  x,
+  y = NULL,
+  n_neighbors = 5,
   dimensions = c('n_instances', 'n_features'),
   task = c('classification', 'regression'),
-  target_name = 'target', ...) {
-
+  target_name = 'target',
+  ...
+) {
   df <- if (is.null(y)) x else data.frame(x, target = y)
 
   # get summary stats for dataset
-  if (is.null(task)){
-    task <- if (length(unique(df$target)) < 5) 'classification' else 'regression'
+  if (is.null(task)) {
+    task <- if (length(unique(df$target)) < 5) 'classification' else
+      'regression'
   } else {
     task <- match.arg(task)
   }
 
   if (!(target_name %in% colnames(df)))
     stop(paste('Either x or y must contain', target_name))
 
-  num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
-  summary_task <- summary_stats[summary_stats$task == task, ] # restrict to same task
+  sum_stats <- summary_stats()
+  num_cols <- unlist(lapply(
+    sum_stats,
+    function(x) is.numeric(x) || is.integer(x)
+  ))
+  summary_task <- sum_stats[sum_stats$task == task, ] # restrict to same task
   summary_i <- summary_task[, num_cols]
 
-  if (length(dimensions) == 1 && dimensions == 'all'){
+  if (length(dimensions) == 1 && dimensions == 'all') {
     dimensions <- colnames(summary_i)
   } else {
     stopifnot(dimensions %in% colnames(summary_i))
@@ -100,22 +119,22 @@ nearest_datasets.data.frame <- function(
 
   feat_names <- setdiff(colnames(df), target_name)
   types <- vector('character')
-  for (i in feat_names){
-    types[i] <- get_type(df[,i], include_binary = TRUE)
+  for (i in feat_names) {
+    types[i] <- get_type(df[, i], include_binary = TRUE)
   }
 
   feat <- table(types)
-  for (type in c('binary', 'categorical', 'continuous')){
+  for (type in c('binary', 'categorical', 'continuous')) {
     if (!type %in% names(feat)) feat[type] <- 0
   }
   imb <- compute_imbalance(df[, target_name])
 
   dataset_stats <- data.frame(
     n_instances = nrow(df),
     n_features = length(feat_names),
-    n_binary_features = feat['binary'],
-    n_categorical_features = feat['categorical'],
-    n_continuous_features = feat['continuous'],
+    n_binary_features = feat[['binary']],
+    n_categorical_features = feat[['categorical']],
+    n_continuous_features = feat[['continuous']],
     endpoint_type = get_type(df[, target_name]),
     n_classes = imb[['num_classes']],
     imbalance = imb[['imbalance']],
@@ -136,23 +155,25 @@ nearest_datasets.data.frame <- function(
 #' where zero means that the dataset is perfectly balanced
 #' and the higher the value, the more imbalanced the dataset.
 #'
-compute_imbalance <- function(target_col){
+compute_imbalance <- function(target_col) {
   imb <- 0
   classes_count <- table(target_col)
   num_classes <- length(classes_count)
-  for (x in classes_count){
-    p_x = x/length(target_col)
+  for (x in classes_count) {
+    p_x = x / length(target_col)
   }
 
-  if (p_x > 0){
-    imb = imb + (p_x - 1/num_classes)*(p_x - 1/num_classes)
+  if (p_x > 0) {
+    imb = imb + (p_x - 1 / num_classes) * (p_x - 1 / num_classes)
   }
 
   # worst case scenario: all but 1 examplars in 1st class
   # the remaining one in 2nd class
-  worst_case <- (num_classes-1)*(1/num_classes)^2 + (1-1/num_classes)^2
+  worst_case <- (num_classes - 1) *
+    (1 / num_classes)^2 +
+    (1 - 1 / num_classes)^2
 
-  list(num_classes = num_classes, imbalance = imb/worst_case)
+  list(num_classes = num_classes, imbalance = imb / worst_case)
 }
 
 #' Get type/class of given vector.
@@ -163,14 +184,17 @@ compute_imbalance <- function(target_col){
 #'
 #' @return Type/class of `x`.
 #'
-get_type <- function(x, include_binary = FALSE){
+get_type <- function(x, include_binary = FALSE) {
   x <- stats::na.omit(x)
 
-  if (inherits(x, 'numeric')){
+  if (inherits(x, 'numeric')) {
     return('continuous')
-  } else if (inherits(x, 'integer') || inherits(x, 'factor')){
-    if (include_binary){
-      if (length(unique(x)) == 2) return('binary')}
+  } else if (inherits(x, 'integer') || inherits(x, 'factor')) {
+    if (include_binary) {
+      if (length(unique(x)) == 2) return('binary')
+    }
     return('categorical')
-  } else {stop("Cannot get types for dataset columns")}
+  } else {
+    stop("Cannot get types for dataset columns")
+  }
 }