Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dynamic dataset names #5

Merged
merged 9 commits into from
Feb 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Check available classification and regression data sets from the PM
These data sets cover a range of applications, and include binary/multi-class classification problems and
regression problems, as well as combinations of categorical, ordinal, and continuous features.
There are currently over 150 datasets included in the PMLB repository.
Version: 0.2.3
Version: 0.3.0
Authors@R: c(
person("Trang", "Le", email = "grixor@gmail.com", role = c("aut", "cre"), comment = "https://trang.page/"),
person("makeyourownmaker", email = "makeyourownmaker@gmx.com", role = "aut", comment = "https://github.com/makeyourownmaker"),
Expand All @@ -22,3 +22,6 @@ URL: https://github.com/EpistasisLab/pmlbr
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.2
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,10 @@
S3method(nearest_datasets,character)
S3method(nearest_datasets,data.frame)
S3method(nearest_datasets,default)
export(classification_datasets)
export(dataset_names)
export(fetch_data)
export(nearest_datasets)
export(pmlb_metadata)
export(regression_datasets)
export(summary_stats)
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# pmlbr 0.3.0

# pmlbr 0.2.3

* Use interactive()
Expand Down
39 changes: 0 additions & 39 deletions R/data.R

This file was deleted.

1 change: 0 additions & 1 deletion R/globals.R

This file was deleted.

100 changes: 100 additions & 0 deletions R/list_datasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#' Get metadata for all datasets in PMLB.
#'
#' Metadata like summary statistics and names of available datasets
#' on the PMLB repository.
#'
#' @return A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets
#' @export
#' @examples
#' if (interactive()) {
#' sample(pmlb_metadata()$dataset_names, 10)
#' }
pmlb_metadata <- function() {
if (!exists("summary_stats", envir = .pmlbr_env)) {
links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv'
summary_stats <- utils::read.csv(links_to_stats, sep = '\t')
colnames(summary_stats) <- tolower(gsub(
'X.',
'n_',
colnames(summary_stats)
))
assign(
"summary_stats",
summary_stats,
envir = .pmlbr_env
)
assign(
"dataset_names",
summary_stats$dataset,
envir = .pmlbr_env
)
assign(
"regression_datasets",
sort(summary_stats[summary_stats$task == "regression", "dataset"]),
envir = .pmlbr_env
)
assign(
"classification_datasets",
sort(summary_stats[summary_stats$task == "classification", "dataset"]),
envir = .pmlbr_env
)
}

list(
summary_stats = .pmlbr_env$summary_stats,
dataset_names = .pmlbr_env$dataset_names,
classification_datasets = .pmlbr_env$classification_datasets,
regression_datasets = .pmlbr_env$regression_datasets
)
}


#' All available datasets
#'
#' @return A character vector of all dataset names.
#' @export
#' @examples
#' if (interactive()) {
#' sample(dataset_names(), 10)
#' }
dataset_names <- function() {
pmlb_metadata()$dataset_names
}

#' Classification datasets
#'
#' @return A character vector of classification dataset names.
#' @export
#' @examples
#' if (interactive()) {
#' sample(classification_datasets(), 10)
#' }
classification_datasets <- function() {
pmlb_metadata()$classification_datasets
}

#' Regression datasets
#'
#' @return A character vector of regression dataset names.
#' @export
#' @examples
#' if (interactive()) {
#' sample(regression_datasets(), 10)
#' }
regression_datasets <- function() {
pmlb_metadata()$regression_datasets
}

#' Summary statistics
#'
#' @return A dataframe of summary statistics of all available datasets,
#' including number of instances/rows, number of columns/features, task, etc.
#'
#' @export
#' @examples
#' if (interactive()) {
#' head(summary_stats())
#' }
summary_stats <- function() {
pmlb_metadata()$summary_stats
}
104 changes: 64 additions & 40 deletions R/nearest.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' @param n_neighbors Integer. The number of dataset names to return as neighbors.
#' @param dimensions Character vector specifying dataset characteristics to include in similarity calculation.
#' Dimensions must correspond to numeric columns of
#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summarystats.tsv).
#' If 'all' (default), uses all numeric columns.
#' @param task Character string specifying classification or regression for summary stat generation.
#' @param target_name Character string specifying column of target/dependent variable.
Expand All @@ -26,34 +26,45 @@
#' nearest_datasets('penguins')
#' nearest_datasets(fetch_data('penguins'))
#' }
nearest_datasets <- function(x, ...){
nearest_datasets <- function(x, ...) {
UseMethod('nearest_datasets', x)
}


#' @rdname nearest_datasets-methods
#' @export
nearest_datasets.default <- function(x, ...){
nearest_datasets.default <- function(x, ...) {
stop('`x` must be of class `data.frame` or `character`.')
}


#' @rdname nearest_datasets-methods
#' @export
nearest_datasets.character <- function(
x, n_neighbors = 5,
x,
n_neighbors = 5,
dimensions = c('n_instances', 'n_features'),
target_name = 'target', ...) {

if (!(x %in% dataset_names))
stop("'dataset_name' ", x, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
dataset_stats <- summary_stats[summary_stats$dataset == x, ]

num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
summary_task <- summary_stats[summary_stats$task == dataset_stats$task, ] # restrict to same task
target_name = 'target',
...
) {
if (!(x %in% dataset_names()))
stop(
"'dataset_name' ",
x,
" not found in PMLB.\n * Check spelling, capitalisation etc.",
call. = FALSE
)
sum_stats <- summary_stats()
dataset_stats <- sum_stats[sum_stats$dataset == x, ]

num_cols <- unlist(lapply(
sum_stats,
function(x) is.numeric(x) || is.integer(x)
))
summary_task <- sum_stats[sum_stats$task == dataset_stats$task, ] # restrict to same task
summary_i <- summary_task[, num_cols]

if (length(dimensions) == 1 && dimensions == 'all'){
if (length(dimensions) == 1 && dimensions == 'all') {
dimensions <- colnames(summary_i)
} else {
stopifnot(dimensions %in% colnames(summary_i))
Expand All @@ -70,28 +81,36 @@ nearest_datasets.character <- function(
#' @rdname nearest_datasets-methods
#' @export
nearest_datasets.data.frame <- function(
x, y = NULL, n_neighbors = 5,
x,
y = NULL,
n_neighbors = 5,
dimensions = c('n_instances', 'n_features'),
task = c('classification', 'regression'),
target_name = 'target', ...) {

target_name = 'target',
...
) {
df <- if (is.null(y)) x else data.frame(x, target = y)

# get summary stats for dataset
if (is.null(task)){
task <- if (length(unique(df$target)) < 5) 'classification' else 'regression'
if (is.null(task)) {
task <- if (length(unique(df$target)) < 5) 'classification' else
'regression'
} else {
task <- match.arg(task)
}

if (!(target_name %in% colnames(df)))
stop(paste('Either x or y must contain', target_name))

num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
summary_task <- summary_stats[summary_stats$task == task, ] # restrict to same task
sum_stats <- summary_stats()
num_cols <- unlist(lapply(
sum_stats,
function(x) is.numeric(x) || is.integer(x)
))
summary_task <- sum_stats[sum_stats$task == task, ] # restrict to same task
summary_i <- summary_task[, num_cols]

if (length(dimensions) == 1 && dimensions == 'all'){
if (length(dimensions) == 1 && dimensions == 'all') {
dimensions <- colnames(summary_i)
} else {
stopifnot(dimensions %in% colnames(summary_i))
Expand All @@ -100,22 +119,22 @@ nearest_datasets.data.frame <- function(

feat_names <- setdiff(colnames(df), target_name)
types <- vector('character')
for (i in feat_names){
types[i] <- get_type(df[,i], include_binary = TRUE)
for (i in feat_names) {
types[i] <- get_type(df[, i], include_binary = TRUE)
}

feat <- table(types)
for (type in c('binary', 'categorical', 'continuous')){
for (type in c('binary', 'categorical', 'continuous')) {
if (!type %in% names(feat)) feat[type] <- 0
}
imb <- compute_imbalance(df[, target_name])

dataset_stats <- data.frame(
n_instances = nrow(df),
n_features = length(feat_names),
n_binary_features = feat['binary'],
n_categorical_features = feat['categorical'],
n_continuous_features = feat['continuous'],
n_binary_features = feat[['binary']],
n_categorical_features = feat[['categorical']],
n_continuous_features = feat[['continuous']],
endpoint_type = get_type(df[, target_name]),
n_classes = imb[['num_classes']],
imbalance = imb[['imbalance']],
Expand All @@ -136,23 +155,25 @@ nearest_datasets.data.frame <- function(
#' where zero means that the dataset is perfectly balanced
#' and the higher the value, the more imbalanced the dataset.
#'
compute_imbalance <- function(target_col){
compute_imbalance <- function(target_col) {
imb <- 0
classes_count <- table(target_col)
num_classes <- length(classes_count)
for (x in classes_count){
p_x = x/length(target_col)
for (x in classes_count) {
p_x = x / length(target_col)
}

if (p_x > 0){
imb = imb + (p_x - 1/num_classes)*(p_x - 1/num_classes)
if (p_x > 0) {
imb = imb + (p_x - 1 / num_classes) * (p_x - 1 / num_classes)
}

# worst case scenario: all but 1 examplars in 1st class
# the remaining one in 2nd class
worst_case <- (num_classes-1)*(1/num_classes)^2 + (1-1/num_classes)^2
worst_case <- (num_classes - 1) *
(1 / num_classes)^2 +
(1 - 1 / num_classes)^2

list(num_classes = num_classes, imbalance = imb/worst_case)
list(num_classes = num_classes, imbalance = imb / worst_case)
}

#' Get type/class of given vector.
Expand All @@ -163,14 +184,17 @@ compute_imbalance <- function(target_col){
#'
#' @return Type/class of `x`.
#'
get_type <- function(x, include_binary = FALSE){
get_type <- function(x, include_binary = FALSE) {
x <- stats::na.omit(x)

if (inherits(x, 'numeric')){
if (inherits(x, 'numeric')) {
return('continuous')
} else if (inherits(x, 'integer') || inherits(x, 'factor')){
if (include_binary){
if (length(unique(x)) == 2) return('binary')}
} else if (inherits(x, 'integer') || inherits(x, 'factor')) {
if (include_binary) {
if (length(unique(x)) == 2) return('binary')
}
return('categorical')
} else {stop("Cannot get types for dataset columns")}
} else {
stop("Cannot get types for dataset columns")
}
}
Loading
Loading