feat: new cut option to simplify factors to only the top N levels

2026-06-19 12:37:30 +02:00 · 2025-10-21 21:30:07 +03:00 · 2025-10-21 21:30:07 +03:00 · 0c2b061708
commit 0c2b061708
parent a06177481b
12 changed files with 337 additions and 236 deletions
--- a/R/cut_var.R
+++ b/R/cut_var.R
@ -0,0 +1,232 @@
+#' Extended cutting function with fall-back to the native base::cut
+#'
+#' @param x an object inheriting from class "hms"
+#' @param ... passed on
+#'
+#' @export
+#' @name cut_var
+cut_var <- function(x, ...) {
+  UseMethod("cut_var")
+}
+
+#' @export
+#' @name cut_var
+cut_var.default <- function(x, ...) {
+  base::cut(x, ...)
+}
+
+#' @name cut_var
+#'
+#' @return factor
+#' @export
+#'
+#' @examples
+#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(2)
+#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var("min")
+#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = "hour")
+#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = hms::as_hms(c("01:00:00", "03:01:20", "9:20:20")))
+#' d_t <- readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA))
+#' f <- d_t |> cut_var(2)
+#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA)) |> cut_var(breaks = lubridate::as_datetime(c(hms::as_hms(levels(f)), hms::as_hms(max(d_t, na.rm = TRUE) + 1))), right = FALSE)
+cut_var.hms <- function(x, breaks, ...) {
+  ## as_hms keeps returning warnings on tz(); ignored
+  suppressWarnings({
+    if (hms::is_hms(breaks)) {
+      breaks <- lubridate::as_datetime(breaks)
+    }
+    x <- lubridate::as_datetime(x)
+    out <- cut_var.POSIXt(x, breaks = breaks, ...)
+    attr(out, which = "brks") <- hms::as_hms(lubridate::as_datetime(attr(out, which = "brks")))
+    attr(out, which = "levels") <- as.character(hms::as_hms(lubridate::as_datetime(attr(out, which = "levels"))))
+  })
+  out
+}
+
+#' @name cut_var
+#' @param x an object inheriting from class "POSIXt" or "Date"
+#'
+#' @examples
+#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
+#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
+#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "month_only")
+#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%A-%H")
+#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%W")
+cut_var.POSIXt <- function(x, breaks, right = FALSE, include.lowest = TRUE, start.on.monday = TRUE, ...) {
+  breaks_o <- breaks
+  args <- list(...)
+  # browser()
+  if (is.numeric(breaks)) {
+    breaks <- quantile(
+      x,
+      probs = seq(0, 1, 1 / breaks),
+      right = right,
+      include.lowest = include.lowest,
+      na.rm = TRUE
+    )
+  }
+
+  if ("format" %in% names(args)) {
+    assertthat::assert_that(is.character(args$format))
+    out <- forcats::as_factor(format(x, format = args$format))
+  } else if (identical(breaks, "weekday")) {
+    ## This is
+    ds <- as.Date(1:7) |>
+      (\(.x){
+        sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
+      })()
+
+    if (start.on.monday) {
+      ds <- ds[c(7, 1:6)]
+    }
+    out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
+  } else if (identical(breaks, "month_only")) {
+    ## Simplest way to create a vector of all months in order
+    ## which will also follow the locale of the machine
+    ms <- paste0("1970-", 1:12, "-01") |>
+      as.Date() |>
+      months()
+
+    out <- factor(months(x), levels = ms) |> forcats::fct_drop()
+  } else {
+    ## Doesn't really work very well for breaks other than the special character cases as right border is excluded
+    out <- base::cut.POSIXt(x, breaks = breaks, right = right, ...) |> forcats::fct_drop()
+    # browser()
+  }
+  l <- levels(out)
+  if (is.numeric(breaks_o)) {
+    l <- breaks
+  } else if (is.character(breaks) && length(breaks) == 1 && !(identical(breaks, "weekday") | identical(breaks, "month_only"))) {
+    if (include.lowest) {
+      if (right) {
+        l <- c(l, min(as.character(x)))
+      } else {
+        l <- c(l, max(as.character(x)))
+      }
+    }
+  } else if (length(l) < length(breaks_o)) {
+    l <- breaks_o
+  }
+
+  attr(out, which = "brks") <- l
+  out
+}
+
+#' @name cut_var
+#' @param x an object inheriting from class "POSIXct"
+cut_var.POSIXct <- cut_var.POSIXt
+
+#' @name cut_var
+#' @param x an object inheriting from class "POSIXct"
+#'
+#' @examples
+#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
+#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
+#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(format = "%W")
+cut_var.Date <- function(x, breaks = NULL, start.on.monday = TRUE, ...) {
+  args <- list(...)
+
+  if ("format" %in% names(args)) {
+    assertthat::assert_that(is.character(args$format))
+    out <- forcats::as_factor(format(x, format = args$format))
+  } else if (identical(breaks, "weekday")) {
+    ds <- as.Date(1:7) |>
+      (\(.x){
+        sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
+      })()
+
+    if (start.on.monday) {
+      ds <- ds[c(7, 1:6)]
+    }
+    out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
+  } else if (identical(breaks, "month_only")) {
+    ms <- paste0("1970-", 1:12, "-01") |>
+      as.Date() |>
+      months()
+
+    out <- factor(months(x), levels = ms) |> forcats::fct_drop()
+  } else {
+    ## Doesn't really work very well for breaks other than the special character cases as right border is excluded
+    out <- base::cut.Date(x, breaks = breaks, ...) |> forcats::fct_drop()
+    # browser()
+  }
+  out
+}
+
+
+#' Simplify a factor to only the top or bottom n levels
+#'
+#' @param type
+#'
+#' @name cut_var
+#'
+#' @returns factor
+#' @export
+#'
+#' @examples
+#' mtcars$carb |>
+#'   as.factor() |>
+#'   cut_var(2) |>
+#'   table()
+cut_var.factor <- function(x, breaks = NULL, type = c("top", "bottom"), other = "Other", ...) {
+  args <- list(...)
+
+  if (is.null(breaks)) {
+    return(x)
+  }
+
+  type <- match.arg(type)
+
+  if (type == "top") {
+    lvls <- names(sort(table(x), decreasing = TRUE)[seq_len(breaks)])
+  } else if (type == "bottom") {
+    lvls <- names(sort(table(x), decreasing = FALSE)[seq_len(breaks)])
+  }
+
+  if (other %in% lvls) {
+    other <- paste(other, "_freesearchr")
+  }
+
+  ## Relabel and relevel
+  out <- forcats::fct_relabel(
+    x,
+    \(.x){
+      ifelse(.x %in% lvls, .x, other)
+    }
+  ) |>
+    forcats::fct_relevel(lvls,other)
+
+  attr(out, which = "brks") <- breaks
+  out
+}
+
+
+#' Test class
+#'
+#' @param data data
+#' @param class.vec vector of class names to test
+#'
+#' @return factor
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' vapply(REDCapCAST::redcapcast_data, \(.x){
+#'   is_any_class(.x, c("hms", "Date", "POSIXct", "POSIXt"))
+#' }, logical(1))
+#' }
+is_any_class <- function(data, class.vec) {
+  any(class(data) %in% class.vec)
+}
+
+#' Test is date/datetime/time
+#'
+#' @param data data
+#'
+#' @return factor
+#' @export
+#'
+#' @examples
+#' vapply(REDCapCAST::redcapcast_data, is_datetime, logical(1))
+is_datetime <- function(data) {
+  is_any_class(data, class.vec = c("hms", "Date", "POSIXct", "POSIXt"))
+}