FreesearchR/R/cut_var.R

#' Extended cutting function with fall-back to the native base::cut
#'
#' @param x an object inheriting from class "hms"
#' @param ... passed on
#'
#' @export
#' @name cut_var
cut_var <- function(x, ...) {
  UseMethod("cut_var")
}

#' @export
#' @name cut_var
cut_var.default <- function(x, ...) {
  base::cut(x, ...)
}

#' @name cut_var
#'
#' @return factor
#' @export
#'
#' @examples
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(2)
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var("min")
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = "hour")
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = hms::as_hms(c("01:00:00", "03:01:20", "9:20:20")))
#' d_t <- readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA))
#' f <- d_t |> cut_var(2)
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA)) |> cut_var(breaks = lubridate::as_datetime(c(hms::as_hms(levels(f)), hms::as_hms(max(d_t, na.rm = TRUE) + 1))), right = FALSE)
cut_var.hms <- function(x, breaks, ...) {
  ## as_hms keeps returning warnings on tz(); ignored
  suppressWarnings({
    if (hms::is_hms(breaks)) {
      breaks <- lubridate::as_datetime(breaks)
    }
    x <- lubridate::as_datetime(x)
    out <- cut_var.POSIXt(x, breaks = breaks, ...)
    attr(out, which = "brks") <- hms::as_hms(lubridate::as_datetime(attr(out, which = "brks")))
    attr(out, which = "levels") <- as.character(hms::as_hms(lubridate::as_datetime(attr(out, which = "levels"))))
  })
  out
}

#' @name cut_var
#' @param x an object inheriting from class "POSIXt" or "Date"
#'
#' @examples
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "month_only")
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%A-%H")
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%W")
cut_var.POSIXt <- function(x, breaks, right = FALSE, include.lowest = TRUE, start.on.monday = TRUE, ...) {
  breaks_o <- breaks
  args <- list(...)
  # browser()
  if (is.numeric(breaks)) {
    breaks <- quantile(
      x,
      probs = seq(0, 1, 1 / breaks),
      right = right,
      include.lowest = include.lowest,
      na.rm = TRUE
    )
  }

  if ("format" %in% names(args)) {
    assertthat::assert_that(is.character(args$format))
    out <- forcats::as_factor(format(x, format = args$format))
  } else if (identical(breaks, "weekday")) {
    ## This is
    ds <- as.Date(1:7) |>
      (\(.x){
        sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
      })()

    if (start.on.monday) {
      ds <- ds[c(7, 1:6)]
    }
    out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
  } else if (identical(breaks, "month_only")) {
    ## Simplest way to create a vector of all months in order
    ## which will also follow the locale of the machine
    ms <- paste0("1970-", 1:12, "-01") |>
      as.Date() |>
      months()

    out <- factor(months(x), levels = ms) |> forcats::fct_drop()
  } else {
    ## Doesn't really work very well for breaks other than the special character cases as right border is excluded
    out <- base::cut.POSIXt(x, breaks = breaks, right = right, ...) |> forcats::fct_drop()
    # browser()
  }
  l <- levels(out)
  if (is.numeric(breaks_o)) {
    l <- breaks
  } else if (is.character(breaks) && length(breaks) == 1 && !(identical(breaks, "weekday") | identical(breaks, "month_only"))) {
    if (include.lowest) {
      if (right) {
        l <- c(l, min(as.character(x)))
      } else {
        l <- c(l, max(as.character(x)))
      }
    }
  } else if (length(l) < length(breaks_o)) {
    l <- breaks_o
  }

  attr(out, which = "brks") <- l
  out
}

#' @name cut_var
#' @param x an object inheriting from class "POSIXct"
cut_var.POSIXct <- cut_var.POSIXt

#' @name cut_var
#' @param x an object inheriting from class "POSIXct"
#'
#' @examples
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(format = "%W")
cut_var.Date <- function(x, breaks = NULL, start.on.monday = TRUE, ...) {
  args <- list(...)

  if ("format" %in% names(args)) {
    assertthat::assert_that(is.character(args$format))
    out <- forcats::as_factor(format(x, format = args$format))
  } else if (identical(breaks, "weekday")) {
    ds <- as.Date(1:7) |>
      (\(.x){
        sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
      })()

    if (start.on.monday) {
      ds <- ds[c(7, 1:6)]
    }
    out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
  } else if (identical(breaks, "month_only")) {
    ms <- paste0("1970-", 1:12, "-01") |>
      as.Date() |>
      months()

    out <- factor(months(x), levels = ms) |> forcats::fct_drop()
  } else {
    ## Doesn't really work very well for breaks other than the special character cases as right border is excluded
    out <- base::cut.Date(x, breaks = breaks, ...) |> forcats::fct_drop()
    # browser()
  }
  out
}


#' Simplify a factor to only the top or bottom n levels
#'
#' @param type
#'
#' @name cut_var
#'
#' @returns factor
#' @export
#'
#' @examples
#' mtcars$carb |>
#'   as.factor() |>
#'   cut_var(2) |>
#'   table()
cut_var.factor <- function(x, breaks = NULL, type = c("top", "bottom"), other = "Other", ...) {
  args <- list(...)

  if (is.null(breaks)) {
    return(x)
  }

  type <- match.arg(type)

  if (type == "top") {
    lvls <- names(sort(table(x), decreasing = TRUE)[seq_len(breaks)])
  } else if (type == "bottom") {
    lvls <- names(sort(table(x), decreasing = FALSE)[seq_len(breaks)])
  }

  if (other %in% lvls) {
    other <- paste(other, "_freesearchr")
  }

  ## Relabel and relevel
  out <- forcats::fct_relabel(
    x,
    \(.x){
      ifelse(.x %in% lvls, .x, other)
    }
  ) |>
    forcats::fct_relevel(lvls,other)

  attr(out, which = "brks") <- breaks
  out
}


#' Test class
#'
#' @param data data
#' @param class.vec vector of class names to test
#'
#' @return factor
#' @export
#'
#' @examples
#' \dontrun{
#' vapply(REDCapCAST::redcapcast_data, \(.x){
#'   is_any_class(.x, c("hms", "Date", "POSIXct", "POSIXt"))
#' }, logical(1))
#' }
is_any_class <- function(data, class.vec) {
  any(class(data) %in% class.vec)
}

#' Test is date/datetime/time
#'
#' @param data data
#'
#' @return factor
#' @export
#'
#' @examples
#' vapply(REDCapCAST::redcapcast_data, is_datetime, logical(1))
is_datetime <- function(data) {
  is_any_class(data, class.vec = c("hms", "Date", "POSIXct", "POSIXt"))
}
feat: new cut option to simplify factors to only the top N levels 2025-10-21 21:30:07 +03:00			`#' Extended cutting function with fall-back to the native base::cut`
			`#'`
			`#' @param x an object inheriting from class "hms"`
			`#' @param ... passed on`
			`#'`
			`#' @export`
			`#' @name cut_var`
			`cut_var <- function(x, ...) {`
			`UseMethod("cut_var")`
			`}`

			`#' @export`
			`#' @name cut_var`
			`cut_var.default <- function(x, ...) {`
			`base::cut(x, ...)`
			`}`

			`#' @name cut_var`
			`#'`
			`#' @return factor`
			`#' @export`
			`#'`
			`#' @examples`
			`#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) \|> cut_var(2)`
			`#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) \|> cut_var("min")`
			`#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) \|> cut_var(breaks = "hour")`
			`#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) \|> cut_var(breaks = hms::as_hms(c("01:00:00", "03:01:20", "9:20:20")))`
			`#' d_t <- readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA))`
			`#' f <- d_t \|> cut_var(2)`
			`#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA)) \|> cut_var(breaks = lubridate::as_datetime(c(hms::as_hms(levels(f)), hms::as_hms(max(d_t, na.rm = TRUE) + 1))), right = FALSE)`
			`cut_var.hms <- function(x, breaks, ...) {`
			`## as_hms keeps returning warnings on tz(); ignored`
			`suppressWarnings({`
			`if (hms::is_hms(breaks)) {`
			`breaks <- lubridate::as_datetime(breaks)`
			`}`
			`x <- lubridate::as_datetime(x)`
			`out <- cut_var.POSIXt(x, breaks = breaks, ...)`
			`attr(out, which = "brks") <- hms::as_hms(lubridate::as_datetime(attr(out, which = "brks")))`
			`attr(out, which = "levels") <- as.character(hms::as_hms(lubridate::as_datetime(attr(out, which = "levels"))))`
			`})`
			`out`
			`}`

			`#' @name cut_var`
			`#' @param x an object inheriting from class "POSIXt" or "Date"`
			`#'`
			`#' @examples`
			`#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(2)`
			`#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(breaks = "weekday")`
			`#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(breaks = "month_only")`
			`#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(breaks = NULL, format = "%A-%H")`
			`#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(breaks = NULL, format = "%W")`
			`cut_var.POSIXt <- function(x, breaks, right = FALSE, include.lowest = TRUE, start.on.monday = TRUE, ...) {`
			`breaks_o <- breaks`
			`args <- list(...)`
			`# browser()`
			`if (is.numeric(breaks)) {`
			`breaks <- quantile(`
			`x,`
			`probs = seq(0, 1, 1 / breaks),`
			`right = right,`
			`include.lowest = include.lowest,`
			`na.rm = TRUE`
			`)`
			`}`

			`if ("format" %in% names(args)) {`
			`assertthat::assert_that(is.character(args$format))`
			`out <- forcats::as_factor(format(x, format = args$format))`
			`} else if (identical(breaks, "weekday")) {`
			`## This is`
			`ds <- as.Date(1:7) \|>`
			`(\(.x){`
			`sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))`
			`})()`

			`if (start.on.monday) {`
			`ds <- ds[c(7, 1:6)]`
			`}`
			`out <- factor(weekdays(x), levels = ds) \|> forcats::fct_drop()`
			`} else if (identical(breaks, "month_only")) {`
			`## Simplest way to create a vector of all months in order`
			`## which will also follow the locale of the machine`
			`ms <- paste0("1970-", 1:12, "-01") \|>`
			`as.Date() \|>`
			`months()`

			`out <- factor(months(x), levels = ms) \|> forcats::fct_drop()`
			`} else {`
			`## Doesn't really work very well for breaks other than the special character cases as right border is excluded`
			`out <- base::cut.POSIXt(x, breaks = breaks, right = right, ...) \|> forcats::fct_drop()`
			`# browser()`
			`}`
			`l <- levels(out)`
			`if (is.numeric(breaks_o)) {`
			`l <- breaks`
			`} else if (is.character(breaks) && length(breaks) == 1 && !(identical(breaks, "weekday") \| identical(breaks, "month_only"))) {`
			`if (include.lowest) {`
			`if (right) {`
			`l <- c(l, min(as.character(x)))`
			`} else {`
			`l <- c(l, max(as.character(x)))`
			`}`
			`}`
			`} else if (length(l) < length(breaks_o)) {`
			`l <- breaks_o`
			`}`

			`attr(out, which = "brks") <- l`
			`out`
			`}`

			`#' @name cut_var`
			`#' @param x an object inheriting from class "POSIXct"`
			`cut_var.POSIXct <- cut_var.POSIXt`

			`#' @name cut_var`
			`#' @param x an object inheriting from class "POSIXct"`
			`#'`
			`#' @examples`
			`#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(2)`
			`#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(breaks = "weekday")`
			`#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) \|> cut_var(format = "%W")`
			`cut_var.Date <- function(x, breaks = NULL, start.on.monday = TRUE, ...) {`
			`args <- list(...)`

			`if ("format" %in% names(args)) {`
			`assertthat::assert_that(is.character(args$format))`
			`out <- forcats::as_factor(format(x, format = args$format))`
			`} else if (identical(breaks, "weekday")) {`
			`ds <- as.Date(1:7) \|>`
			`(\(.x){`
			`sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))`
			`})()`

			`if (start.on.monday) {`
			`ds <- ds[c(7, 1:6)]`
			`}`
			`out <- factor(weekdays(x), levels = ds) \|> forcats::fct_drop()`
			`} else if (identical(breaks, "month_only")) {`
			`ms <- paste0("1970-", 1:12, "-01") \|>`
			`as.Date() \|>`
			`months()`

			`out <- factor(months(x), levels = ms) \|> forcats::fct_drop()`
			`} else {`
			`## Doesn't really work very well for breaks other than the special character cases as right border is excluded`
			`out <- base::cut.Date(x, breaks = breaks, ...) \|> forcats::fct_drop()`
			`# browser()`
			`}`
			`out`
			`}`


			`#' Simplify a factor to only the top or bottom n levels`
			`#'`
			`#' @param type`
			`#'`
			`#' @name cut_var`
			`#'`
			`#' @returns factor`
			`#' @export`
			`#'`
			`#' @examples`
			`#' mtcars$carb \|>`
			`#' as.factor() \|>`
			`#' cut_var(2) \|>`
			`#' table()`
			`cut_var.factor <- function(x, breaks = NULL, type = c("top", "bottom"), other = "Other", ...) {`
			`args <- list(...)`

			`if (is.null(breaks)) {`
			`return(x)`
			`}`

			`type <- match.arg(type)`

			`if (type == "top") {`
			`lvls <- names(sort(table(x), decreasing = TRUE)[seq_len(breaks)])`
			`} else if (type == "bottom") {`
			`lvls <- names(sort(table(x), decreasing = FALSE)[seq_len(breaks)])`
			`}`

			`if (other %in% lvls) {`
			`other <- paste(other, "_freesearchr")`
			`}`

			`## Relabel and relevel`
			`out <- forcats::fct_relabel(`
			`x,`
			`\(.x){`
			`ifelse(.x %in% lvls, .x, other)`
			`}`
			`) \|>`
			`forcats::fct_relevel(lvls,other)`

			`attr(out, which = "brks") <- breaks`
			`out`
			`}`


			`#' Test class`
			`#'`
			`#' @param data data`
			`#' @param class.vec vector of class names to test`
			`#'`
			`#' @return factor`
			`#' @export`
			`#'`
			`#' @examples`
			`#' \dontrun{`
			`#' vapply(REDCapCAST::redcapcast_data, \(.x){`
			`#' is_any_class(.x, c("hms", "Date", "POSIXct", "POSIXt"))`
			`#' }, logical(1))`
			`#' }`
			`is_any_class <- function(data, class.vec) {`
			`any(class(data) %in% class.vec)`
			`}`

			`#' Test is date/datetime/time`
			`#'`
			`#' @param data data`
			`#'`
			`#' @return factor`
			`#' @export`
			`#'`
			`#' @examples`
			`#' vapply(REDCapCAST::redcapcast_data, is_datetime, logical(1))`
			`is_datetime <- function(data) {`
			`is_any_class(data, class.vec = c("hms", "Date", "POSIXct", "POSIXt"))`
			`}`