mirror of
https://github.com/agdamsbo/FreesearchR.git
synced 2026-06-19 12:37:30 +02:00
feat: new cut option to simplify factors to only the top N levels
This commit is contained in:
parent
a06177481b
commit
0c2b061708
12 changed files with 337 additions and 236 deletions
232
R/cut_var.R
Normal file
232
R/cut_var.R
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
#' Extended cutting function with fall-back to the native base::cut
|
||||
#'
|
||||
#' @param x an object inheriting from class "hms"
|
||||
#' @param ... passed on
|
||||
#'
|
||||
#' @export
|
||||
#' @name cut_var
|
||||
cut_var <- function(x, ...) {
|
||||
UseMethod("cut_var")
|
||||
}
|
||||
|
||||
#' @export
|
||||
#' @name cut_var
|
||||
cut_var.default <- function(x, ...) {
|
||||
base::cut(x, ...)
|
||||
}
|
||||
|
||||
#' @name cut_var
|
||||
#'
|
||||
#' @return factor
|
||||
#' @export
|
||||
#'
|
||||
#' @examples
|
||||
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(2)
|
||||
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var("min")
|
||||
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = "hour")
|
||||
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "08:20:20", "21:20:20", "03:02:20")) |> cut_var(breaks = hms::as_hms(c("01:00:00", "03:01:20", "9:20:20")))
|
||||
#' d_t <- readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA))
|
||||
#' f <- d_t |> cut_var(2)
|
||||
#' readr::parse_time(c("01:00:20", "03:00:20", "01:20:20", "03:02:20", NA)) |> cut_var(breaks = lubridate::as_datetime(c(hms::as_hms(levels(f)), hms::as_hms(max(d_t, na.rm = TRUE) + 1))), right = FALSE)
|
||||
cut_var.hms <- function(x, breaks, ...) {
|
||||
## as_hms keeps returning warnings on tz(); ignored
|
||||
suppressWarnings({
|
||||
if (hms::is_hms(breaks)) {
|
||||
breaks <- lubridate::as_datetime(breaks)
|
||||
}
|
||||
x <- lubridate::as_datetime(x)
|
||||
out <- cut_var.POSIXt(x, breaks = breaks, ...)
|
||||
attr(out, which = "brks") <- hms::as_hms(lubridate::as_datetime(attr(out, which = "brks")))
|
||||
attr(out, which = "levels") <- as.character(hms::as_hms(lubridate::as_datetime(attr(out, which = "levels"))))
|
||||
})
|
||||
out
|
||||
}
|
||||
|
||||
#' @name cut_var
|
||||
#' @param x an object inheriting from class "POSIXt" or "Date"
|
||||
#'
|
||||
#' @examples
|
||||
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
|
||||
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
|
||||
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "month_only")
|
||||
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%A-%H")
|
||||
#' readr::parse_datetime(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = NULL, format = "%W")
|
||||
cut_var.POSIXt <- function(x, breaks, right = FALSE, include.lowest = TRUE, start.on.monday = TRUE, ...) {
|
||||
breaks_o <- breaks
|
||||
args <- list(...)
|
||||
# browser()
|
||||
if (is.numeric(breaks)) {
|
||||
breaks <- quantile(
|
||||
x,
|
||||
probs = seq(0, 1, 1 / breaks),
|
||||
right = right,
|
||||
include.lowest = include.lowest,
|
||||
na.rm = TRUE
|
||||
)
|
||||
}
|
||||
|
||||
if ("format" %in% names(args)) {
|
||||
assertthat::assert_that(is.character(args$format))
|
||||
out <- forcats::as_factor(format(x, format = args$format))
|
||||
} else if (identical(breaks, "weekday")) {
|
||||
## This is
|
||||
ds <- as.Date(1:7) |>
|
||||
(\(.x){
|
||||
sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
|
||||
})()
|
||||
|
||||
if (start.on.monday) {
|
||||
ds <- ds[c(7, 1:6)]
|
||||
}
|
||||
out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
|
||||
} else if (identical(breaks, "month_only")) {
|
||||
## Simplest way to create a vector of all months in order
|
||||
## which will also follow the locale of the machine
|
||||
ms <- paste0("1970-", 1:12, "-01") |>
|
||||
as.Date() |>
|
||||
months()
|
||||
|
||||
out <- factor(months(x), levels = ms) |> forcats::fct_drop()
|
||||
} else {
|
||||
## Doesn't really work very well for breaks other than the special character cases as right border is excluded
|
||||
out <- base::cut.POSIXt(x, breaks = breaks, right = right, ...) |> forcats::fct_drop()
|
||||
# browser()
|
||||
}
|
||||
l <- levels(out)
|
||||
if (is.numeric(breaks_o)) {
|
||||
l <- breaks
|
||||
} else if (is.character(breaks) && length(breaks) == 1 && !(identical(breaks, "weekday") | identical(breaks, "month_only"))) {
|
||||
if (include.lowest) {
|
||||
if (right) {
|
||||
l <- c(l, min(as.character(x)))
|
||||
} else {
|
||||
l <- c(l, max(as.character(x)))
|
||||
}
|
||||
}
|
||||
} else if (length(l) < length(breaks_o)) {
|
||||
l <- breaks_o
|
||||
}
|
||||
|
||||
attr(out, which = "brks") <- l
|
||||
out
|
||||
}
|
||||
|
||||
#' @name cut_var
|
||||
#' @param x an object inheriting from class "POSIXct"
|
||||
cut_var.POSIXct <- cut_var.POSIXt
|
||||
|
||||
#' @name cut_var
|
||||
#' @param x an object inheriting from class "POSIXct"
|
||||
#'
|
||||
#' @examples
|
||||
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(2)
|
||||
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(breaks = "weekday")
|
||||
#' as.Date(c("1992-02-01 01:00:20", "1992-02-06 03:00:20", "1992-05-01 01:20:20", "1992-09-01 08:20:20", "1999-02-01 21:20:20", "1992-12-01 03:02:20")) |> cut_var(format = "%W")
|
||||
cut_var.Date <- function(x, breaks = NULL, start.on.monday = TRUE, ...) {
|
||||
args <- list(...)
|
||||
|
||||
if ("format" %in% names(args)) {
|
||||
assertthat::assert_that(is.character(args$format))
|
||||
out <- forcats::as_factor(format(x, format = args$format))
|
||||
} else if (identical(breaks, "weekday")) {
|
||||
ds <- as.Date(1:7) |>
|
||||
(\(.x){
|
||||
sort_by(format(.x, "%A"), as.numeric(format(.x, "%w")))
|
||||
})()
|
||||
|
||||
if (start.on.monday) {
|
||||
ds <- ds[c(7, 1:6)]
|
||||
}
|
||||
out <- factor(weekdays(x), levels = ds) |> forcats::fct_drop()
|
||||
} else if (identical(breaks, "month_only")) {
|
||||
ms <- paste0("1970-", 1:12, "-01") |>
|
||||
as.Date() |>
|
||||
months()
|
||||
|
||||
out <- factor(months(x), levels = ms) |> forcats::fct_drop()
|
||||
} else {
|
||||
## Doesn't really work very well for breaks other than the special character cases as right border is excluded
|
||||
out <- base::cut.Date(x, breaks = breaks, ...) |> forcats::fct_drop()
|
||||
# browser()
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
#' Simplify a factor to only the top or bottom n levels
|
||||
#'
|
||||
#' @param type
|
||||
#'
|
||||
#' @name cut_var
|
||||
#'
|
||||
#' @returns factor
|
||||
#' @export
|
||||
#'
|
||||
#' @examples
|
||||
#' mtcars$carb |>
|
||||
#' as.factor() |>
|
||||
#' cut_var(2) |>
|
||||
#' table()
|
||||
cut_var.factor <- function(x, breaks = NULL, type = c("top", "bottom"), other = "Other", ...) {
|
||||
args <- list(...)
|
||||
|
||||
if (is.null(breaks)) {
|
||||
return(x)
|
||||
}
|
||||
|
||||
type <- match.arg(type)
|
||||
|
||||
if (type == "top") {
|
||||
lvls <- names(sort(table(x), decreasing = TRUE)[seq_len(breaks)])
|
||||
} else if (type == "bottom") {
|
||||
lvls <- names(sort(table(x), decreasing = FALSE)[seq_len(breaks)])
|
||||
}
|
||||
|
||||
if (other %in% lvls) {
|
||||
other <- paste(other, "_freesearchr")
|
||||
}
|
||||
|
||||
## Relabel and relevel
|
||||
out <- forcats::fct_relabel(
|
||||
x,
|
||||
\(.x){
|
||||
ifelse(.x %in% lvls, .x, other)
|
||||
}
|
||||
) |>
|
||||
forcats::fct_relevel(lvls,other)
|
||||
|
||||
attr(out, which = "brks") <- breaks
|
||||
out
|
||||
}
|
||||
|
||||
|
||||
#' Test class
|
||||
#'
|
||||
#' @param data data
|
||||
#' @param class.vec vector of class names to test
|
||||
#'
|
||||
#' @return factor
|
||||
#' @export
|
||||
#'
|
||||
#' @examples
|
||||
#' \dontrun{
|
||||
#' vapply(REDCapCAST::redcapcast_data, \(.x){
|
||||
#' is_any_class(.x, c("hms", "Date", "POSIXct", "POSIXt"))
|
||||
#' }, logical(1))
|
||||
#' }
|
||||
is_any_class <- function(data, class.vec) {
|
||||
any(class(data) %in% class.vec)
|
||||
}
|
||||
|
||||
#' Test is date/datetime/time
|
||||
#'
|
||||
#' @param data data
|
||||
#'
|
||||
#' @return factor
|
||||
#' @export
|
||||
#'
|
||||
#' @examples
|
||||
#' vapply(REDCapCAST::redcapcast_data, is_datetime, logical(1))
|
||||
is_datetime <- function(data) {
|
||||
is_any_class(data, class.vec = c("hms", "Date", "POSIXct", "POSIXt"))
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue