2025-03-12 18:27:46 +01:00
|
|
|
#' Readying data for sankey plot
|
|
|
|
#'
|
|
|
|
#' @name data-plots
|
|
|
|
#'
|
|
|
|
#' @returns data.frame
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = sample(c(letters[1:4], NA), 100, TRUE, prob = c(rep(.23, 4), .08)))
|
|
|
|
#' ds |> sankey_ready("first", "last")
|
|
|
|
#' ds |> sankey_ready("first", "last", numbers = "percentage")
|
|
|
|
#' data.frame(
|
|
|
|
#' g = sample(LETTERS[1:2], 100, TRUE),
|
|
|
|
#' first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)),
|
|
|
|
#' last = sample(c(TRUE, FALSE, FALSE), 100, TRUE)
|
|
|
|
#' ) |>
|
|
|
|
#' sankey_ready("first", "last")
|
2025-04-15 12:04:32 +02:00
|
|
|
sankey_ready <- function(data, pri, sec, numbers = "count", ...) {
|
2025-03-12 18:27:46 +01:00
|
|
|
## TODO: Ensure ordering x and y
|
|
|
|
|
|
|
|
## Ensure all are factors
|
2025-04-15 12:04:32 +02:00
|
|
|
data[c(pri, sec)] <- data[c(pri, sec)] |>
|
2025-03-12 18:27:46 +01:00
|
|
|
dplyr::mutate(dplyr::across(!dplyr::where(is.factor), forcats::as_factor))
|
|
|
|
|
2025-04-23 14:25:55 +02:00
|
|
|
out <- dplyr::count(data, !!dplyr::sym(pri), !!dplyr::sym(sec), .drop = FALSE)
|
2025-03-12 18:27:46 +01:00
|
|
|
|
|
|
|
out <- out |>
|
2025-04-15 12:04:32 +02:00
|
|
|
dplyr::group_by(!!dplyr::sym(pri)) |>
|
2025-03-12 18:27:46 +01:00
|
|
|
dplyr::mutate(gx.sum = sum(n)) |>
|
|
|
|
dplyr::ungroup() |>
|
2025-04-15 12:04:32 +02:00
|
|
|
dplyr::group_by(!!dplyr::sym(sec)) |>
|
2025-03-12 18:27:46 +01:00
|
|
|
dplyr::mutate(gy.sum = sum(n)) |>
|
|
|
|
dplyr::ungroup()
|
|
|
|
|
|
|
|
if (numbers == "count") {
|
|
|
|
out <- out |> dplyr::mutate(
|
2025-04-15 12:04:32 +02:00
|
|
|
lx = factor(paste0(!!dplyr::sym(pri), "\n(n=", gx.sum, ")")),
|
|
|
|
ly = factor(paste0(!!dplyr::sym(sec), "\n(n=", gy.sum, ")"))
|
2025-03-12 18:27:46 +01:00
|
|
|
)
|
|
|
|
} else if (numbers == "percentage") {
|
|
|
|
out <- out |> dplyr::mutate(
|
2025-04-15 12:04:32 +02:00
|
|
|
lx = factor(paste0(!!dplyr::sym(pri), "\n(", round((gx.sum / sum(n)) * 100, 1), "%)")),
|
|
|
|
ly = factor(paste0(!!dplyr::sym(sec), "\n(", round((gy.sum / sum(n)) * 100, 1), "%)"))
|
2025-03-12 18:27:46 +01:00
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
if (is.factor(data[[pri]])) {
|
|
|
|
index <- match(levels(data[[pri]]), str_remove_last(levels(out$lx), "\n"))
|
2025-03-12 18:27:46 +01:00
|
|
|
out$lx <- factor(out$lx, levels = levels(out$lx)[index])
|
|
|
|
}
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
if (is.factor(data[[sec]])) {
|
|
|
|
index <- match(levels(data[[sec]]), str_remove_last(levels(out$ly), "\n"))
|
2025-03-12 18:27:46 +01:00
|
|
|
out$ly <- factor(out$ly, levels = levels(out$ly)[index])
|
|
|
|
}
|
|
|
|
|
|
|
|
out
|
|
|
|
}
|
|
|
|
|
|
|
|
str_remove_last <- function(data, pattern = "\n") {
|
|
|
|
strsplit(data, split = pattern) |>
|
|
|
|
lapply(\(.x)paste(unlist(.x[[-length(.x)]]), collapse = pattern)) |>
|
|
|
|
unlist()
|
|
|
|
}
|
|
|
|
|
|
|
|
#' Beautiful sankey plot with option to split by a tertiary group
|
|
|
|
#'
|
|
|
|
#' @returns ggplot2 object
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @name data-plots
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)))
|
|
|
|
#' ds |> plot_sankey("first", "last")
|
2025-04-23 14:25:55 +02:00
|
|
|
#' ds |> plot_sankey("first", "last", color.group = "sec")
|
|
|
|
#' ds |> plot_sankey("first", "last", ter = "g", color.group = "sec")
|
|
|
|
#' mtcars |>
|
|
|
|
#' default_parsing() |>
|
|
|
|
#' plot_sankey("cyl", "gear", "am", color.group = "pri")
|
|
|
|
#' ## In this case, the last plot as the secondary variable in wrong order
|
|
|
|
#' ## Dont know why...
|
|
|
|
#' mtcars |>
|
|
|
|
#' default_parsing() |>
|
|
|
|
#' plot_sankey("cyl", "gear", "vs", color.group = "pri")
|
|
|
|
plot_sankey <- function(data, pri, sec, ter = NULL, color.group = "pri", colors = NULL) {
|
2025-04-15 12:04:32 +02:00
|
|
|
if (!is.null(ter)) {
|
|
|
|
ds <- split(data, data[ter])
|
2025-03-12 18:27:46 +01:00
|
|
|
} else {
|
|
|
|
ds <- list(data)
|
|
|
|
}
|
|
|
|
|
|
|
|
out <- lapply(ds, \(.ds){
|
2025-04-23 14:25:55 +02:00
|
|
|
plot_sankey_single(.ds, pri = pri, sec = sec, color.group = color.group, colors = colors)
|
2025-03-12 18:27:46 +01:00
|
|
|
})
|
|
|
|
|
|
|
|
patchwork::wrap_plots(out)
|
|
|
|
}
|
|
|
|
|
|
|
|
#' Beautiful sankey plot
|
|
|
|
#'
|
|
|
|
#' @param color.group set group to colour by. "x" or "y".
|
|
|
|
#' @param colors optinally specify colors. Give NA color, color for each level
|
|
|
|
#' in primary group and color for each level in secondary group.
|
|
|
|
#' @param ... passed to sankey_ready()
|
|
|
|
#'
|
|
|
|
#' @returns ggplot2 object
|
|
|
|
#' @export
|
|
|
|
#'
|
|
|
|
#' @examples
|
|
|
|
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)))
|
|
|
|
#' ds |> plot_sankey_single("first", "last")
|
2025-04-23 14:25:55 +02:00
|
|
|
#' ds |> plot_sankey_single("first", "last", color.group = "sec")
|
2025-03-12 18:27:46 +01:00
|
|
|
#' data.frame(
|
|
|
|
#' g = sample(LETTERS[1:2], 100, TRUE),
|
|
|
|
#' first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)),
|
|
|
|
#' last = sample(c(TRUE, FALSE, FALSE), 100, TRUE)
|
|
|
|
#' ) |>
|
2025-04-15 12:04:32 +02:00
|
|
|
#' plot_sankey_single("first", "last", color.group = "pri")
|
2025-04-23 14:25:55 +02:00
|
|
|
#' mtcars |>
|
|
|
|
#' default_parsing() |>
|
|
|
|
#' plot_sankey_single("cyl", "vs", color.group = "pri")
|
2025-04-15 12:04:32 +02:00
|
|
|
plot_sankey_single <- function(data, pri, sec, color.group = c("pri", "sec"), colors = NULL, ...) {
|
2025-03-12 18:27:46 +01:00
|
|
|
color.group <- match.arg(color.group)
|
2025-04-23 14:25:55 +02:00
|
|
|
|
|
|
|
data_orig <- data
|
|
|
|
data[c(pri, sec)] <- data[c(pri, sec)] |>
|
|
|
|
dplyr::mutate(dplyr::across(dplyr::where(is.factor), forcats::fct_drop))
|
|
|
|
|
|
|
|
# browser()
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
data <- data |> sankey_ready(pri = pri, sec = sec, ...)
|
2025-03-12 18:27:46 +01:00
|
|
|
|
|
|
|
na.color <- "#2986cc"
|
|
|
|
box.color <- "#1E4B66"
|
|
|
|
|
|
|
|
if (is.null(colors)) {
|
2025-04-15 12:04:32 +02:00
|
|
|
if (color.group == "sec") {
|
2025-04-23 14:25:55 +02:00
|
|
|
main.colors <- viridisLite::viridis(n = length(levels(data_orig[[sec]])))
|
|
|
|
## Only keep colors for included levels
|
|
|
|
main.colors <- main.colors[match(levels(data[[sec]]), levels(data_orig[[sec]]))]
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
secondary.colors <- rep(na.color, length(levels(data[[pri]])))
|
2025-03-12 18:27:46 +01:00
|
|
|
label.colors <- Reduce(c, lapply(list(secondary.colors, rev(main.colors)), contrast_text))
|
|
|
|
} else {
|
2025-04-23 14:25:55 +02:00
|
|
|
main.colors <- viridisLite::viridis(n = length(levels(data_orig[[pri]])))
|
|
|
|
## Only keep colors for included levels
|
|
|
|
main.colors <- main.colors[match(levels(data[[pri]]), levels(data_orig[[pri]]))]
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
secondary.colors <- rep(na.color, length(levels(data[[sec]])))
|
2025-03-12 18:27:46 +01:00
|
|
|
label.colors <- Reduce(c, lapply(list(rev(main.colors), secondary.colors), contrast_text))
|
|
|
|
}
|
|
|
|
colors <- c(na.color, main.colors, secondary.colors)
|
|
|
|
} else {
|
|
|
|
label.colors <- contrast_text(colors)
|
|
|
|
}
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
group_labels <- c(get_label(data, pri), get_label(data, sec)) |>
|
2025-03-12 18:27:46 +01:00
|
|
|
sapply(line_break) |>
|
|
|
|
unname()
|
|
|
|
|
|
|
|
p <- ggplot2::ggplot(data, ggplot2::aes(y = n, axis1 = lx, axis2 = ly))
|
|
|
|
|
2025-04-15 12:04:32 +02:00
|
|
|
if (color.group == "sec") {
|
2025-03-12 18:27:46 +01:00
|
|
|
p <- p +
|
|
|
|
ggalluvial::geom_alluvium(
|
2025-04-23 14:25:55 +02:00
|
|
|
ggplot2::aes(
|
|
|
|
fill = !!dplyr::sym(sec) # ,
|
|
|
|
## Including will print strings when levels are empty
|
|
|
|
# color = !!dplyr::sym(sec)
|
|
|
|
),
|
2025-03-12 18:27:46 +01:00
|
|
|
width = 1 / 16,
|
|
|
|
alpha = .8,
|
|
|
|
knot.pos = 0.4,
|
|
|
|
curve_type = "sigmoid"
|
2025-04-15 12:04:32 +02:00
|
|
|
) + ggalluvial::geom_stratum(ggplot2::aes(fill = !!dplyr::sym(sec)),
|
2025-03-12 18:27:46 +01:00
|
|
|
size = 2,
|
|
|
|
width = 1 / 3.4
|
|
|
|
)
|
|
|
|
} else {
|
|
|
|
p <- p +
|
|
|
|
ggalluvial::geom_alluvium(
|
2025-04-23 14:25:55 +02:00
|
|
|
ggplot2::aes(
|
|
|
|
fill = !!dplyr::sym(pri) # ,
|
|
|
|
# color = !!dplyr::sym(pri)
|
|
|
|
),
|
2025-03-12 18:27:46 +01:00
|
|
|
width = 1 / 16,
|
|
|
|
alpha = .8,
|
|
|
|
knot.pos = 0.4,
|
|
|
|
curve_type = "sigmoid"
|
2025-04-15 12:04:32 +02:00
|
|
|
) + ggalluvial::geom_stratum(ggplot2::aes(fill = !!dplyr::sym(pri)),
|
2025-03-12 18:27:46 +01:00
|
|
|
size = 2,
|
|
|
|
width = 1 / 3.4
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2025-05-10 13:02:04 +02:00
|
|
|
## Will fail to use stat="stratum" if library is not loaded.
|
|
|
|
library(ggalluvial)
|
2025-03-12 18:27:46 +01:00
|
|
|
p +
|
|
|
|
ggplot2::geom_text(
|
|
|
|
stat = "stratum",
|
|
|
|
ggplot2::aes(label = after_stat(stratum)),
|
|
|
|
colour = label.colors,
|
|
|
|
size = 8,
|
|
|
|
lineheight = 1
|
|
|
|
) +
|
|
|
|
ggplot2::scale_x_continuous(
|
|
|
|
breaks = 1:2,
|
|
|
|
labels = group_labels
|
|
|
|
) +
|
|
|
|
ggplot2::scale_fill_manual(values = colors[-1], na.value = colors[1]) +
|
2025-04-23 14:25:55 +02:00
|
|
|
# ggplot2::scale_color_manual(values = main.colors) +
|
2025-03-12 18:27:46 +01:00
|
|
|
ggplot2::theme_void() +
|
|
|
|
ggplot2::theme(
|
|
|
|
legend.position = "none",
|
|
|
|
# panel.grid.major = element_blank(),
|
|
|
|
# panel.grid.minor = element_blank(),
|
|
|
|
# axis.text.y = element_blank(),
|
|
|
|
# axis.title.y = element_blank(),
|
|
|
|
axis.text.x = ggplot2::element_text(size = 20),
|
|
|
|
# text = element_text(size = 5),
|
|
|
|
# plot.title = element_blank(),
|
|
|
|
# panel.background = ggplot2::element_rect(fill = "white"),
|
|
|
|
plot.background = ggplot2::element_rect(fill = "white"),
|
|
|
|
panel.border = ggplot2::element_blank()
|
|
|
|
)
|
|
|
|
}
|