FreesearchR/R/plot_sankey.R
Andreas Gammelgaard Damsbo efc3f8acc3
Some checks are pending
pkgdown.yaml / pkgdown (push) Waiting to run
minor steps
2025-03-12 18:27:46 +01:00

200 lines
6.5 KiB
R

#' Readying data for sankey plot
#'
#' Counts every combination of `x` and `y` and adds two label columns, `lx`
#' and `ly`, that combine each level name with its marginal count or
#' percentage. Columns that are not already factors are converted with
#' [forcats::as_factor()].
#'
#' @name data-plots
#'
#' @param numbers what to print below each level name in the labels, either
#' `"count"` (default) or `"percentage"`.
#'
#' @returns data.frame
#' @export
#'
#' @examples
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = sample(c(letters[1:4], NA), 100, TRUE, prob = c(rep(.23, 4), .08)))
#' ds |> sankey_ready("first", "last")
#' ds |> sankey_ready("first", "last", numbers = "percentage")
#' data.frame(
#' g = sample(LETTERS[1:2], 100, TRUE),
#' first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)),
#' last = sample(c(TRUE, FALSE, FALSE), 100, TRUE)
#' ) |>
#' sankey_ready("first", "last")
sankey_ready <- function(data, x, y, numbers = "count", ...) {
  # Reject unsupported label styles up front. Without this no label columns
  # are created and the level re-ordering below fails with an unrelated error.
  numbers <- match.arg(numbers, c("count", "percentage"))

  ## TODO: Ensure ordering x and y
  ## Ensure all are factors
  data[c(x, y)] <- data[c(x, y)] |>
    dplyr::mutate(dplyr::across(!dplyr::where(is.factor), forcats::as_factor))

  x_sym <- dplyr::sym(x)
  y_sym <- dplyr::sym(y)

  # One row per x/y combination, plus the marginal totals of each variable.
  out <- dplyr::count(data, !!x_sym, !!y_sym) |>
    dplyr::group_by(!!x_sym) |>
    dplyr::mutate(gx.sum = sum(n)) |>
    dplyr::ungroup() |>
    dplyr::group_by(!!y_sym) |>
    dplyr::mutate(gy.sum = sum(n)) |>
    dplyr::ungroup()

  if (numbers == "count") {
    out <- out |> dplyr::mutate(
      lx = factor(paste0(!!x_sym, "\n(n=", gx.sum, ")")),
      ly = factor(paste0(!!y_sym, "\n(n=", gy.sum, ")"))
    )
  } else {
    # Percentages are relative to the total number of counted observations.
    out <- out |> dplyr::mutate(
      lx = factor(paste0(!!x_sym, "\n(", round((gx.sum / sum(n)) * 100, 1), "%)")),
      ly = factor(paste0(!!y_sym, "\n(", round((gy.sum / sum(n)) * 100, 1), "%)"))
    )
  }

  # factor() sorts the label levels; put them back in the order of the
  # original factor levels by matching on the label text before the last "\n".
  # Both columns are guaranteed to be factors after the coercion above.
  index <- match(levels(data[[x]]), str_remove_last(levels(out$lx), "\n"))
  out$lx <- factor(out$lx, levels = levels(out$lx)[index])

  index <- match(levels(data[[y]]), str_remove_last(levels(out$ly), "\n"))
  out$ly <- factor(out$ly, levels = levels(out$ly)[index])

  out
}
# Remove the last `pattern`-delimited piece from each string.
#
# Each element of `data` is split on `pattern` (interpreted by strsplit(), so
# a regular expression), the final piece is dropped and the remaining pieces
# are pasted back together using `pattern` verbatim as the separator.
# Elements in which `pattern` does not occur are returned unchanged.
#
# data:    character vector.
# pattern: separator to split on and to re-join with.
# Returns a character vector of the same length as `data`.
str_remove_last <- function(data, pattern = "\n") {
  pieces <- strsplit(data, split = pattern)
  vapply(
    pieces,
    function(parts) {
      n_parts <- length(parts)
      if (n_parts == 0L) {
        # strsplit() yields no pieces for an empty string.
        return("")
      }
      if (n_parts == 1L) {
        # Separator absent: nothing to remove.
        return(parts)
      }
      # Single-bracket negative indexing drops only the last piece; `[[`
      # with a negative index fails for anything but exactly two pieces.
      paste(parts[-n_parts], collapse = pattern)
    },
    character(1)
  )
}
#' Beautiful sankey plot with option to split by a tertiary group
#'
#' Builds one sankey plot per level of `z` (or a single plot when `z` is
#' `NULL`) with `plot_sankey_single()` and combines them with
#' [patchwork::wrap_plots()].
#'
#' @returns ggplot2 object
#' @export
#'
#' @name data-plots
#'
#' @examples
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)))
#' ds |> plot_sankey("first", "last")
#' ds |> plot_sankey("first", "last", color.group = "y")
#' ds |> plot_sankey("first", "last", z = "g", color.group = "y")
plot_sankey <- function(data, x, y, z = NULL, color.group = "x", colors = NULL) {
  # One subset per level of `z`, or the full data as the only subset.
  panels <- if (is.null(z)) list(data) else split(data, data[z])

  # Each subset is passed as the first (data) argument of the single-plot
  # function; the remaining arguments are forwarded unchanged.
  plots <- lapply(
    panels,
    plot_sankey_single,
    x = x,
    y = y,
    color.group = color.group,
    colors = colors
  )

  patchwork::wrap_plots(plots)
}
# Default plot theme: ggplot2's void theme.
# The call is namespaced so it does not depend on ggplot2 being attached,
# in line with the other ggplot2 calls in this file.
default_theme <- function() {
  ggplot2::theme_void()
}
#' Beautiful sankey plot
#'
#' Aggregates the data with `sankey_ready()` and draws a two-axis alluvial
#' plot of the flow from the levels of `x` to the levels of `y`.
#'
#' @param data data.frame
#' @param x name of the column shown on the first axis.
#' @param y name of the column shown on the second axis.
#' @param color.group set group to colour by. "x" or "y".
#' @param colors optionally specify colors. Give NA color, color for each level
#' in primary group and color for each level in secondary group.
#' @param ... passed to sankey_ready()
#'
#' @returns ggplot2 object
#' @export
#'
#' @examples
#' ds <- data.frame(g = sample(LETTERS[1:2], 100, TRUE), first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)), last = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)))
#' ds |> plot_sankey_single("first", "last")
#' ds |> plot_sankey_single("first", "last", color.group = "y")
#' data.frame(
#' g = sample(LETTERS[1:2], 100, TRUE),
#' first = REDCapCAST::as_factor(sample(letters[1:4], 100, TRUE)),
#' last = sample(c(TRUE, FALSE, FALSE), 100, TRUE)
#' ) |>
#' plot_sankey_single("first", "last", color.group = "x")
plot_sankey_single <- function(data, x, y, color.group = c("x", "y"), colors = NULL, ...) {
  color.group <- match.arg(color.group)
  data <- data |> sankey_ready(x = x, y = y, ...)

  na.color <- "#2986cc"

  # `main.var` drives fill and outline colours; `secondary.var` is the other
  # axis.
  if (color.group == "y") {
    main.var <- y
    secondary.var <- x
  } else {
    main.var <- x
    secondary.var <- y
  }

  if (is.null(colors)) {
    main.colors <- viridisLite::viridis(n = nlevels(data[[main.var]]))
    secondary.colors <- rep(na.color, nlevels(data[[secondary.var]]))
    # Label colours are listed in axis order (x strata first, then y strata),
    # with the main palette reversed.
    # NOTE(review): the reversal presumably mirrors the order in which strata
    # are stacked - confirm.
    if (color.group == "y") {
      label.colors <- Reduce(c, lapply(list(secondary.colors, rev(main.colors)), contrast_text))
    } else {
      label.colors <- Reduce(c, lapply(list(rev(main.colors), secondary.colors), contrast_text))
    }
    colors <- c(na.color, main.colors, secondary.colors)
  } else {
    # The outline scale must use the same palette as the fill scale below.
    # Previously `main.colors` was undefined on this path and the function
    # failed as soon as custom colours were supplied.
    main.colors <- colors[-1]
    # NOTE(review): this gives one label colour per supplied colour (including
    # the NA colour); geom_text() needs length 1 or one per stratum - confirm
    # the intended layout of `colors`.
    label.colors <- contrast_text(colors)
  }

  group_labels <- c(get_label(data, x), get_label(data, y)) |>
    sapply(line_break) |>
    unname()

  fill_sym <- dplyr::sym(main.var)

  ggplot2::ggplot(data, ggplot2::aes(y = n, axis1 = lx, axis2 = ly)) +
    ggalluvial::geom_alluvium(
      ggplot2::aes(fill = !!fill_sym, color = !!fill_sym),
      width = 1 / 16,
      alpha = .8,
      knot.pos = 0.4,
      curve_type = "sigmoid"
    ) +
    ggalluvial::geom_stratum(
      ggplot2::aes(fill = !!fill_sym),
      size = 2,
      width = 1 / 3.4
    ) +
    ggplot2::geom_text(
      # Pass the stat object itself so ggalluvial does not have to be attached
      # with library() for ggplot2 to find it by name.
      stat = ggalluvial::StatStratum,
      ggplot2::aes(label = ggplot2::after_stat(stratum)),
      colour = label.colors,
      size = 8,
      lineheight = 1
    ) +
    ggplot2::scale_x_continuous(
      breaks = 1:2,
      labels = group_labels
    ) +
    # First colour is reserved for missing values, the rest for the levels.
    ggplot2::scale_fill_manual(values = colors[-1], na.value = colors[1]) +
    ggplot2::scale_color_manual(values = main.colors) +
    ggplot2::theme_void() +
    ggplot2::theme(
      legend.position = "none",
      axis.text.x = ggplot2::element_text(size = 20),
      plot.background = ggplot2::element_rect(fill = "white"),
      panel.border = ggplot2::element_blank()
    )
}