updated data import

2026-06-19 12:37:30 +02:00 · 2025-03-11 13:42:57 +01:00 · 2025-03-11 13:42:57 +01:00 · 912fff7474
commit 912fff7474
parent 6c44be558d
32 changed files with 2340 additions and 273 deletions
--- a/R/wide2long.R
+++ b/R/wide2long.R
@ -0,0 +1,153 @@
+#' Alternative pivoting method for easily pivoting based on name pattern
+#'
+#' @description
+#' This function requires and assumes a systematic naming of variables.
+#' For now only supports one level pivoting. Adding more levels would require
+#' an added "ignore" string pattern or similarly. Example 2.
+#'
+#'
+#' @param data data
+#' @param pattern pattern(s) to match. Character vector of length 1 or more.
+#' @param type type of match. can be one of "prefix","infix" or "suffix".
+#' @param id.col ID column. Will fill ID for all. Column name or numeric index.
+#' Default is "1", first column.
+#' @param instance.name
+#'
+#' @returns data.frame
+#' @export
+#'
+#' @examples
+#' data.frame(
+#'   1:20, sample(70:80, 20, TRUE),
+#'   sample(70:100, 20, TRUE),
+#'   sample(70:100, 20, TRUE),
+#'   sample(170:200, 20, TRUE)
+#' ) |>
+#'   setNames(c("id", "age", "weight_0", "weight_1", "height_1")) |>
+#'   wide2long(pattern = c("_0", "_1"), type = "suffix")
+#' data.frame(
+#'   1:20, sample(70:80, 20, TRUE),
+#'   sample(70:100, 20, TRUE),
+#'   sample(70:100, 20, TRUE),
+#'   sample(170:200, 20, TRUE)
+#' ) |>
+#'   setNames(c("id", "age", "weight_0", "weight_a_1", "height_b_1")) |>
+#'   wide2long(pattern = c("_0", "_1"), type = "suffix")
+#' # Optional filling of missing values by last observation carried forward
+#' # Needed for mmrm analyses
+#' long_missings |>
+#'   # Fills record ID assuming none are missing
+#'   tidyr::fill(record_id) |>
+#'   # Grouping by ID for the last step
+#'   dplyr::group_by(record_id) |>
+#'   # Filling missing data by ID
+#'   tidyr::fill(names(long_missings)[!names(long_missings) %in% new_names]) |>
+#'   # Remove grouping
+#'   dplyr::ungroup()
+wide2long <- function(
+    data,
+    pattern,
+    type = c("prefix", "infix", "suffix"),
+    id.col = 1,
+    instance.name = "instance") {
+  type <- match.arg(type)
+
+  ## Give the unique suffix names to use for identifying repeated measures
+  # suffixes <- c("_0", "_1")
+
+  ## If no ID column is present, one is added
+  if (id.col == "none" | is.null(id.col)) {
+    data <- stats::setNames(
+      data.frame(seq_len(nrow(data)), data),
+      make.names(c("id", names(data)), unique = TRUE)
+    )
+    id.col <- 1
+  }
+# browser()
+  ## Relevant columns are determined based on suffixes
+  cols <- names(data)[grepl_fix(names(data), pattern = pattern, type = type)]
+
+  ## New colnames are created by removing suffixes
+  new_names <- unique(gsub(paste(pattern, collapse = "|"), "", cols))
+
+  out <- split(data, seq_len(nrow(data))) |> # Splits dataset by row
+    # Starts data modifications for each subject
+    lapply(\(.x){
+      ## Pivots data with repeated measures as determined by the defined suffixes
+      long_ls <- split.default(
+        # Subset only repeated data
+        .x[cols],
+        # ... and split by meassure
+        gsub(paste(new_names, collapse = "|"), "", cols)
+      ) |>
+        # Sort data by order of given suffixes to ensure chronology
+        sort_by(pattern) |>
+        # New colnames are applied
+        lapply(\(.y){
+          setNames(
+            .y,
+            gsub(paste(pattern, collapse = "|"), "", names(.y))
+          )
+        })
+
+      # Subsets non-pivotted data (this is assumed to belong to same )
+      single <- .x[-match(cols, names(.x))]
+
+      # Extends with empty rows to get same dimensions as long data
+      single[(nrow(single) + 1):length(long_ls), ] <- NA
+
+      # Fills ID col
+      single[id.col] <- single[1, id.col]
+
+      # Everything is merged together
+      merged <- dplyr::bind_cols(
+        single,
+        # Instance names are defined as suffixes without leading non-characters
+        REDCapCAST::as_factor(data.frame(gsub(
+          "^[^[:alnum:]]+", "",
+          names(long_ls)
+        ))),
+        dplyr::bind_rows(long_ls)
+      )
+
+      # Ensure unique new names based on supplied
+      colnames(merged) <- make.names(
+        c(
+          names(single),
+          instance.name,
+          names(merged)[(NCOL(single) + 2):NCOL(merged)]
+        ),
+        unique = TRUE
+      )
+
+      merged
+    }) |> dplyr::bind_rows()
+
+  rownames(out) <- NULL
+
+  out
+}
+
+
+#' Matches pattern to vector based on match type
+#'
+#' @param data vector
+#' @param pattern pattern(s) to match. Character vector of length 1 or more.
+#' @param type type of match. can be one of "prefix","infix" or "suffix".
+#'
+#' @returns logical vector
+#' @export
+#'
+#' @examples
+#' c("id", "age", "weight_0", "weight_1") |> grepl_fix(pattern = c("_0", "_1"), type = "suffix")
+grepl_fix <- function(data, pattern, type = c("prefix", "infix", "suffix")) {
+  type <- match.arg(type)
+
+  if (type == "prefix") {
+    grepl(paste0("^(", paste(pattern, collapse = "|"), ")*"), data)
+  } else if (type == "suffix") {
+    grepl(paste0("*(", paste(pattern, collapse = "|"), ")$"), data)
+  } else if (type == "infix") {
+    grepl(paste0("*(", paste(pattern, collapse = "|"), ")*"), data)
+  }
+}