FreesearchR/R/wide2long.R

#' Alternative pivoting method for easily pivoting based on name pattern
#'
#' @description
#' This function requires and assumes a systematic naming of variables.
#' For now only supports one level pivoting. Adding more levels would require
#' an added "ignore" string pattern or similarly. Example 2.
#'
#'
#' @param data data
#' @param pattern pattern(s) to match. Character vector of length 1 or more.
#' @param type type of match. can be one of "prefix","infix" or "suffix".
#' @param id.col ID column. Will fill ID for all. Column name or numeric index.
#' Default is "1", first column.
#' @param instance.name
#'
#' @returns data.frame
#' @export
#'
#' @examples
#' data.frame(
#'   1:20, sample(70:80, 20, TRUE),
#'   sample(70:100, 20, TRUE),
#'   sample(70:100, 20, TRUE),
#'   sample(170:200, 20, TRUE)
#' ) |>
#'   setNames(c("id", "age", "weight_0", "weight_1", "height_1")) |>
#'   wide2long(pattern = c("_0", "_1"), type = "suffix")
#' data.frame(
#'   1:20, sample(70:80, 20, TRUE),
#'   sample(70:100, 20, TRUE),
#'   sample(70:100, 20, TRUE),
#'   sample(170:200, 20, TRUE)
#' ) |>
#'   setNames(c("id", "age", "weight_0", "weight_a_1", "height_b_1")) |>
#'   wide2long(pattern = c("_0", "_1"), type = "suffix")
#' # Optional filling of missing values by last observation carried forward
#' # Needed for mmrm analyses
#' long_missings |>
#'   # Fills record ID assuming none are missing
#'   tidyr::fill(record_id) |>
#'   # Grouping by ID for the last step
#'   dplyr::group_by(record_id) |>
#'   # Filling missing data by ID
#'   tidyr::fill(names(long_missings)[!names(long_missings) %in% new_names]) |>
#'   # Remove grouping
#'   dplyr::ungroup()
wide2long <- function(
    data,
    pattern,
    type = c("prefix", "infix", "suffix"),
    id.col = 1,
    instance.name = "instance") {
  type <- match.arg(type)

  ## Give the unique suffix names to use for identifying repeated measures
  # suffixes <- c("_0", "_1")

  ## If no ID column is present, one is added
  if (id.col == "none" | is.null(id.col)) {
    data <- stats::setNames(
      data.frame(seq_len(nrow(data)), data),
      make.names(c("id", names(data)), unique = TRUE)
    )
    id.col <- 1
  }
# browser()
  ## Relevant columns are determined based on suffixes
  cols <- names(data)[grepl_fix(names(data), pattern = pattern, type = type)]

  ## New colnames are created by removing suffixes
  new_names <- unique(gsub(paste(pattern, collapse = "|"), "", cols))

  out <- split(data, seq_len(nrow(data))) |> # Splits dataset by row
    # Starts data modifications for each subject
    lapply(\(.x){
      ## Pivots data with repeated measures as determined by the defined suffixes
      long_ls <- split.default(
        # Subset only repeated data
        .x[cols],
        # ... and split by meassure
        gsub(paste(new_names, collapse = "|"), "", cols)
      ) |>
        # Sort data by order of given suffixes to ensure chronology
        sort_by(pattern) |>
        # New colnames are applied
        lapply(\(.y){
          setNames(
            .y,
            gsub(paste(pattern, collapse = "|"), "", names(.y))
          )
        })

      # Subsets non-pivotted data (this is assumed to belong to same )
      single <- .x[-match(cols, names(.x))]

      # Extends with empty rows to get same dimensions as long data
      single[(nrow(single) + 1):length(long_ls), ] <- NA

      # Fills ID col
      single[id.col] <- single[1, id.col]

      # Everything is merged together
      merged <- dplyr::bind_cols(
        single,
        # Instance names are defined as suffixes without leading non-characters
        REDCapCAST::as_factor(data.frame(gsub(
          "^[^[:alnum:]]+", "",
          names(long_ls)
        ))),
        dplyr::bind_rows(long_ls)
      )

      # Ensure unique new names based on supplied
      colnames(merged) <- make.names(
        c(
          names(single),
          instance.name,
          names(merged)[(NCOL(single) + 2):NCOL(merged)]
        ),
        unique = TRUE
      )

      merged
    }) |> dplyr::bind_rows()

  rownames(out) <- NULL

  out
}


#' Matches pattern to vector based on match type
#'
#' @param data vector
#' @param pattern pattern(s) to match. Character vector of length 1 or more.
#' @param type type of match. can be one of "prefix","infix" or "suffix".
#'
#' @returns logical vector
#' @export
#'
#' @examples
#' c("id", "age", "weight_0", "weight_1") |> grepl_fix(pattern = c("_0", "_1"), type = "suffix")
grepl_fix <- function(data, pattern, type = c("prefix", "infix", "suffix")) {
  type <- match.arg(type)

  if (type == "prefix") {
    grepl(paste0("^(", paste(pattern, collapse = "|"), ")*"), data)
  } else if (type == "suffix") {
    grepl(paste0("*(", paste(pattern, collapse = "|"), ")$"), data)
  } else if (type == "infix") {
    grepl(paste0("*(", paste(pattern, collapse = "|"), ")*"), data)
  }
}
updated data import 2025-03-11 13:42:57 +01:00			`#' Alternative pivoting method for easily pivoting based on name pattern`
			`#'`
			`#' @description`
			`#' This function requires and assumes a systematic naming of variables.`
			`#' For now only supports one level pivoting. Adding more levels would require`
			`#' an added "ignore" string pattern or similarly. Example 2.`
			`#'`
			`#'`
			`#' @param data data`
			`#' @param pattern pattern(s) to match. Character vector of length 1 or more.`
			`#' @param type type of match. can be one of "prefix","infix" or "suffix".`
			`#' @param id.col ID column. Will fill ID for all. Column name or numeric index.`
			`#' Default is "1", first column.`
			`#' @param instance.name`
			`#'`
			`#' @returns data.frame`
			`#' @export`
			`#'`
			`#' @examples`
			`#' data.frame(`
			`#' 1:20, sample(70:80, 20, TRUE),`
			`#' sample(70:100, 20, TRUE),`
			`#' sample(70:100, 20, TRUE),`
			`#' sample(170:200, 20, TRUE)`
			`#' ) \|>`
			`#' setNames(c("id", "age", "weight_0", "weight_1", "height_1")) \|>`
			`#' wide2long(pattern = c("_0", "_1"), type = "suffix")`
			`#' data.frame(`
			`#' 1:20, sample(70:80, 20, TRUE),`
			`#' sample(70:100, 20, TRUE),`
			`#' sample(70:100, 20, TRUE),`
			`#' sample(170:200, 20, TRUE)`
			`#' ) \|>`
			`#' setNames(c("id", "age", "weight_0", "weight_a_1", "height_b_1")) \|>`
			`#' wide2long(pattern = c("_0", "_1"), type = "suffix")`
			`#' # Optional filling of missing values by last observation carried forward`
			`#' # Needed for mmrm analyses`
			`#' long_missings \|>`
			`#' # Fills record ID assuming none are missing`
			`#' tidyr::fill(record_id) \|>`
			`#' # Grouping by ID for the last step`
			`#' dplyr::group_by(record_id) \|>`
			`#' # Filling missing data by ID`
			`#' tidyr::fill(names(long_missings)[!names(long_missings) %in% new_names]) \|>`
			`#' # Remove grouping`
			`#' dplyr::ungroup()`
			`wide2long <- function(`
			`data,`
			`pattern,`
			`type = c("prefix", "infix", "suffix"),`
			`id.col = 1,`
			`instance.name = "instance") {`
			`type <- match.arg(type)`

			`## Give the unique suffix names to use for identifying repeated measures`
			`# suffixes <- c("_0", "_1")`

			`## If no ID column is present, one is added`
			`if (id.col == "none" \| is.null(id.col)) {`
			`data <- stats::setNames(`
			`data.frame(seq_len(nrow(data)), data),`
			`make.names(c("id", names(data)), unique = TRUE)`
			`)`
			`id.col <- 1`
			`}`
			`# browser()`
			`## Relevant columns are determined based on suffixes`
			`cols <- names(data)[grepl_fix(names(data), pattern = pattern, type = type)]`

			`## New colnames are created by removing suffixes`
			`new_names <- unique(gsub(paste(pattern, collapse = "\|"), "", cols))`

			`out <- split(data, seq_len(nrow(data))) \|> # Splits dataset by row`
			`# Starts data modifications for each subject`
			`lapply(\(.x){`
			`## Pivots data with repeated measures as determined by the defined suffixes`
			`long_ls <- split.default(`
			`# Subset only repeated data`
			`.x[cols],`
			`# ... and split by meassure`
			`gsub(paste(new_names, collapse = "\|"), "", cols)`
			`) \|>`
			`# Sort data by order of given suffixes to ensure chronology`
			`sort_by(pattern) \|>`
			`# New colnames are applied`
			`lapply(\(.y){`
			`setNames(`
			`.y,`
			`gsub(paste(pattern, collapse = "\|"), "", names(.y))`
			`)`
			`})`

			`# Subsets non-pivotted data (this is assumed to belong to same )`
			`single <- .x[-match(cols, names(.x))]`

			`# Extends with empty rows to get same dimensions as long data`
			`single[(nrow(single) + 1):length(long_ls), ] <- NA`

			`# Fills ID col`
			`single[id.col] <- single[1, id.col]`

			`# Everything is merged together`
			`merged <- dplyr::bind_cols(`
			`single,`
			`# Instance names are defined as suffixes without leading non-characters`
			`REDCapCAST::as_factor(data.frame(gsub(`
			`"^[^[:alnum:]]+", "",`
			`names(long_ls)`
			`))),`
			`dplyr::bind_rows(long_ls)`
			`)`

			`# Ensure unique new names based on supplied`
			`colnames(merged) <- make.names(`
			`c(`
			`names(single),`
			`instance.name,`
			`names(merged)[(NCOL(single) + 2):NCOL(merged)]`
			`),`
			`unique = TRUE`
			`)`

			`merged`
			`}) \|> dplyr::bind_rows()`

			`rownames(out) <- NULL`

			`out`
			`}`


			`#' Matches pattern to vector based on match type`
			`#'`
			`#' @param data vector`
			`#' @param pattern pattern(s) to match. Character vector of length 1 or more.`
			`#' @param type type of match. can be one of "prefix","infix" or "suffix".`
			`#'`
			`#' @returns logical vector`
			`#' @export`
			`#'`
			`#' @examples`
			`#' c("id", "age", "weight_0", "weight_1") \|> grepl_fix(pattern = c("_0", "_1"), type = "suffix")`
			`grepl_fix <- function(data, pattern, type = c("prefix", "infix", "suffix")) {`
			`type <- match.arg(type)`

			`if (type == "prefix") {`
			`grepl(paste0("^(", paste(pattern, collapse = "\|"), ")*"), data)`
			`} else if (type == "suffix") {`
			`grepl(paste0("*(", paste(pattern, collapse = "\|"), ")$"), data)`
			`} else if (type == "infix") {`
			`grepl(paste0("(", paste(pattern, collapse = "\|"), ")"), data)`
			`}`
			`}`