Major update. New functions and improvements. See NEWS.md.

2026-06-19 05:07:30 +02:00 · 2023-03-07 15:38:28 +01:00 · 2023-03-07 15:38:28 +01:00 · 9f68e27f5a
commit 9f68e27f5a
parent b57e130395
20 changed files with 443 additions and 97 deletions
--- a/R/read_redcap_tables.R
+++ b/R/read_redcap_tables.R
@ -1,6 +1,8 @@
 #' Download REDCap data
 #'
-#' Wrapper function for using REDCapR::redcap_read and REDCapRITS::REDCap_split
+#' Implementation of REDCap_split with a focused data acquisition approach using
+#' REDCapR::redcap_read nad only downloading specified fields, forms and/or events
+#' using the built-in focused_metadata
 #' including some clean-up. Works with longitudinal projects with repeating
 #' instruments.
 #' @param uri REDCap database uri
@ -10,6 +12,7 @@
 #' @param events events to download
 #' @param forms forms to download
 #' @param raw_or_label raw or label tags
+#' @param split_forms Whether to split "repeating" or "all" forms, default is all.
 #' @param generics vector of auto-generated generic variable names to
 #' ignore when discarding empty rows
 #'
@ -27,6 +30,7 @@ read_redcap_tables <- function(uri,
                               events = NULL,
                               forms = NULL,
                               raw_or_label = "label",
+                               split_forms = "all",
                               generics = c(
                                 "record_id",
                                 "redcap_event_name",
@ -57,6 +61,7 @@ read_redcap_tables <- function(uri,
    }
  }

+  # Getting dataset
  d <- REDCapR::redcap_read(
    redcap_uri = uri,
    token = token,
@ -65,23 +70,33 @@ read_redcap_tables <- function(uri,
    forms = forms,
    records = records,
    raw_or_label = raw_or_label
-  )
+  )[["data"]]

+  # Process repeat instrument naming
+  # Removes any extra characters other than a-z, 0-9 and "_", to mimic raw instrument names.
+  if ("redcap_repeat_instrument" %in% names(d)) {
+    d$redcap_repeat_instrument <-
+    gsub("[^a-z0-9_]", "", gsub(" ", "_", tolower(d$redcap_repeat_instrument)))
+  }
+
+  # Getting metadata
  m <-
-    REDCapR::redcap_metadata_read (redcap_uri = uri, token = token)
+    REDCapR::redcap_metadata_read (redcap_uri = uri, token = token)[["data"]]

-  l <- REDCap_split(d$data,
-                    focused_metadata(m$data,names(d$data)),
-                    forms = "all")
+  # Processing metadata to reflect dataset
+  if (!is.null(c(fields,forms,events))){
+    m <- focused_metadata(m,names(d))
+  }

-  lapply(l, function(i) {
-    if (ncol(i) > 2) {
-      s <- data.frame(i[, !colnames(i) %in% generics])
-      i[!apply(is.na(s), MARGIN = 1, FUN = all), ]
-    } else {
-      i
-    }
-  })
+  # Splitting
+  l <- REDCap_split(d,
+                    m,
+                    forms = split_forms,
+                    primary_table_name = "nonrepeating")
+
+  # Sanitizing split list by removing completely empty rows apart from colnames
+  # in "generics"
+  sanitize_split(l,generics)

 }

--- a/R/redcap_wider.R
+++ b/R/redcap_wider.R
@ -1,13 +1,17 @@
-
+utils::globalVariables(c("redcap_wider",
+"event.glue",
+"inst.glue"))

 #' @title Redcap Wider
 #' @description Converts a list of REDCap data frames from long to wide format.
 #' Handles longitudinal projects, but not yet repeated instruments.
 #' @param list A list of data frames.
-#' @param names.glud A string to glue the column names together.
+#' @param event.glue A dplyr::glue string for repeated events naming
+#' @param inst.glue A dplyr::glue string for repeated instruments naming
 #' @return The list of data frames in wide format.
 #' @export
 #' @importFrom tidyr pivot_wider
+#' @importFrom tidyselect all_of
 #'
 #' @examples
 #' list <- list(data.frame(record_id = c(1,2,1,2),
@ -17,26 +21,77 @@
 #' redcap_event_name = c("baseline", "baseline"),
 #' gender = c("male", "female")))
 #' redcap_wider(list)
-redcap_wider <- function(list,names.glud="{.value}_{redcap_event_name}_long") {
-  l <- lapply(list,function(i){
-    incl <- any(duplicated(i[["record_id"]]))
+redcap_wider <-
+  function(list,
+           event.glue = "{.value}_{redcap_event_name}",
+           inst.glue = "{.value}_{redcap_repeat_instance}") {
+    all_names <- unique(do.call(c, lapply(list, names)))

-    cname <- colnames(i)
-    vals <- cname[!cname%in%c("record_id","redcap_event_name")]
+    if (!any(c("redcap_event_name", "redcap_repeat_instrument") %in% all_names)) {
+      stop(
+        "The dataset does not include a 'redcap_event_name' variable.
+         redcap_wider only handles projects with repeating instruments or
+         longitudinal projects"
+      )
+    }

-    i$redcap_event_name <- tolower(gsub(" ","_",i$redcap_event_name))
+    # if (any(grepl("_timestamp",all_names))){
+    #   stop("The dataset includes a '_timestamp' variable, which is not supported
+    #        by this function yet. Sorry! Feel free to contribute :)")
+    # }

-    if (incl){
-      s <- tidyr::pivot_wider(i,
-                              names_from = redcap_event_name,
-                              values_from = all_of(vals),
-                              names_glue = names.glud)
-      s[colnames(s)!="redcap_event_name"]
-    } else (i[colnames(i)!="redcap_event_name"])
+    id.name <- all_names[1]

-  })
+    l <- lapply(list, function(i) {
+      rep_inst <- "redcap_repeat_instrument" %in% names(i)

-  ## Additional conditioning is needed to handle repeated instruments.
+      if (rep_inst) {
+        k <- lapply(split(i, f = i[[id.name]]), function(j) {
+          cname <- colnames(j)
+          vals <-
+            cname[!cname %in% c(
+              id.name,
+              "redcap_event_name",
+              "redcap_repeat_instrument",
+              "redcap_repeat_instance"
+            )]
+          s <- tidyr::pivot_wider(
+            j,
+            names_from = "redcap_repeat_instance",
+            values_from = all_of(vals),
+            names_glue = inst.glue
+          )
+          s[!colnames(s) %in% c("redcap_repeat_instrument")]
+        })
+        i <- Reduce(dplyr::bind_rows, k)
+      }

-  data.frame(Reduce(f = dplyr::full_join, x = l))
+      event <- "redcap_event_name" %in% names(i)
+
+      if (event) {
+        event.n <- length(unique(i[["redcap_event_name"]])) > 1
+
+        i[["redcap_event_name"]] <-
+          gsub(" ", "_", tolower(i[["redcap_event_name"]]))
+
+        if (event.n) {
+          cname <- colnames(i)
+          vals <- cname[!cname %in% c(id.name, "redcap_event_name")]
+
+          s <- tidyr::pivot_wider(
+            i,
+            names_from = "redcap_event_name",
+            values_from = all_of(vals),
+            names_glue = event.glue
+          )
+          s[colnames(s) != "redcap_event_name"]
+        } else
+          (i[colnames(i) != "redcap_event_name"])
+      } else
+        (i)
+    })
+
+    ## Additional conditioning is needed to handle repeated instruments.
+
+    data.frame(Reduce(f = dplyr::full_join, x = l))
  }
--- a/R/utils.r
+++ b/R/utils.r
@ -1,48 +1,60 @@
+
+
+#' focused_metadata
+#' @description Extracts limited metadata for variables in a dataset
+#' @param metadata A dataframe containing metadata
+#' @param vars_in_data Vector of variable names in the dataset
+#' @return A dataframe containing metadata for the variables in the dataset
+#' @export
+#' @examples
+#'
 focused_metadata <- function(metadata, vars_in_data) {
-  # metadata <- m$data
-  # vars_in_data <- names(d$data)
+
+  if (any(c("tbl_df", "tbl") %in% class(metadata))) {
+    metadata <- data.frame(metadata)
+  }
+
+  field_name <- grepl(".*[Ff]ield[._][Nn]ame$", names(metadata))
+  field_type <- grepl(".*[Ff]ield[._][Tt]ype$", names(metadata))

  fields <-
-    metadata[!metadata$field_type %in% c("descriptive", "checkbox") &
-               metadata$field_name %in% vars_in_data,
-             "field_name"]
+    metadata[!metadata[, field_type] %in% c("descriptive", "checkbox") &
+               metadata[, field_name] %in% vars_in_data,
+             field_name]

  # Process checkbox fields
-  if (any(metadata$field_type == "checkbox")) {
-
+  if (any(metadata[, field_type] == "checkbox")) {
    # Getting base field names from checkbox fields
-    vars_check <- gsub(pattern = "___(\\d+)",replacement = "", vars_in_data)
+    vars_check <-
+      sub(pattern = "___.*$", replacement = "", vars_in_data)

    # Processing
    checkbox_basenames <-
-      metadata[metadata$field_type == "checkbox" &
-                 metadata$field_name %in% vars_check,
-               "field_name"]
+      metadata[metadata[, field_type] == "checkbox" &
+                 metadata[, field_name] %in% vars_check,
+               field_name]

-    fields <- rbind(fields, checkbox_basenames)
+    fields <- c(fields, checkbox_basenames)

  }

  # Process instrument status fields
-  form_names <- unique(metadata$form_name[metadata$field_name %in% fields$field_name])
+  form_names <-
+    unique(metadata[, grepl(".*[Ff]orm[._][Nn]ame$",
+                            names(metadata))][metadata[, field_name]
+                                              %in% fields])

-  form_complete_fields <- data.frame(
-    field_name = paste0(form_names, "_complete"),
-    stringsAsFactors = FALSE
-  )
+  form_complete_fields <- paste0(form_names, "_complete")

-  fields <- rbind(fields, form_complete_fields)
+  fields <- c(fields, form_complete_fields)

  # Process survey timestamps
  timestamps <-
    intersect(vars_in_data, paste0(form_names, "_timestamp"))
  if (length(timestamps)) {
-    timestamp_fields <- data.frame(
-      field_name = timestamps,
-      stringsAsFactors = FALSE
-    )
+    timestamp_fields <- timestamps

-    fields <- rbind(fields, timestamp_fields)
+    fields <- c(fields, timestamp_fields)

  }

@ -64,20 +76,73 @@ focused_metadata <- function(metadata, vars_in_data) {
                    },
                    y = vars_in_data))

-    fields <- rbind(fields, factor_fields)
+    fields <- c(fields, factor_fields[, 1])

  }

-  metadata[metadata$field_name %in% fields$field_name,]
+  metadata[metadata[, field_name] %in% fields, ]

 }

+
+
+# function to convert the list of dataframes
+
+
+#' Sanitize list of data frames
+#'
+#' Removing empty rows
+#' @param l A list of data frames.
+#' @param generic.names A vector of generic names to be excluded.
+#'
+#' @return A list of data frames with generic names excluded.
+#'
+#' @export
+#'
+#' @examples
+#'
+sanitize_split <- function(l,
+                           generic.names = c(
+                             "record_id",
+                             "redcap_event_name",
+                             "redcap_repeat_instrument",
+                             "redcap_repeat_instance"
+                           )) {
+  lapply(l, function(i) {
+    if (ncol(i) > 2) {
+      s <- data.frame(i[, !colnames(i) %in% generic.names])
+      i[!apply(is.na(s), MARGIN = 1, FUN = all),]
+    } else {
+      i
+    }
+  })
+}
+
+
+#' Match fields to forms
+#'
+#' @param metadata A data frame containing field names and form names
+#' @param vars_in_data A character vector of variable names
+#'
+#' @return A data frame containing field names and form names
+#'
+#' @export
+#'
+#' @examples
+#'
+#'
 match_fields_to_form <- function(metadata, vars_in_data) {
-  fields <- metadata[!metadata$field_type %in% c("descriptive", "checkbox"),
-                     c("field_name", "form_name")]
+
+  field_form_name <- grepl(".*([Ff]ield|[Ff]orm)[._][Nn]ame$",names(metadata))
+  field_type <- grepl(".*[Ff]ield[._][Tt]ype$",names(metadata))
+
+  fields <- metadata[!metadata[,field_type] %in% c("descriptive", "checkbox"),
+                     field_form_name]
+
+  names(fields) <- c("field_name", "form_name")

  # Process instrument status fields
-  form_names <- unique(metadata$form_name)
+  form_names <- unique(metadata[,grepl(".*[Ff]orm[._][Nn]ame$",names(metadata))])
  form_complete_fields <- data.frame(
    field_name = paste0(form_names, "_complete"),
    form_name = form_names,
@ -101,9 +166,9 @@ match_fields_to_form <- function(metadata, vars_in_data) {
  }

  # Process checkbox fields
-  if (any(metadata$field_type == "checkbox")) {
-    checkbox_basenames <- metadata[metadata$field_type == "checkbox",
-                                   c("field_name", "form_name")]
+  if (any(metadata[,field_type] == "checkbox")) {
+    checkbox_basenames <- metadata[metadata[,field_type] == "checkbox",
+                                   field_form_name]

    checkbox_fields <-
      do.call("rbind",
@ -111,7 +176,9 @@ match_fields_to_form <- function(metadata, vars_in_data) {
                    1,
                    function(x, y)
                      data.frame(
-                        field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)],
+                        field_name =
+                          y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"),
+                                  y, perl = TRUE)],
                        form_name = x[2],
                        stringsAsFactors = FALSE,
                        row.names = NULL
@ -148,14 +215,50 @@ match_fields_to_form <- function(metadata, vars_in_data) {

 }

-
+#' Split a data frame into separate tables for each form
+#'
+#' @param table A data frame
+#' @param universal_fields A character vector of fields that should be included
+#' in every table
+#' @param fields A two-column matrix containing the names of fields that should
+#' be included in each form
+#'
+#' @return A list of data frames, one for each non-repeating form
+#'
+#' @export
+#'
+#' @examples
+#' # Create a table
+#' table <- data.frame(
+#'   id = c(1, 2, 3, 4, 5),
+#'   form_a_name = c("John", "Alice", "Bob", "Eve", "Mallory"),
+#'   form_a_age = c(25, 30, 25, 15, 20),
+#'   form_b_name = c("John", "Alice", "Bob", "Eve", "Mallory"),
+#'   form_b_gender = c("M", "F", "M", "F", "F")
+#' )
+#'
+#' # Create the universal fields
+#' universal_fields <- c("id")
+#'
+#' # Create the fields
+#' fields <- matrix(
+#'   c("form_a_name", "form_a",
+#'     "form_a_age", "form_a",
+#'     "form_b_name", "form_b",
+#'     "form_b_gender", "form_b"),
+#'   ncol = 2, byrow = TRUE
+#' )
+#'
+#' # Split the table
+#' split_non_repeating_forms(table, universal_fields, fields)
 split_non_repeating_forms <-
  function(table, universal_fields, fields) {
    forms <- unique(fields[[2]])

    x <- lapply(forms,
                function (x) {
-                  table[names(table) %in% union(universal_fields, fields[fields[, 2] == x, 1])]
+                  table[names(table) %in% union(universal_fields,
+                                                fields[fields[, 2] == x, 1])]
                })

    structure(x, names = forms)