Major update. New functions and improvements. See NEWS.md.

This commit is contained in:
AG Damsbo 2023-03-07 15:38:28 +01:00
commit 9f68e27f5a
20 changed files with 443 additions and 97 deletions

View file

@ -1,6 +1,8 @@
#' Download REDCap data
#'
#' Wrapper function for using REDCapR::redcap_read and REDCapRITS::REDCap_split
#' Implementation of REDCap_split with a focused data acquisition approach using
#' REDCapR::redcap_read nad only downloading specified fields, forms and/or events
#' using the built-in focused_metadata
#' including some clean-up. Works with longitudinal projects with repeating
#' instruments.
#' @param uri REDCap database uri
@ -10,6 +12,7 @@
#' @param events events to download
#' @param forms forms to download
#' @param raw_or_label raw or label tags
#' @param split_forms Whether to split "repeating" or "all" forms, default is all.
#' @param generics vector of auto-generated generic variable names to
#' ignore when discarding empty rows
#'
@ -27,6 +30,7 @@ read_redcap_tables <- function(uri,
events = NULL,
forms = NULL,
raw_or_label = "label",
split_forms = "all",
generics = c(
"record_id",
"redcap_event_name",
@ -57,6 +61,7 @@ read_redcap_tables <- function(uri,
}
}
# Getting dataset
d <- REDCapR::redcap_read(
redcap_uri = uri,
token = token,
@ -65,23 +70,33 @@ read_redcap_tables <- function(uri,
forms = forms,
records = records,
raw_or_label = raw_or_label
)
)[["data"]]
# Process repeat instrument naming
# Removes any extra characters other than a-z, 0-9 and "_", to mimic raw instrument names.
if ("redcap_repeat_instrument" %in% names(d)) {
d$redcap_repeat_instrument <-
gsub("[^a-z0-9_]", "", gsub(" ", "_", tolower(d$redcap_repeat_instrument)))
}
# Getting metadata
m <-
REDCapR::redcap_metadata_read (redcap_uri = uri, token = token)
REDCapR::redcap_metadata_read (redcap_uri = uri, token = token)[["data"]]
l <- REDCap_split(d$data,
focused_metadata(m$data,names(d$data)),
forms = "all")
# Processing metadata to reflect dataset
if (!is.null(c(fields,forms,events))){
m <- focused_metadata(m,names(d))
}
lapply(l, function(i) {
if (ncol(i) > 2) {
s <- data.frame(i[, !colnames(i) %in% generics])
i[!apply(is.na(s), MARGIN = 1, FUN = all), ]
} else {
i
}
})
# Splitting
l <- REDCap_split(d,
m,
forms = split_forms,
primary_table_name = "nonrepeating")
# Sanitizing split list by removing completely empty rows apart from colnames
# in "generics"
sanitize_split(l,generics)
}

View file

@ -1,13 +1,17 @@
utils::globalVariables(c("redcap_wider",
"event.glue",
"inst.glue"))
#' @title Redcap Wider
#' @description Converts a list of REDCap data frames from long to wide format.
#' Handles longitudinal projects, but not yet repeated instruments.
#' @param list A list of data frames.
#' @param names.glud A string to glue the column names together.
#' @param event.glue A dplyr::glue string for repeated events naming
#' @param inst.glue A dplyr::glue string for repeated instruments naming
#' @return The list of data frames in wide format.
#' @export
#' @importFrom tidyr pivot_wider
#' @importFrom tidyselect all_of
#'
#' @examples
#' list <- list(data.frame(record_id = c(1,2,1,2),
@ -17,26 +21,77 @@
#' redcap_event_name = c("baseline", "baseline"),
#' gender = c("male", "female")))
#' redcap_wider(list)
redcap_wider <- function(list,names.glud="{.value}_{redcap_event_name}_long") {
l <- lapply(list,function(i){
incl <- any(duplicated(i[["record_id"]]))
redcap_wider <-
function(list,
event.glue = "{.value}_{redcap_event_name}",
inst.glue = "{.value}_{redcap_repeat_instance}") {
all_names <- unique(do.call(c, lapply(list, names)))
cname <- colnames(i)
vals <- cname[!cname%in%c("record_id","redcap_event_name")]
if (!any(c("redcap_event_name", "redcap_repeat_instrument") %in% all_names)) {
stop(
"The dataset does not include a 'redcap_event_name' variable.
redcap_wider only handles projects with repeating instruments or
longitudinal projects"
)
}
i$redcap_event_name <- tolower(gsub(" ","_",i$redcap_event_name))
# if (any(grepl("_timestamp",all_names))){
# stop("The dataset includes a '_timestamp' variable, which is not supported
# by this function yet. Sorry! Feel free to contribute :)")
# }
if (incl){
s <- tidyr::pivot_wider(i,
names_from = redcap_event_name,
values_from = all_of(vals),
names_glue = names.glud)
s[colnames(s)!="redcap_event_name"]
} else (i[colnames(i)!="redcap_event_name"])
id.name <- all_names[1]
})
l <- lapply(list, function(i) {
rep_inst <- "redcap_repeat_instrument" %in% names(i)
## Additional conditioning is needed to handle repeated instruments.
if (rep_inst) {
k <- lapply(split(i, f = i[[id.name]]), function(j) {
cname <- colnames(j)
vals <-
cname[!cname %in% c(
id.name,
"redcap_event_name",
"redcap_repeat_instrument",
"redcap_repeat_instance"
)]
s <- tidyr::pivot_wider(
j,
names_from = "redcap_repeat_instance",
values_from = all_of(vals),
names_glue = inst.glue
)
s[!colnames(s) %in% c("redcap_repeat_instrument")]
})
i <- Reduce(dplyr::bind_rows, k)
}
data.frame(Reduce(f = dplyr::full_join, x = l))
event <- "redcap_event_name" %in% names(i)
if (event) {
event.n <- length(unique(i[["redcap_event_name"]])) > 1
i[["redcap_event_name"]] <-
gsub(" ", "_", tolower(i[["redcap_event_name"]]))
if (event.n) {
cname <- colnames(i)
vals <- cname[!cname %in% c(id.name, "redcap_event_name")]
s <- tidyr::pivot_wider(
i,
names_from = "redcap_event_name",
values_from = all_of(vals),
names_glue = event.glue
)
s[colnames(s) != "redcap_event_name"]
} else
(i[colnames(i) != "redcap_event_name"])
} else
(i)
})
## Additional conditioning is needed to handle repeated instruments.
data.frame(Reduce(f = dplyr::full_join, x = l))
}

171
R/utils.r
View file

@ -1,48 +1,60 @@
#' focused_metadata
#' @description Extracts limited metadata for variables in a dataset
#' @param metadata A dataframe containing metadata
#' @param vars_in_data Vector of variable names in the dataset
#' @return A dataframe containing metadata for the variables in the dataset
#' @export
#' @examples
#'
focused_metadata <- function(metadata, vars_in_data) {
# metadata <- m$data
# vars_in_data <- names(d$data)
if (any(c("tbl_df", "tbl") %in% class(metadata))) {
metadata <- data.frame(metadata)
}
field_name <- grepl(".*[Ff]ield[._][Nn]ame$", names(metadata))
field_type <- grepl(".*[Ff]ield[._][Tt]ype$", names(metadata))
fields <-
metadata[!metadata$field_type %in% c("descriptive", "checkbox") &
metadata$field_name %in% vars_in_data,
"field_name"]
metadata[!metadata[, field_type] %in% c("descriptive", "checkbox") &
metadata[, field_name] %in% vars_in_data,
field_name]
# Process checkbox fields
if (any(metadata$field_type == "checkbox")) {
if (any(metadata[, field_type] == "checkbox")) {
# Getting base field names from checkbox fields
vars_check <- gsub(pattern = "___(\\d+)",replacement = "", vars_in_data)
vars_check <-
sub(pattern = "___.*$", replacement = "", vars_in_data)
# Processing
checkbox_basenames <-
metadata[metadata$field_type == "checkbox" &
metadata$field_name %in% vars_check,
"field_name"]
metadata[metadata[, field_type] == "checkbox" &
metadata[, field_name] %in% vars_check,
field_name]
fields <- rbind(fields, checkbox_basenames)
fields <- c(fields, checkbox_basenames)
}
# Process instrument status fields
form_names <- unique(metadata$form_name[metadata$field_name %in% fields$field_name])
form_names <-
unique(metadata[, grepl(".*[Ff]orm[._][Nn]ame$",
names(metadata))][metadata[, field_name]
%in% fields])
form_complete_fields <- data.frame(
field_name = paste0(form_names, "_complete"),
stringsAsFactors = FALSE
)
form_complete_fields <- paste0(form_names, "_complete")
fields <- rbind(fields, form_complete_fields)
fields <- c(fields, form_complete_fields)
# Process survey timestamps
timestamps <-
intersect(vars_in_data, paste0(form_names, "_timestamp"))
if (length(timestamps)) {
timestamp_fields <- data.frame(
field_name = timestamps,
stringsAsFactors = FALSE
)
timestamp_fields <- timestamps
fields <- rbind(fields, timestamp_fields)
fields <- c(fields, timestamp_fields)
}
@ -64,20 +76,73 @@ focused_metadata <- function(metadata, vars_in_data) {
},
y = vars_in_data))
fields <- rbind(fields, factor_fields)
fields <- c(fields, factor_fields[, 1])
}
metadata[metadata$field_name %in% fields$field_name,]
metadata[metadata[, field_name] %in% fields, ]
}
# function to convert the list of dataframes
#' Sanitize list of data frames
#'
#' Removing empty rows
#' @param l A list of data frames.
#' @param generic.names A vector of generic names to be excluded.
#'
#' @return A list of data frames with generic names excluded.
#'
#' @export
#'
#' @examples
#'
sanitize_split <- function(l,
generic.names = c(
"record_id",
"redcap_event_name",
"redcap_repeat_instrument",
"redcap_repeat_instance"
)) {
lapply(l, function(i) {
if (ncol(i) > 2) {
s <- data.frame(i[, !colnames(i) %in% generic.names])
i[!apply(is.na(s), MARGIN = 1, FUN = all),]
} else {
i
}
})
}
#' Match fields to forms
#'
#' @param metadata A data frame containing field names and form names
#' @param vars_in_data A character vector of variable names
#'
#' @return A data frame containing field names and form names
#'
#' @export
#'
#' @examples
#'
#'
match_fields_to_form <- function(metadata, vars_in_data) {
fields <- metadata[!metadata$field_type %in% c("descriptive", "checkbox"),
c("field_name", "form_name")]
field_form_name <- grepl(".*([Ff]ield|[Ff]orm)[._][Nn]ame$",names(metadata))
field_type <- grepl(".*[Ff]ield[._][Tt]ype$",names(metadata))
fields <- metadata[!metadata[,field_type] %in% c("descriptive", "checkbox"),
field_form_name]
names(fields) <- c("field_name", "form_name")
# Process instrument status fields
form_names <- unique(metadata$form_name)
form_names <- unique(metadata[,grepl(".*[Ff]orm[._][Nn]ame$",names(metadata))])
form_complete_fields <- data.frame(
field_name = paste0(form_names, "_complete"),
form_name = form_names,
@ -101,9 +166,9 @@ match_fields_to_form <- function(metadata, vars_in_data) {
}
# Process checkbox fields
if (any(metadata$field_type == "checkbox")) {
checkbox_basenames <- metadata[metadata$field_type == "checkbox",
c("field_name", "form_name")]
if (any(metadata[,field_type] == "checkbox")) {
checkbox_basenames <- metadata[metadata[,field_type] == "checkbox",
field_form_name]
checkbox_fields <-
do.call("rbind",
@ -111,7 +176,9 @@ match_fields_to_form <- function(metadata, vars_in_data) {
1,
function(x, y)
data.frame(
field_name = y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"), y, perl = TRUE)],
field_name =
y[grepl(paste0("^", x[1], "___((?!\\.factor).)+$"),
y, perl = TRUE)],
form_name = x[2],
stringsAsFactors = FALSE,
row.names = NULL
@ -148,14 +215,50 @@ match_fields_to_form <- function(metadata, vars_in_data) {
}
#' Split a data frame into separate tables for each form
#'
#' @param table A data frame
#' @param universal_fields A character vector of fields that should be included
#' in every table
#' @param fields A two-column matrix containing the names of fields that should
#' be included in each form
#'
#' @return A list of data frames, one for each non-repeating form
#'
#' @export
#'
#' @examples
#' # Create a table
#' table <- data.frame(
#' id = c(1, 2, 3, 4, 5),
#' form_a_name = c("John", "Alice", "Bob", "Eve", "Mallory"),
#' form_a_age = c(25, 30, 25, 15, 20),
#' form_b_name = c("John", "Alice", "Bob", "Eve", "Mallory"),
#' form_b_gender = c("M", "F", "M", "F", "F")
#' )
#'
#' # Create the universal fields
#' universal_fields <- c("id")
#'
#' # Create the fields
#' fields <- matrix(
#' c("form_a_name", "form_a",
#' "form_a_age", "form_a",
#' "form_b_name", "form_b",
#' "form_b_gender", "form_b"),
#' ncol = 2, byrow = TRUE
#' )
#'
#' # Split the table
#' split_non_repeating_forms(table, universal_fields, fields)
split_non_repeating_forms <-
function(table, universal_fields, fields) {
forms <- unique(fields[[2]])
x <- lapply(forms,
function (x) {
table[names(table) %in% union(universal_fields, fields[fields[, 2] == x, 1])]
table[names(table) %in% union(universal_fields,
fields[fields[, 2] == x, 1])]
})
structure(x, names = forms)