#' Create a regression model programatically #' #' @param data data set #' @param fun Name of function as character vector or function to use for model creation. #' @param vars character vector of variables to include #' @param outcome.str Name of outcome variable. Character vector. #' @param auto.mode Make assumptions on function dependent on outcome data format. Overwrites other arguments. #' @param formula.str Formula as string. Passed through 'glue::glue'. If given, 'outcome.str' and 'vars' are ignored. Optional. #' @param args.list List of arguments passed to 'fun' with 'do.call'. #' @param ... ignored for now #' #' @importFrom stats as.formula #' #' @return object of standard class for fun #' @export #' @rdname regression_model #' #' @examples #' gtsummary::trial |> #' regression_model(outcome.str = "age") #' gtsummary::trial |> #' regression_model( #' outcome.str = "age", #' auto.mode = FALSE, #' fun = "stats::lm", #' formula.str = "{outcome.str}~.", #' args.list = NULL #' ) #' gtsummary::trial |> #' default_parsing() |> #' regression_model( #' outcome.str = "trt", #' auto.mode = FALSE, #' fun = "stats::glm", #' args.list = list(family = binomial(link = "logit")) #' ) #' m <- mtcars |> #' default_parsing() |> #' regression_model( #' outcome.str = "mpg", #' auto.mode = FALSE, #' fun = "stats::lm", #' formula.str = "{outcome.str}~{paste(vars,collapse='+')}", #' args.list = NULL, #' vars = c("mpg", "cyl") #' ) #' broom::tidy(m) regression_model <- function(data, outcome.str = NULL, auto.mode = FALSE, formula.str = NULL, args.list = NULL, fun = NULL, vars = NULL, ...) { if (!is.null(formula.str)) { if (formula.str == "") { formula.str <- NULL } } ## This will handle if outcome is not in data for nicer shiny behavior if (isTRUE(!outcome.str %in% names(data))) { outcome.str <- names(data)[1] print("Outcome variable is not in data, first column is used") } if (!is.null(formula.str)) { formula.glue <- glue::glue(formula.str) outcome.str <- NULL } else { assertthat::assert_that(outcome.str %in% names(data), msg = "Outcome variable is not present in the provided dataset" ) formula.glue <- glue::glue("{outcome.str}~{paste(vars,collapse='+')}") } if (is.null(vars)) { vars <- names(data)[!names(data) %in% outcome.str] } else if (!is.null(outcome.str)) { if (outcome.str %in% vars) { vars <- vars[!vars %in% outcome.str] } data <- data |> dplyr::select(dplyr::all_of(c(vars, outcome.str))) } # Formatting character variables as factor # Improvement should add a missing vector to format as NA data <- data |> purrr::map(\(.x){ if (is.character(.x)) { suppressWarnings(REDCapCAST::as_factor(.x)) } else { .x } }) |> dplyr::bind_cols(.name_repair = "unique_quiet") if (is.null(fun)) auto.mode <- TRUE if (isTRUE(auto.mode)) { if (is.numeric(data[[outcome.str]])) { fun <- "stats::lm" } else if (is.factor(data[[outcome.str]])) { if (length(levels(data[[outcome.str]])) == 2) { fun <- "stats::glm" args.list <- list(family = stats::binomial(link = "logit")) } else if (length(levels(data[[outcome.str]])) > 2) { fun <- "MASS::polr" args.list <- list( Hess = TRUE, method = "logistic" ) } else { stop("The provided output variable only has one level") } } else { stop("Output variable should be either numeric or factor for auto.mode") } } assertthat::assert_that("character" %in% class(fun), msg = "Please provide the function as a character vector." ) out <- do.call( getfun(fun), c( list( data = data, formula = as.formula(formula.glue) ), args.list ) ) # out <- REDCapCAST::set_attr(out,label = fun,attr = "fun.call") # Recreating the call # out$call <- match.call(definition=eval(parse(text=fun)), call(fun, data = 'data',formula = as.formula(formula.str),args.list)) return(out) } #' Create a regression model programatically #' #' @param data data set #' @param fun Name of function as character vector or function to use for model creation. #' @param vars character vector of variables to include #' @param outcome.str Name of outcome variable. Character vector. #' @param args.list List of arguments passed to 'fun' with 'do.call'. #' @param ... ignored for now #' #' @importFrom stats as.formula #' @rdname regression_model #' #' @return object of standard class for fun #' @export #' #' @examples #' \dontrun{ #' gtsummary::trial |> #' regression_model_uv(outcome.str = "age") #' gtsummary::trial |> #' regression_model_uv( #' outcome.str = "age", #' fun = "stats::lm", #' args.list = NULL #' ) #' m <- gtsummary::trial |> regression_model_uv( #' outcome.str = "trt", #' fun = "stats::glm", #' args.list = list(family = stats::binomial(link = "logit")) #' ) #' lapply(m, broom::tidy) |> dplyr::bind_rows() #' } regression_model_uv <- function(data, outcome.str, args.list = NULL, fun = NULL, vars = NULL, ...) { ## This will handle if outcome is not in data for nicer shiny behavior if (!outcome.str %in% names(data)) { outcome.str <- names(data)[1] print("outcome is not in data, first column is used") } if (!is.null(vars)) { data <- data |> dplyr::select(dplyr::all_of( unique(c(outcome.str, vars)) )) } if (is.null(args.list)) { args.list <- list() } if (is.null(fun)) { if (is.numeric(data[[outcome.str]])) { fun <- "stats::lm" } else if (is.factor(data[[outcome.str]])) { if (length(levels(data[[outcome.str]])) == 2) { fun <- "stats::glm" args.list <- list(family = stats::binomial(link = "logit")) } else if (length(levels(data[[outcome.str]])) > 2) { fun <- "MASS::polr" args.list <- list( Hess = TRUE, method = "logistic" ) } else { stop("The provided output variable only has one level") } } else { stop("Output variable should be either numeric or factor for auto.mode") } } assertthat::assert_that("character" %in% class(fun), msg = "Please provide the function as a character vector." ) out <- names(data)[!names(data) %in% outcome.str] |> purrr::map(\(.var){ do.call( regression_model, c( list( data = data[match(c(outcome.str, .var), names(data))], outcome.str = outcome.str ), args.list ) ) }) return(out) } ### HELPERS #' Data type assessment. #' #' @description #' These are more overall than the native typeof. This is used to assess a more #' meaningful "clinical" data type. #' #' @param data vector or data.frame. if data frame, each column is evaluated. #' #' @returns outcome type #' @export #' #' @examples #' mtcars |> #' default_parsing() |> #' lapply(data_type) #' mtcars |> #' default_parsing() |> #' data_type() #' c(1, 2) |> data_type() #' 1 |> data_type() #' c(rep(NA, 10)) |> data_type() #' sample(1:100, 50) |> data_type() #' factor(letters[1:20]) |> data_type() #' as.Date(1:20) |> data_type() data_type <- function(data) { if (is.data.frame(data)) { sapply(data, data_type) } else { cl_d <- class(data) l_unique <- length(unique(na.omit(data))) if (all(is.na(data))) { out <- "empty" } else if (l_unique < 2) { out <- "monotone" } else if (any(c("factor", "logical") %in% cl_d) | l_unique == 2) { if (identical("logical", cl_d) | l_unique == 2) { out <- "dichotomous" } else { # if (is.ordered(data)) { # out <- "ordinal" # } else { out <- "categorical" # } } } else if (identical(cl_d, "character")) { out <- "text" } else if (any(c("hms", "Date", "POSIXct", "POSIXt") %in% cl_d)) { out <- "datetime" } else if (l_unique > 2) { ## Previously had all thinkable classes ## Now just assumes the class has not been defined above ## any(c("numeric", "integer", "hms", "Date", "timediff") %in% cl_d) & out <- "continuous" } else { out <- "unknown" } out } } #' Recognised data types from data_type #' #' @returns vector #' @export #' #' @examples #' data_types() data_types <- function() { list( "empty" = list(descr="Variable of all NAs",classes="Any class"), "monotone" = list(descr="Variable with only one unique value",classes="Any class"), "dichotomous" = list(descr="Variable with only two unique values",classes="Any class"), "categorical"= list(descr="Factor variable",classes="factor (ordered or unordered)"), "text"= list(descr="Character variable",classes="character"), "datetime"= list(descr="Variable of time, date or datetime values",classes="hms, Date, POSIXct and POSIXt"), "continuous"= list(descr="Numeric variable",classes="numeric, integer or double"), "unknown"= list(descr="Anything not falling within the previous",classes="Any other class") ) } #' Implemented functions #' #' @description #' Library of supported functions. The list name and "descr" element should be #' unique for each element on list. #' #' #' @returns list #' @export #' #' @examples #' supported_functions() supported_functions <- function() { list( lm = list( descr = "Linear regression model", design = "cross-sectional", out.type = "continuous", fun = "stats::lm", args.list = NULL, formula.str = "{outcome.str}~{paste(vars,collapse='+')}", table.fun = "gtsummary::tbl_regression", table.args.list = list(exponentiate = FALSE) ), glm = list( descr = "Logistic regression model", design = "cross-sectional", out.type = "dichotomous", fun = "stats::glm", args.list = list(family = "binomial"), formula.str = "{outcome.str}~{paste(vars,collapse='+')}", table.fun = "gtsummary::tbl_regression", table.args.list = list() ), polr = list( descr = "Ordinal logistic regression model", design = "cross-sectional", out.type = c("categorical"), fun = "MASS::polr", args.list = list( Hess = TRUE, method = "logistic" ), formula.str = "{outcome.str}~{paste(vars,collapse='+')}", table.fun = "gtsummary::tbl_regression", table.args.list = list() ) ) } #' Get possible regression models #' #' @param data data #' #' @returns character vector #' @export #' #' @examples #' mtcars |> #' default_parsing() |> #' dplyr::pull("cyl") |> #' possible_functions(design = "cross-sectional") #' #' mtcars |> #' default_parsing() |> #' dplyr::select("cyl") |> #' possible_functions(design = "cross-sectional") possible_functions <- function(data, design = c("cross-sectional")) { # # data <- if (is.reactive(data)) data() else data if (is.data.frame(data)) { data <- data[[1]] } design <- match.arg(design) type <- data_type(data) design_ls <- supported_functions() |> lapply(\(.x){ if (design %in% .x$design) { .x } }) if (type == "unknown") { out <- type } else { out <- design_ls |> lapply(\(.x){ if (type %in% .x$out.type) { .x$descr } }) |> unlist() } unname(out) } #' Get the function options based on the selected function description #' #' @param data vector #' #' @returns list #' @export #' #' @examples #' mtcars |> #' default_parsing() |> #' dplyr::pull(mpg) |> #' possible_functions(design = "cross-sectional") |> #' (\(.x){ #' .x[[1]] #' })() |> #' get_fun_options() get_fun_options <- function(data) { descrs <- supported_functions() |> lapply(\(.x){ .x$descr }) |> unlist() supported_functions() |> (\(.x){ .x[match(data, descrs)] })() } #' Wrapper to create regression model based on supported models #' #' @description #' Output is a concatenated list of model information and model #' #' #' @param data data #' @param outcome.str name of outcome variable #' @param fun.descr Description of chosen function matching description in #' "supported_functions()" #' @param fun name of custom function. Default is NULL. #' @param formula.str custom formula glue string. Default is NULL. #' @param args.list custom character string to be converted using #' argsstring2list() or list of arguments. Default is NULL. #' @param ... ignored #' #' @returns list #' @export #' @rdname regression_model #' #' @examples #' \dontrun{ #' gtsummary::trial |> #' regression_model( #' outcome.str = "age", #' fun = "stats::lm", #' formula.str = "{outcome.str}~.", #' args.list = NULL #' ) #' ls <- regression_model_list(data = default_parsing(mtcars), outcome.str = "cyl", fun.descr = "Ordinal logistic regression model") #' summary(ls$model) #' ls <- regression_model_list(data = default_parsing(mtcars), outcome.str = "mpg", fun.descr = "Linear regression model") #' #' ls <- regression_model_list(data = default_parsing(gtsummary::trial), outcome.str = "trt", fun.descr = "Logistic regression model") #' tbl <- gtsummary::tbl_regression(ls$model, exponentiate = TRUE) #' m <- gtsummary::trial |> #' default_parsing() |> #' regression_model( #' outcome.str = "trt", #' fun = "stats::glm", #' formula.str = "{outcome.str}~.", #' args.list = list(family = "binomial") #' ) #' tbl2 <- gtsummary::tbl_regression(m, exponentiate = TRUE) #' broom::tidy(ls$model) #' broom::tidy(m) #' } regression_model_list <- function(data, outcome.str, fun.descr, fun = NULL, formula.str = NULL, args.list = NULL, vars = NULL, ...) { options <- get_fun_options(fun.descr) |> (\(.x){ .x[[1]] })() ## Custom, specific fun, args and formula options if (is.null(formula.str)) { formula.str.c <- options$formula.str } else { formula.str.c <- formula.str } if (is.null(fun)) { fun.c <- options$fun } else { fun.c <- fun } if (is.null(args.list)) { args.list.c <- options$args.list } else { args.list.c <- args.list } if (is.character(args.list.c)) args.list.c <- argsstring2list(args.list.c) ## Handling vars to print code if (is.null(vars)) { vars <- names(data)[!names(data) %in% outcome.str] } else { if (outcome.str %in% vars) { vars <- vars[!vars %in% outcome.str] } } parameters <- list( data = data, fun = fun.c, formula.str = glue::glue(formula.str.c), args.list = args.list.c ) model <- do.call( regression_model, parameters ) parameters_code <- Filter( length, modifyList(parameters, list( data = as.symbol("df"), formula.str = as.character(glue::glue(formula.str.c)), outcome.str = NULL # args.list = NULL, )) ) ## The easiest solution was to simple paste as a string ## The rlang::call2 or rlang::expr functions would probably work as well # code <- glue::glue("FreesearchR::regression_model({parameters_print}, args.list=list({list2str(args.list.c)}))", .null = "NULL") code <- rlang::call2("regression_model", !!!parameters_code, .ns = "FreesearchR") list( options = options, model = model, code = expression_string(code) ) } list2str <- function(data) { out <- purrr::imap(data, \(.x, .i){ if (is.logical(.x)) { arg <- .x } else { arg <- glue::glue("'{.x}'") } glue::glue("{.i} = {arg}") }) |> unlist() |> paste(collapse = (", ")) if (out == "") { return(NULL) } else { out } } #' @returns list #' @export #' @rdname regression_model #' #' @examples #' \dontrun{ #' gtsummary::trial |> #' regression_model_uv( #' outcome.str = "trt", #' fun = "stats::glm", #' args.list = list(family = stats::binomial(link = "logit")) #' ) |> #' lapply(broom::tidy) |> #' dplyr::bind_rows() #' ms <- regression_model_uv_list(data = default_parsing(mtcars), outcome.str = "mpg", fun.descr = "Linear regression model") #' ms$code #' ls <- regression_model_uv_list(data = default_parsing(mtcars), outcome.str = "am", fun.descr = "Logistic regression model") #' ls$code #' lapply(ms$model, broom::tidy) |> dplyr::bind_rows() #' } regression_model_uv_list <- function(data, outcome.str, fun.descr, fun = NULL, formula.str = NULL, args.list = NULL, vars = NULL, ...) { options <- get_fun_options(fun.descr) |> (\(.x){ .x[[1]] })() ## Custom, specific fun, args and formula options if (is.null(formula.str)) { formula.str.c <- options$formula.str } else { formula.str.c <- formula.str } if (is.null(fun)) { fun.c <- options$fun } else { fun.c <- fun } if (is.null(args.list)) { args.list.c <- options$args.list } else { args.list.c <- args.list } if (is.character(args.list.c)) args.list.c <- argsstring2list(args.list.c) ## Handling vars to print code if (is.null(vars)) { vars <- names(data)[!names(data) %in% outcome.str] } else { if (outcome.str %in% vars) { vars <- vars[!vars %in% outcome.str] } } # assertthat::assert_that("character" %in% class(fun), # msg = "Please provide the function as a character vector." # ) # model <- do.call( # regression_model, # c( # list(data = data), # list(outcome.str = outcome.str), # list(fun = fun.c), # list(formula.str = formula.str.c), # args.list.c # ) # ) model <- vars |> lapply(\(.var){ parameters <- list( fun = fun.c, data = data[c(outcome.str, .var)], formula.str = as.character(glue::glue(gsub("vars", ".var", formula.str.c))), args.list = args.list.c ) out <- do.call( regression_model, parameters ) ## This is the very long version ## Handles deeply nested glue string # code <- glue::glue("FreesearchR::regression_model(data=df,{list2str(modifyList(parameters,list(data=NULL,args.list=list2str(args.list.c))))})") code <- rlang::call2("regression_model", !!!modifyList(parameters, list(data = as.symbol("df"), args.list = args.list.c)), .ns = "FreesearchR") REDCapCAST::set_attr(out, code, "code") }) code <- model |> lapply(\(.x)REDCapCAST::get_attr(.x, "code")) |> lapply(expression_string) |> pipe_string(collapse = ",\n") |> (\(.x){ paste0("list(\n", .x, ")") })() list( options = options, model = model, code = code ) } # regression_model(mtcars, fun = "stats::lm", formula.str = "mpg~cyl")