Data Dictionary Labelling
Code Properties
- Language: R
- Packages:
dplyr,matchmaker,rlang,stringr,purrr
Overview
Functions for applying human-readable labels from a data dictionary to dataset columns and values, and reversing the process back to machine-readable format.
Code
Apply Labels
#' Apply Labels
#'
#' @param data Dataset to apply labeling on
#' @param dict Dictionary to use for application of labeling
#' @param by Column in dict defining the columns in data. Defaults to "variable"
#' @param names_from Column defining where to match names from. Defaults to "variable"
#' @param names_to Column defining replacements for names(data). Defaults to "variable_label"
#' @param from Column with original values. Defaults to "value"
#' @param to Column with replacement values. Defaults to "value_label"
#' @param dataset_name Optional name of dataset to filter dict by
#' @param ... Passed to matchmaker::match_df
#'
#' @return A labelled, display-friendly tibble
#' @export
apply_labels <- function(
data,
dict,
from = "value",
to = "value_label",
by = "variable",
names_from = "variable",
names_to = "variable_label",
dataset_name = NULL,
...
) {
if (!is.null(dataset_name)) {
dict <- dict |>
dplyr::filter(.data$dataset == dataset_name)
globals <- dict |>
dplyr::filter(.data[[by]] == ".global")
} else {
globals <- NULL
}
suppressWarnings({
hold <- matchmaker::match_df(
data,
dictionary = dict,
from = from,
to = to,
by = by,
...
)
if (!is.null(globals) && nrow(globals) > 0) {
purrr::walk2(globals$value, globals$value_label, function(x, y) {
hold <<- hold |>
dplyr::mutate(dplyr::across(
dplyr::where(is.character),
~ stringr::str_replace_all(.x, pattern = x, replacement = y)
))
})
}
})
hold |>
rlang::set_names(
dict[[names_to]][match(names(hold), dict[[names_from]])]
) |>
dplyr::mutate(dplyr::across(
dplyr::where(is.character),
~ dplyr::na_if(.x, ".missing")
))
}Reverse Labels
#' Reverse Labels
#'
#' Reverses apply_labels() back to machine-readable format
#'
#' @inheritParams apply_labels
#'
#' @return An un-labeled, R-friendly tibble
#' @export
reverse_labels <- function(
data,
dict,
from = "value",
to = "value_label",
by = "variable",
names_from = "variable",
names_to = "variable_label",
dataset_name = NULL,
...
) {
if (!is.null(dataset_name)) {
dict <- dict |>
dplyr::filter(.data$dataset == dataset_name)
globals <- dict |>
dplyr::filter(.data[[by]] == ".global")
} else {
globals <- NULL
}
suppressWarnings({
hold <- matchmaker::match_df(
data,
dictionary = dict,
from = to,
to = from,
by = "variable_label",
...
)
if (!is.null(globals) && nrow(globals) > 0) {
purrr::walk2(globals$value_label, globals$value, function(x, y) {
hold <<- hold |>
dplyr::mutate(dplyr::across(
dplyr::where(is.character),
~ stringr::str_replace_all(.x, pattern = x, replacement = y)
))
})
}
})
hold |>
rlang::set_names(
dict[[names_from]][match(names(hold), dict[[names_to]])]
) |>
dplyr::mutate(dplyr::across(
dplyr::where(is.character),
~ dplyr::na_if(.x, ".missing")
))
}Usage
# create dictionary
dictionary <- data.frame(
var = c(rep("x1", 2), rep("x2", 3), rep("x3", 5)),
var_lab = c(rep("Column 1", 2), rep("Column 2", 3), rep("Column 3", 5)),
val = c(c(TRUE, FALSE), c(1:3), letters[1:5]),
val_lab = c(
c("YES", "NO"),
paste0("Group ", c(1:3)),
paste0("Area ", LETTERS[1:5])
)
)
# sample data
dat <- data.frame(
"x1" = rep(c(TRUE, FALSE), 15),
"x2" = rep(c(1:3), 10),
"x3" = rep(letters[1:5], 6)
)
# apply labels for display
labeled_data <- apply_labels(
dat,
dictionary,
from = "val",
to = "val_lab",
by = "var",
names_from = "var",
names_to = "var_lab"
)
# reverse back to machine format
original_data <- reverse_labels(
labeled_data,
dictionary,
from = "val",
to = "val_lab",
by = "var",
names_from = "var",
names_to = "var_lab"
)Appendix
Note created on 2024-06-23 and last modified on 2024-12-13.
See Also
Backlinks
(c) No Clocks, LLC | 2024