if (basename(getwd()) != "docs") stop("This script should be run in the phonto/docs/ folder")
require(nhanesA)
require(phonto)
options(warn = 1)
nhanesOptions(log.access = TRUE)
stopifnot(isTRUE(nhanesOptions("use.db"))) # too slow otherwise
sort_by <- function(x, by = NULL, ...)
{
if (!inherits(by, "formula")) stop("'by' must be a formula")
f <- .formula2varlist(by, x)
o <- do.call(order, c(unname(f), list(...)))
x[o, , drop = FALSE]
}
## start with tables available in the DB
tableDesc <- nhanesQuery("select * from Metadata.QuestionnaireDescriptions")
dim(tableDesc)
## drop pre-pandemic and limited access tables
tableDesc <- subset(tableDesc, !startsWith(TableName, "P_") & UseConstraints == "None")
dim(tableDesc)
str(tableDesc)
## make this a searchable table
## add TableName without cycle suffix
drop_table_suffix <- function(x) gsub("_[ABCDEFGHIJ]$", "", x)
tableDesc$TableBase <- drop_table_suffix(tableDesc$TableName)
tableDesc <- sort_by(tableDesc, ~ TableBase + TableName)
dim(unique(tableDesc[c("TableBase", "Description", "DataGroup")]))
shortDesc <- function(nh_table, do_missing = FALSE) {
if (interactive() && runif(1) < 0.1) cat("\r", nh_table, " ")
cb <- nhanesCodebook(nh_table)
nvars <- length(cb) - 1 # exclude 1st variable, which is usually ID
## ncases should be same regardless of which variable we pick, but not checking here.
## Maybe something a reimplementation of nhanesAttr() should check
ncases <- try(tail(cb[[2]][[length(cb[[2]])]]$Cumulative, 1), silent = TRUE)
if (inherits(ncases, "try-error")) ncases <- NA_integer_
if (do_missing) {
nmissing <- function(comp) {
## info not always available, so try
e <- try({
varInfoTable <- comp[[length(comp)]]
n <- subset(varInfoTable, Value.Description == "Missing")$Count
if (length(n) != 1L) stop("Missing values in ", nh_table, ": ", nmissing)
n
}, silent = TRUE)
if (inherits(e, "try-error")) NA_integer_ else e
}
nmissing_by_var <- sapply(cb[-1], nmissing)
sprintf("[%d x %d] (%g %% NA)", ncases, nvars,
round(100 * (sum(nmissing_by_var, na.rm = TRUE) /
(ncases * sum(!is.na(nmissing_by_var)))), 1))
}
else
sprintf("[%d x %d]", ncases, nvars)
}
summarizeTables <- function(tableDesc)
{
tableSummary <- (
xtabs(~ TableBase + Description + DataGroup, tableDesc)
|> as.data.frame.table()
|> subset(Freq > 0, select = -Freq)
)
subtableLinks <- function(i) {
## all tables matching i-th row of tableSummary
dmatch <-
subset(tableDesc,
TableBase == tableSummary$TableBase[i] &
Description == tableSummary$Description[i])
tab_links <-
with(dmatch,
{
links <- sprintf("%s %s",
DocFile, TableName, ShortDesc)
## some have DocFile == ""
bad_doc <- trimws(dmatch$DocFile) == ""
links[bad_doc] <-
sprintf("%s %s",
TableName[bad_doc], ShortDesc[bad_doc])
paste(links, collapse = ", ")
})
}
tableSummary[["Tables"]] <- sapply(seq_len(nrow(tableSummary)),
subtableLinks)
rownames(tableSummary) <- as.character(seq_len(nrow(tableSummary)))
tableSummary
}
## This will take a little time
tableDesc <- within(tableDesc, {
ShortDesc <- sapply(TableName, shortDesc)
})
tab_summary <- summarizeTables(tableDesc)
library(toHTML)
cat(toHTML(tab_summary), file = "tables/table-summary.html", sep = "\n")