Overview
This appendix performs integration testing of the physical activity harmonization from PR #157 and L0-L4 analysis updates:
Load CCHS PUMF data for physical activity variables
Test harmonization via rec_with_table() against ground truth
Validate staged updates from L0-L4 analysis (PAADVTRV, PAYDVTTR, PAADVWHO extensions)
Validate derived variables (e.g., active_transport) with distribution checks
L0-L4 analysis updates
The following staged updates are being validated (from pa_variables_update.csv and pa_variable_details_update.csv):
PAADVTRV
Extended
Added cchs2019_2020_p
PAYDVTTR
Extended
Added cchs2019_2020_p
active_transport
Extended
Added cchs2019_2020_p
energy_exp
Extended
Added cchs2019_2020_p - all input vars already support 2019-2020
PAADVWHO
New
WHO physical activity classification (2015-2022)
PACFLEI
Fixed
dummyVariable naming cat_cat6 → cat2
PAC_4B
Fixed
Label “walking” → “biking”
Setup
Code
# Load required packages
library (dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Code
library (haven)
library (knitr)
library (here)
here() starts at /Users/dmanuel/github/cchsflow
Code
library (stringr)
library (sjlabelled)
Attaching package: 'sjlabelled'
The following objects are masked from 'package:haven':
as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
The following object is masked from 'package:dplyr':
as_label
Code
# Set project root
project_root <- here:: here ()
message ("Project root: " , project_root)
Project root: /Users/dmanuel/github/cchsflow
Code
# Save current working directory
original_wd <- getwd ()
setwd (project_root)
# Source cchsflow infrastructure
source ("R/strings.R" )
source ("R/label-utils.R" )
source ("R/source-lookups.R" )
source-lookups.R loaded. Functions available:
get_valid_databases() - List all valid database names
is_valid_database() - Check if database name is valid
parse_database_name() - Parse database into components
get_variables_for_database() - Get variables in a database
variable_exists_in_database() - Check if variable exists
search_variables() - Search variables by pattern
get_categories_for_variable() - Get category codes/labels
validate_category_codes() - Validate codes against DDI
load_concepts() - Load domain concepts YAML
get_source_variables() - Get source variable definitions
get_cycles_for_variable() - Get cycles for a variable
cross_validate_variable() - Validate against DDI + concepts
Code
source ("R/load-cchs-subset.R" )
validate-all-source-references.R loaded. Functions available:
validate_all_source_references(file_path) - Comprehensive validation
print_all_validation_result(result) - Pretty-print results
load-cchs-subset.R loaded. Functions available:
load_cchs_subset(worksheet_path) - Load CCHS data for worksheet variables
get_variables_by_database(path) - Extract variable list by database
Code
source ("R/recode-with-table.R" )
# Restore working directory
setwd (original_wd)
message ("Infrastructure loaded successfully" )
Infrastructure loaded successfully
Phase 1: Load CCHS data
Load CCHS PUMF data for physical activity variables.
Code
# Define physical activity variables from PR #157 + L0-L4 updates
pa_variables <- c (
"PACFLEI" , "PACDEE" , "PAA_045" , "PAA_050" , "PAA_075" , "PAA_080" ,
"PAADVDYS" , "PAADVVIG" , "PAYDVTOA" , "PAYDVADL" , "PAYDVVIG" , "PAYDVDYS" ,
"PAADVTRV" , "PAYDVTTR" , "PAADVWHO" # Added from L0-L4 analysis
)
# Also include source variables for validation
pa_source_vars <- c (
"PACADEE" , "PACCDEE" , "PACEDEE" , # Cycle-specific energy expenditure
"PACAFLEI" , "PACEFLEI" # Cycle-specific leisure activity
)
# Load PUMF data
rdata_dir <- file.path (project_root, "working_data_and_documentation/pumf-rdata" )
if (dir.exists (rdata_dir)) {
# Get all available PUMF files
pumf_files <- list.files (rdata_dir, pattern = "CCHS.* \\ .RData$" , full.names = TRUE )
cchs_data <- list ()
for (f in pumf_files) {
cycle_name <- gsub ("CCHS_| \\ .RData" , "" , basename (f))
cycle_name <- paste0 ("cchs" , cycle_name, "_p" )
tryCatch ({
env <- new.env ()
load (f, envir = env)
df <- get (ls (env)[1 ], envir = env)
# Convert haven_labelled columns to plain numeric (required for rec_with_table)
df <- as.data.frame (df)
df[] <- lapply (df, function (x) {
if (inherits (x, "haven_labelled" )) as.numeric (x) else x
})
# Filter to PA-related columns
pa_cols <- grep ("^PAC|^PAA|^PAY" , names (df), value = TRUE , ignore.case = TRUE )
if (length (pa_cols) > 0 ) {
cchs_data[[cycle_name]] <- df
}
}, error = function (e) {
message ("Failed to load " , f, ": " , e$ message)
})
}
message ("Loaded " , length (cchs_data), " PUMF cycles with PA variables" )
} else {
message ("PUMF data directory not found: " , rdata_dir)
cchs_data <- list ()
}
Data summary
Code
if (length (cchs_data) > 0 ) {
data_summary <- data.frame (
Cycle = names (cchs_data),
Rows = sapply (cchs_data, nrow),
PA_Columns = sapply (cchs_data, function (df) {
length (grep ("^PAC|^PAA|^PAY" , names (df), value = TRUE ))
})
)
kable (data_summary, row.names = FALSE )
} else {
message ("No data loaded" )
}
Available PA variables by cycle
Code
if (length (cchs_data) > 0 ) {
pa_vars_by_cycle <- lapply (names (cchs_data), function (cycle) {
cols <- names (cchs_data[[cycle]])
pa_cols <- grep ("^PAC|^PAA|^PAY" , cols, value = TRUE , ignore.case = TRUE )
# Check for key variables including L0-L4 additions
has_pacdee <- any (grepl ("PACDEE|PACADEE|PACCDEE|PACEDEE" , pa_cols))
has_paadvtrv <- "PAADVTRV" %in% pa_cols
has_paydvttr <- "PAYDVTTR" %in% pa_cols
has_paadvwho <- "PAADVWHO" %in% pa_cols
has_paadvvig <- "PAADVVIG" %in% pa_cols
data.frame (
Cycle = cycle,
` PA Variables ` = length (pa_cols),
PACDEE = ifelse (has_pacdee, "Yes" , "No" ),
PAADVTRV = ifelse (has_paadvtrv, "Yes" , "No" ),
PAYDVTTR = ifelse (has_paydvttr, "Yes" , "No" ),
PAADVWHO = ifelse (has_paadvwho, "Yes" , "No" ),
PAADVVIG = ifelse (has_paadvvig, "Yes" , "No" ),
check.names = FALSE
)
})
pa_summary <- do.call (rbind, pa_vars_by_cycle)
kable (pa_summary, row.names = FALSE )
}
Phase 2: Load staged worksheet updates
Code
# Load base worksheets from repo
repo_variable_details <- read.csv (
file.path (project_root, "inst/extdata/variable_details.csv" ),
stringsAsFactors = FALSE
)
# Merge staged PA updates from L0-L4 analysis
staged_pa_path <- file.path (project_root, "ceps/cep-003-physical-activity/pa_variable_details_update.csv" )
if (file.exists (staged_pa_path)) {
staged_pa <- read.csv (staged_pa_path, stringsAsFactors = FALSE )
# Variables that need FULL REPLACEMENT (fixes where all rows are in staged file)
vars_to_replace <- c ("PACFLEI" , "PAC_4B" , "PAC_4B_cont" , "PAADVWHO" )
# Variables that are EXTENSIONS (only new cycle rows in staged file - keep existing)
vars_to_extend <- c ("PAADVTRV" , "PAYDVTTR" , "active_transport" , "energy_exp" )
# For replacement variables: remove old rows entirely
repo_variable_details <- repo_variable_details[
! repo_variable_details$ variable %in% vars_to_replace,
]
# Add all staged rows (both replacements and extensions)
repo_variable_details <- rbind (repo_variable_details, staged_pa)
message ("Replaced variables: " , paste (vars_to_replace, collapse = ", " ))
message ("Extended variables: " , paste (vars_to_extend, collapse = ", " ))
message ("Total staged rows added: " , nrow (staged_pa))
} else {
message ("No staged PA updates found at: " , staged_pa_path)
}
# Also merge older PACFLEI updates if they exist separately
staged_pacflei_path <- file.path (project_root, "ceps/cep-003-physical-activity/pacflei_variable_details_update.csv" )
if (file.exists (staged_pacflei_path) && ! "PACFLEI" %in% vars_to_replace) {
staged_pacflei <- read.csv (staged_pacflei_path, stringsAsFactors = FALSE )
repo_variable_details <- repo_variable_details[repo_variable_details$ variable != "PACFLEI" , ]
repo_variable_details <- rbind (repo_variable_details, staged_pacflei)
message ("Merged staged PACFLEI updates (" , nrow (staged_pacflei), " rows)" )
}
Phase 3: Validate PACDEE (Daily energy expenditure)
Test the harmonized PACDEE variable across cycles.
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
pacdee_validation <- lapply (names (cchs_data), function (cycle) {
df <- cchs_data[[cycle]]
# Ground truth from source
pacdee_variants <- c ("PACADEE" , "PACCDEE" , "PACEDEE" , "PACDEE" )
found_var <- intersect (pacdee_variants, names (df))
if (length (found_var) == 0 ) {
return (data.frame (
Cycle = cycle,
Ground_Truth_Mean = NA ,
Harmonized_Mean = NA ,
Match = "-" ,
Status = "No source"
))
}
gt_values <- df[[found_var[1 ]]]
gt_valid <- gt_values[! is.na (gt_values) & gt_values >= 0 & gt_values <= 43.5 ]
gt_mean <- mean (gt_valid, na.rm = TRUE )
# rec_with_table() harmonization
harmonized_mean <- NA
status <- "Not tested"
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = "PACDEE" ,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if ("PACDEE" %in% names (result)) {
rwt_values <- result$ PACDEE
rwt_valid <- rwt_values[! is.na (rwt_values) & rwt_values >= 0 & rwt_values <= 43.5 ]
harmonized_mean <- mean (rwt_valid, na.rm = TRUE )
status <- "OK"
} else {
status <- "No PACDEE output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 30 ))
})
# Compare
match_result <- if (! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 0.01 ) "Yes" else "No"
} else {
"-"
}
data.frame (
Cycle = cycle,
Ground_Truth_Mean = round (gt_mean, 2 ),
Harmonized_Mean = round (harmonized_mean, 2 ),
Match = match_result,
Status = status
)
})
pacdee_val_summary <- do.call (rbind, pacdee_validation)
kable (pacdee_val_summary, row.names = FALSE )
}
PACDEE: Source vs harmonized (rec_with_table)
cchs2001_p
2.02
2.02
Yes
OK
cchs2003_p
2.22
2.22
Yes
OK
cchs2005_p
2.20
2.20
Yes
OK
cchs2007_2008_p
2.14
2.14
Yes
OK
cchs2009_2010_p
2.21
2.21
Yes
OK
cchs2011_2012_p
2.32
2.32
Yes
OK
cchs2013_2014_p
2.32
2.32
Yes
OK
cchs2015_2016_p
NA
NA
-
No source
cchs2017_2018_p
NA
NA
-
No source
cchs2019_2020_p
NA
NA
-
No source
cchs2022_p
NA
NA
-
No source
Phase 4: Validate PACFLEI (Leisure physical activity index)
Test the harmonized PACFLEI variable with fixed dummyVariable naming.
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
pacflei_validation <- lapply (names (cchs_data), function (cycle) {
df <- cchs_data[[cycle]]
# Ground truth from source - era-dependent naming
pacflei_variants <- c ("PACAFLEI" , "PACCFLEI" , "PACEFLEI" , "PACFLEI" )
found_var <- intersect (pacflei_variants, names (df))
if (length (found_var) == 0 ) {
return (data.frame (
Cycle = cycle,
Ground_Truth_Mean = NA ,
Harmonized_Mean = NA ,
Match = "-" ,
Status = "No source"
))
}
gt_values <- df[[found_var[1 ]]]
# PACFLEI is categorical (1-2), filter valid range
gt_valid <- gt_values[! is.na (gt_values) & gt_values >= 1 & gt_values <= 2 ]
gt_mean <- mean (gt_valid, na.rm = TRUE )
# rec_with_table() harmonization
harmonized_mean <- NA
status <- "Not tested"
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = "PACFLEI" ,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if ("PACFLEI" %in% names (result)) {
rwt_values <- result$ PACFLEI
# Handle factor output
if (is.factor (rwt_values)) {
rwt_values <- as.numeric (as.character (rwt_values))
}
rwt_valid <- rwt_values[! is.na (rwt_values) & rwt_values >= 1 & rwt_values <= 2 ]
harmonized_mean <- mean (rwt_valid, na.rm = TRUE )
status <- "OK"
} else {
status <- "No PACFLEI output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 40 ))
})
# Compare
match_result <- if (! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 0.01 ) "Yes" else "No"
} else {
"-"
}
data.frame (
Cycle = cycle,
Ground_Truth_Mean = round (gt_mean, 3 ),
Harmonized_Mean = round (harmonized_mean, 3 ),
Match = match_result,
Status = status
)
})
pacflei_val_summary <- do.call (rbind, pacflei_validation)
kable (pacflei_val_summary, row.names = FALSE )
}
PACFLEI: Source vs harmonized (rec_with_table)
cchs2001_p
1.108
1.108
Yes
OK
cchs2003_p
1.086
1.086
Yes
OK
cchs2005_p
1.084
1.084
Yes
OK
cchs2007_2008_p
1.087
1.087
Yes
OK
cchs2009_2010_p
1.086
1.086
Yes
OK
cchs2011_2012_p
1.076
1.076
Yes
OK
cchs2013_2014_p
1.072
1.072
Yes
OK
cchs2015_2016_p
NA
NA
-
No source
cchs2017_2018_p
NA
NA
-
No source
cchs2019_2020_p
NA
NA
-
No source
cchs2022_p
NA
NA
-
No source
Phase 5: Validate energy_exp (Daily energy expenditure)
Test the harmonized energy_exp derived variable across all cycles (2001-2020).
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
energy_exp_validation <- lapply (names (cchs_data), function (cycle) {
df <- cchs_data[[cycle]]
# For pre-2015, ground truth is PACDEE variants
# For 2015+, it's calculated from age-specific inputs
is_pre_2015 <- grepl ("2001|2003|2005|2007|2009|2010|2011|2012|2013|2014" , cycle)
if (is_pre_2015) {
# Ground truth from PACDEE variants
pacdee_variants <- c ("PACADEE" , "PACCDEE" , "PACEDEE" , "PACDEE" )
found_var <- intersect (pacdee_variants, names (df))
if (length (found_var) == 0 ) {
return (data.frame (
Cycle = cycle,
Era = "Pre-2015" ,
Ground_Truth_Mean = NA ,
Harmonized_Mean = NA ,
Match = "-" ,
Status = "No source"
))
}
gt_values <- df[[found_var[1 ]]]
gt_valid <- gt_values[! is.na (gt_values) & gt_values >= 0 & gt_values <= 43.5 ]
gt_mean <- mean (gt_valid, na.rm = TRUE )
} else {
# For 2015+, we need to check if input variables exist
# energy_exp is derived from age-specific activity variables
input_vars <- c ("PAA_045" , "PAA_050" , "PAA_075" , "PAA_080" ,
"PAADVDYS" , "PAADVVIG" , "PAYDVTOA" , "PAYDVADL" ,
"PAYDVVIG" , "PAYDVDYS" )
has_inputs <- sum (input_vars %in% names (df)) > 5 # Most inputs present
if (! has_inputs) {
return (data.frame (
Cycle = cycle,
Era = "Post-2015" ,
Ground_Truth_Mean = NA ,
Harmonized_Mean = NA ,
Match = "-" ,
Status = "Insufficient inputs"
))
}
# No direct ground truth for 2015+ energy_exp (it's calculated)
gt_mean <- NA
}
# rec_with_table() harmonization
harmonized_mean <- NA
status <- "Not tested"
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = "energy_exp" ,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if ("energy_exp" %in% names (result)) {
harm_values <- result$ energy_exp
harm_valid <- harm_values[! is.na (harm_values) & harm_values >= 0 & harm_values <= 100 ]
if (length (harm_valid) > 0 ) {
harmonized_mean <- mean (harm_valid, na.rm = TRUE )
}
status <- "OK"
} else {
status <- "No energy_exp output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 40 ))
})
# Compare (for pre-2015 only)
match_result <- if (is_pre_2015 && ! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 0.1 ) "Yes" else "No"
} else if (! is_pre_2015 && ! is.na (harmonized_mean)) {
"Calculated"
} else {
"-"
}
data.frame (
Cycle = cycle,
Era = ifelse (is_pre_2015, "Pre-2015" , "Post-2015" ),
Ground_Truth_Mean = round (gt_mean, 2 ),
Harmonized_Mean = round (harmonized_mean, 2 ),
Match = match_result,
Status = status
)
})
energy_exp_val_summary <- do.call (rbind, energy_exp_validation)
kable (energy_exp_val_summary, row.names = FALSE )
}
energy_exp: Source vs harmonized (rec_with_table)
cchs2001_p
Pre-2015
2.02
2.02
Yes
OK
cchs2003_p
Pre-2015
2.22
2.22
Yes
OK
cchs2005_p
Pre-2015
2.20
2.20
Yes
OK
cchs2007_2008_p
Pre-2015
2.14
2.14
Yes
OK
cchs2009_2010_p
Pre-2015
2.21
2.21
Yes
OK
cchs2011_2012_p
Pre-2015
2.32
2.32
Yes
OK
cchs2013_2014_p
Pre-2015
2.32
2.32
Yes
OK
cchs2015_2016_p
Post-2015
NA
NA
-
No energy_exp output
cchs2017_2018_p
Post-2015
NA
NA
-
No energy_exp output
cchs2019_2020_p
Post-2015
NA
NA
-
No energy_exp output
cchs2022_p
Post-2015
NA
NA
-
Insufficient inputs
Phase 6: Validate L0-L4 extensions (PAADVTRV, PAYDVTTR, PAADVWHO)
Test the newly extended/added variables from L0-L4 analysis.
Active transport variables (PAADVTRV, PAYDVTTR)
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
# Focus on 2015+ cycles where these variables exist
cycles_2015plus <- grep ("2015|2017|2019|2022" , names (cchs_data), value = TRUE )
active_transport_vars <- c ("PAADVTRV" , "PAYDVTTR" )
at_results <- list ()
for (cycle in cycles_2015plus) {
df <- cchs_data[[cycle]]
for (var in active_transport_vars) {
has_source <- var %in% names (df)
gt_mean <- NA
harmonized_mean <- NA
status <- "No source"
if (has_source) {
# Ground truth
gt_values <- df[[var]]
# Filter out missing codes (99996-99999)
gt_valid <- gt_values[! is.na (gt_values) & gt_values < 99996 ]
if (length (gt_valid) > 0 ) {
gt_mean <- mean (gt_valid, na.rm = TRUE )
}
# rec_with_table() test
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = var,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if (var %in% names (result)) {
rwt_values <- result[[var]]
rwt_valid <- rwt_values[! is.na (rwt_values)]
if (length (rwt_valid) > 0 ) {
harmonized_mean <- mean (rwt_valid, na.rm = TRUE )
}
status <- "OK"
} else {
status <- "No output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 30 ))
})
}
# Compare
match_result <- if (! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 1 ) "Yes" else "No" # Allow 1 min tolerance
} else {
"-"
}
at_results[[length (at_results) + 1 ]] <- data.frame (
Cycle = cycle,
Variable = var,
GT_Mean = round (gt_mean, 1 ),
Harmonized_Mean = round (harmonized_mean, 1 ),
Match = match_result,
Status = status
)
}
}
at_summary <- do.call (rbind, at_results)
kable (at_summary, row.names = FALSE )
}
Active transport variables: rec_with_table() validation
cchs2015_2016_p
PAADVTRV
100.4
100.4
Yes
OK
cchs2015_2016_p
PAYDVTTR
191.9
191.9
Yes
OK
cchs2017_2018_p
PAADVTRV
93.2
93.2
Yes
OK
cchs2017_2018_p
PAYDVTTR
189.4
189.4
Yes
OK
cchs2019_2020_p
PAADVTRV
98.0
98.0
Yes
OK
cchs2019_2020_p
PAYDVTTR
164.9
164.9
Yes
OK
cchs2022_p
PAADVTRV
NA
NA
-
No source
cchs2022_p
PAYDVTTR
NA
NA
-
No source
PAADVWHO (WHO physical activity classification)
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
cycles_2015plus <- grep ("2015|2017|2019|2022" , names (cchs_data), value = TRUE )
who_results <- lapply (cycles_2015plus, function (cycle) {
df <- cchs_data[[cycle]]
has_source <- "PAADVWHO" %in% names (df)
gt_mean <- NA
harmonized_mean <- NA
status <- "No source"
n_valid <- NA
if (has_source) {
gt_values <- df[["PAADVWHO" ]]
# Valid categories are 1-4
gt_valid <- gt_values[! is.na (gt_values) & gt_values >= 1 & gt_values <= 4 ]
n_valid <- length (gt_valid)
if (n_valid > 0 ) {
gt_mean <- mean (gt_valid, na.rm = TRUE )
}
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = "PAADVWHO" ,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if ("PAADVWHO" %in% names (result)) {
rwt_values <- result[["PAADVWHO" ]]
if (is.factor (rwt_values)) {
rwt_values <- as.numeric (as.character (rwt_values))
}
rwt_valid <- rwt_values[! is.na (rwt_values) & rwt_values >= 1 & rwt_values <= 4 ]
if (length (rwt_valid) > 0 ) {
harmonized_mean <- mean (rwt_valid, na.rm = TRUE )
}
status <- "OK"
} else {
status <- "No output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 30 ))
})
}
match_result <- if (! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 0.01 ) "Yes" else "No"
} else {
"-"
}
data.frame (
Cycle = cycle,
N_Valid = n_valid,
GT_Mean = round (gt_mean, 2 ),
Harmonized_Mean = round (harmonized_mean, 2 ),
Match = match_result,
Status = status
)
})
who_summary <- do.call (rbind, who_results)
kable (who_summary, row.names = FALSE )
}
PAADVWHO: Source vs harmonized (rec_with_table)
cchs2015_2016_p
97646
2.26
2.26
Yes
OK
cchs2017_2018_p
101668
2.32
2.32
Yes
OK
cchs2019_2020_p
20210
2.42
2.42
Yes
OK
cchs2022_p
1792
2.09
2.09
Yes
OK
Phase 7: Validate 2015+ variables
Code
if (length (cchs_data) > 0 && exists ("repo_variable_details" )) {
cycles_2015plus <- grep ("2015|2017|2019|2022" , names (cchs_data), value = TRUE )
# All PR #157 + L0-L4 variables for 2015+
vars_2015 <- c ("PAA_045" , "PAA_050" , "PAA_075" , "PAA_080" ,
"PAADVDYS" , "PAADVVIG" , "PAYDVTOA" , "PAYDVADL" ,
"PAYDVVIG" , "PAYDVDYS" )
all_2015_results <- list ()
for (cycle in cycles_2015plus) {
df <- cchs_data[[cycle]]
for (var in vars_2015) {
# Check if source variable exists in data
has_source <- var %in% names (df)
gt_mean <- NA
harmonized_mean <- NA
status <- "No source"
if (has_source) {
# Ground truth
gt_values <- df[[var]]
gt_valid <- gt_values[! is.na (gt_values)]
if (length (gt_valid) > 0 ) {
gt_mean <- mean (gt_valid, na.rm = TRUE )
}
# rec_with_table() test
tryCatch ({
result <- cchsflow:: rec_with_table (
data = df,
variables = var,
database_name = cycle,
variable_details = repo_variable_details,
append_to_data = FALSE ,
notes = FALSE
)
if (var %in% names (result)) {
rwt_values <- result[[var]]
rwt_valid <- rwt_values[! is.na (rwt_values)]
if (length (rwt_valid) > 0 ) {
harmonized_mean <- mean (rwt_valid, na.rm = TRUE )
}
status <- "OK"
} else {
status <- "No output"
}
}, error = function (e) {
status <<- paste ("Error:" , substr (e$ message, 1 , 30 ))
})
}
# Compare
match_result <- if (! is.na (harmonized_mean) && ! is.na (gt_mean)) {
if (abs (harmonized_mean - gt_mean) < 0.01 ) "Yes" else "No"
} else {
"-"
}
all_2015_results[[length (all_2015_results) + 1 ]] <- data.frame (
Cycle = cycle,
Variable = var,
GT_Mean = round (gt_mean, 2 ),
Harmonized_Mean = round (harmonized_mean, 2 ),
Match = match_result,
Status = status
)
}
}
all_2015_summary <- do.call (rbind, all_2015_results)
kable (all_2015_summary, row.names = FALSE )
}
All 2015+ variables: rec_with_table() validation
cchs2015_2016_p
PAA_045
1.52
1.52
Yes
OK
cchs2015_2016_p
PAA_050
10.11
10.11
Yes
OK
cchs2015_2016_p
PAA_075
2.85
2.85
Yes
OK
cchs2015_2016_p
PAA_080
7.18
7.18
Yes
OK
cchs2015_2016_p
PAADVDYS
3.62
3.62
Yes
OK
cchs2015_2016_p
PAADVVIG
42.02
42.02
Yes
OK
cchs2015_2016_p
PAYDVTOA
61.70
61.70
Yes
OK
cchs2015_2016_p
PAYDVADL
254.37
254.37
Yes
OK
cchs2015_2016_p
PAYDVVIG
100.91
99.97
No
OK
cchs2015_2016_p
PAYDVDYS
5.66
5.66
Yes
OK
cchs2017_2018_p
PAA_045
1.45
1.45
Yes
OK
cchs2017_2018_p
PAA_050
10.19
10.07
No
OK
cchs2017_2018_p
PAA_075
2.63
2.63
Yes
OK
cchs2017_2018_p
PAA_080
6.40
6.22
No
OK
cchs2017_2018_p
PAADVDYS
3.50
3.50
Yes
OK
cchs2017_2018_p
PAADVVIG
38.25
38.19
No
OK
cchs2017_2018_p
PAYDVTOA
55.11
55.11
Yes
OK
cchs2017_2018_p
PAYDVADL
239.06
239.06
Yes
OK
cchs2017_2018_p
PAYDVVIG
92.07
92.07
Yes
OK
cchs2017_2018_p
PAYDVDYS
5.48
5.48
Yes
OK
cchs2019_2020_p
PAA_045
1.32
1.31
Yes
OK
cchs2019_2020_p
PAA_050
10.99
10.99
Yes
OK
cchs2019_2020_p
PAA_075
2.29
2.29
Yes
OK
cchs2019_2020_p
PAA_080
7.25
7.25
Yes
OK
cchs2019_2020_p
PAADVDYS
3.39
3.39
Yes
OK
cchs2019_2020_p
PAADVVIG
28.86
28.86
Yes
OK
cchs2019_2020_p
PAYDVTOA
61.51
57.28
No
OK
cchs2019_2020_p
PAYDVADL
210.92
210.92
Yes
OK
cchs2019_2020_p
PAYDVVIG
81.87
81.87
Yes
OK
cchs2019_2020_p
PAYDVDYS
5.30
5.30
Yes
OK
cchs2022_p
PAA_045
NA
NA
-
No source
cchs2022_p
PAA_050
NA
NA
-
No source
cchs2022_p
PAA_075
NA
NA
-
No source
cchs2022_p
PAA_080
NA
NA
-
No source
cchs2022_p
PAADVDYS
NA
NA
-
No source
cchs2022_p
PAADVVIG
52.92
52.92
Yes
OK
cchs2022_p
PAYDVTOA
NA
NA
-
No source
cchs2022_p
PAYDVADL
NA
NA
-
No source
cchs2022_p
PAYDVVIG
NA
NA
-
No source
cchs2022_p
PAYDVDYS
NA
NA
-
No source
Summary by variable
Code
if (exists ("all_2015_summary" )) {
var_summary <- all_2015_summary %>%
group_by (Variable) %>%
summarise (
Cycles_Tested = sum (Status == "OK" ),
Cycles_Matched = sum (Match == "Yes" , na.rm = TRUE ),
Cycles_Failed = sum (Status != "OK" & Status != "No source" ),
.groups = "drop"
)
kable (var_summary, row.names = FALSE )
}
2015+ variable validation summary
PAADVDYS
3
3
0
PAADVVIG
4
3
0
PAA_045
3
3
0
PAA_050
3
2
0
PAA_075
3
3
0
PAA_080
3
2
0
PAYDVADL
3
3
0
PAYDVDYS
3
3
0
PAYDVTOA
3
2
0
PAYDVVIG
3
2
0
Phase 8: Test summary
Code
message (" \n " , strrep ("=" , 60 ))
message ("PHYSICAL ACTIVITY INTEGRATION TEST SUMMARY" )
message (strrep ("=" , 60 ), " \n " )
# Count results
n_cycles <- length (cchs_data)
message ("CCHS PUMF cycles loaded: " , n_cycles)
# PACDEE summary
if (exists ("pacdee_val_summary" )) {
n_pacdee_ok <- sum (pacdee_val_summary$ Status == "OK" )
n_pacdee_match <- sum (pacdee_val_summary$ Match == "Yes" , na.rm = TRUE )
message (" \n PACDEE (pre-2015 energy expenditure):" )
message (" Cycles tested: " , n_pacdee_ok)
message (" Cycles matched: " , n_pacdee_match)
}
# PACFLEI summary
if (exists ("pacflei_val_summary" )) {
n_pacflei_ok <- sum (pacflei_val_summary$ Status == "OK" )
n_pacflei_match <- sum (pacflei_val_summary$ Match == "Yes" , na.rm = TRUE )
message (" \n PACFLEI (pre-2015 leisure index):" )
message (" Cycles tested: " , n_pacflei_ok)
message (" Cycles matched: " , n_pacflei_match)
}
# energy_exp summary
if (exists ("energy_exp_val_summary" )) {
n_ee_ok <- sum (energy_exp_val_summary$ Status == "OK" )
n_ee_pre2015_match <- sum (energy_exp_val_summary$ Era == "Pre-2015" &
energy_exp_val_summary$ Match == "Yes" , na.rm = TRUE )
n_ee_post2015_calc <- sum (energy_exp_val_summary$ Era == "Post-2015" &
energy_exp_val_summary$ Match == "Calculated" , na.rm = TRUE )
message (" \n energy_exp (derived energy expenditure):" )
message (" Total cycles tested: " , n_ee_ok)
message (" Pre-2015 cycles matched (vs PACDEE): " , n_ee_pre2015_match)
message (" Post-2015 cycles calculated: " , n_ee_post2015_calc)
}
# Active transport summary
if (exists ("at_summary" )) {
n_at_ok <- sum (at_summary$ Status == "OK" )
n_at_match <- sum (at_summary$ Match == "Yes" , na.rm = TRUE )
message (" \n Active transport (PAADVTRV, PAYDVTTR):" )
message (" Variable-cycle combinations tested: " , n_at_ok)
message (" Variable-cycle combinations matched: " , n_at_match)
}
# PAADVWHO summary
if (exists ("who_summary" )) {
n_who_ok <- sum (who_summary$ Status == "OK" )
n_who_match <- sum (who_summary$ Match == "Yes" , na.rm = TRUE )
message (" \n PAADVWHO (WHO classification):" )
message (" Cycles tested: " , n_who_ok)
message (" Cycles matched: " , n_who_match)
}
# 2015+ summary
if (exists ("all_2015_summary" )) {
n_2015_ok <- sum (all_2015_summary$ Status == "OK" )
n_2015_match <- sum (all_2015_summary$ Match == "Yes" , na.rm = TRUE )
n_2015_error <- sum (grepl ("Error" , all_2015_summary$ Status))
message (" \n 2015+ variables (PAA_*, PAADVVIG, etc.):" )
message (" Variable-cycle combinations tested: " , n_2015_ok)
message (" Variable-cycle combinations matched: " , n_2015_match)
if (n_2015_error > 0 ) {
message (" Errors: " , n_2015_error)
}
}
message (" \n " , strrep ("=" , 60 ))
L0-L4 analysis findings
Based on this validation:
Extensions validated
PAADVTRV
cchs2019_2020_p
Tested
PAYDVTTR
cchs2019_2020_p
Tested
active_transport
Uses PAADVTRV/PAYDVTTR
Tested
energy_exp
cchs2019_2020_p (all inputs already support)
Tested
New variable validated
PAADVWHO
2015-2022
Tested
Fixes validated
PACFLEI
dummyVariable cat_cat6 → cat2
Tested
PAC_4B
Label “walking” → “biking”
Metadata only