Appendix: Integration testing - Physical activity

Author

cchsflow development team

Published

January 18, 2026

Overview

This appendix performs integration testing of the physical activity harmonization from PR #157 and L0-L4 analysis updates:

  1. Load CCHS PUMF data for physical activity variables
  2. Test harmonization via rec_with_table() against ground truth
  3. Validate staged updates from L0-L4 analysis (PAADVTRV, PAYDVTTR, PAADVWHO extensions)
  4. Validate derived variables (e.g., active_transport) with distribution checks

L0-L4 analysis updates

The following staged updates are being validated (from pa_variables_update.csv and pa_variable_details_update.csv):

Variable Change Description
PAADVTRV Extended Added cchs2019_2020_p
PAYDVTTR Extended Added cchs2019_2020_p
active_transport Extended Added cchs2019_2020_p
energy_exp Extended Added cchs2019_2020_p - all input vars already support 2019-2020
PAADVWHO New WHO physical activity classification (2015-2022)
PACFLEI Fixed dummyVariable naming cat_cat6cat2
PAC_4B Fixed Label “walking” → “biking”

Setup

Code
# Load required packages
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
library(haven)
library(knitr)
library(here)
here() starts at /Users/dmanuel/github/cchsflow
Code
library(stringr)
library(sjlabelled)

Attaching package: 'sjlabelled'
The following objects are masked from 'package:haven':

    as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
The following object is masked from 'package:dplyr':

    as_label
Code
# Set project root
project_root <- here::here()
message("Project root: ", project_root)
Project root: /Users/dmanuel/github/cchsflow
Code
# Save current working directory
original_wd <- getwd()
setwd(project_root)

# Source cchsflow infrastructure
source("R/strings.R")
source("R/label-utils.R")
source("R/source-lookups.R")
source-lookups.R loaded. Functions available:
  Database lookups:
    get_valid_databases()      - List all valid database names
    is_valid_database()        - Check if database name is valid
    parse_database_name()      - Parse database into components
  Variable lookups:
    get_variables_for_database() - Get variables in a database
    variable_exists_in_database() - Check if variable exists
    search_variables()         - Search variables by pattern
  Category lookups:
    get_categories_for_variable() - Get category codes/labels
    validate_category_codes()  - Validate codes against DDI
  Concepts lookups:
    load_concepts()            - Load domain concepts YAML
    get_source_variables()     - Get source variable definitions
    get_cycles_for_variable()  - Get cycles for a variable
  Cross-validation:
    cross_validate_variable()  - Validate against DDI + concepts
Code
source("R/load-cchs-subset.R")
validate-all-source-references.R loaded. Functions available:
  validate_all_source_references(file_path)  - Comprehensive validation
  print_all_validation_result(result)        - Pretty-print results
load-cchs-subset.R loaded. Functions available:
  load_cchs_subset(worksheet_path)      - Load CCHS data for worksheet variables
  get_variables_by_database(path)       - Extract variable list by database
Code
source("R/recode-with-table.R")

# Restore working directory
setwd(original_wd)

message("Infrastructure loaded successfully")
Infrastructure loaded successfully

Phase 1: Load CCHS data

Load CCHS PUMF data for physical activity variables.

Code
# Define physical activity variables from PR #157 + L0-L4 updates
pa_variables <- c(
 "PACFLEI", "PACDEE", "PAA_045", "PAA_050", "PAA_075", "PAA_080",
 "PAADVDYS", "PAADVVIG", "PAYDVTOA", "PAYDVADL", "PAYDVVIG", "PAYDVDYS",
 "PAADVTRV", "PAYDVTTR", "PAADVWHO"  # Added from L0-L4 analysis
)

# Also include source variables for validation
pa_source_vars <- c(
 "PACADEE", "PACCDEE", "PACEDEE",  # Cycle-specific energy expenditure
 "PACAFLEI", "PACEFLEI"             # Cycle-specific leisure activity
)

# Load PUMF data
rdata_dir <- file.path(project_root, "working_data_and_documentation/pumf-rdata")

if (dir.exists(rdata_dir)) {
 # Get all available PUMF files
 pumf_files <- list.files(rdata_dir, pattern = "CCHS.*\\.RData$", full.names = TRUE)

 cchs_data <- list()
 for (f in pumf_files) {
   cycle_name <- gsub("CCHS_|\\.RData", "", basename(f))
   cycle_name <- paste0("cchs", cycle_name, "_p")

   tryCatch({
     env <- new.env()
     load(f, envir = env)
     df <- get(ls(env)[1], envir = env)

     # Convert haven_labelled columns to plain numeric (required for rec_with_table)
     df <- as.data.frame(df)
     df[] <- lapply(df, function(x) {
       if (inherits(x, "haven_labelled")) as.numeric(x) else x
     })

     # Filter to PA-related columns
     pa_cols <- grep("^PAC|^PAA|^PAY", names(df), value = TRUE, ignore.case = TRUE)
     if (length(pa_cols) > 0) {
       cchs_data[[cycle_name]] <- df
     }
   }, error = function(e) {
     message("Failed to load ", f, ": ", e$message)
   })
 }

 message("Loaded ", length(cchs_data), " PUMF cycles with PA variables")
} else {
 message("PUMF data directory not found: ", rdata_dir)
 cchs_data <- list()
}

Data summary

Code
if (length(cchs_data) > 0) {
 data_summary <- data.frame(
   Cycle = names(cchs_data),
   Rows = sapply(cchs_data, nrow),
   PA_Columns = sapply(cchs_data, function(df) {
     length(grep("^PAC|^PAA|^PAY", names(df), value = TRUE))
   })
 )
 kable(data_summary, row.names = FALSE)
} else {
 message("No data loaded")
}
Table 1: CCHS cycles loaded with physical activity variables
Cycle Rows PA_Columns
cchs2001_p 130880 79
cchs2003_p 134072 82
cchs2005_p 132221 82
cchs2007_2008_p 131061 88
cchs2009_2010_p 124188 88
cchs2011_2012_p 124929 88
cchs2013_2014_p 127462 88
cchs2015_2016_p 109659 101
cchs2017_2018_p 113290 101
cchs2019_2020_p 108252 101
cchs2022_p 67079 41

Available PA variables by cycle

Code
if (length(cchs_data) > 0) {
 pa_vars_by_cycle <- lapply(names(cchs_data), function(cycle) {
   cols <- names(cchs_data[[cycle]])
   pa_cols <- grep("^PAC|^PAA|^PAY", cols, value = TRUE, ignore.case = TRUE)

   # Check for key variables including L0-L4 additions
   has_pacdee <- any(grepl("PACDEE|PACADEE|PACCDEE|PACEDEE", pa_cols))
   has_paadvtrv <- "PAADVTRV" %in% pa_cols
   has_paydvttr <- "PAYDVTTR" %in% pa_cols
   has_paadvwho <- "PAADVWHO" %in% pa_cols
   has_paadvvig <- "PAADVVIG" %in% pa_cols

   data.frame(
     Cycle = cycle,
     `PA Variables` = length(pa_cols),
     PACDEE = ifelse(has_pacdee, "Yes", "No"),
     PAADVTRV = ifelse(has_paadvtrv, "Yes", "No"),
     PAYDVTTR = ifelse(has_paydvttr, "Yes", "No"),
     PAADVWHO = ifelse(has_paadvwho, "Yes", "No"),
     PAADVVIG = ifelse(has_paadvvig, "Yes", "No"),
     check.names = FALSE
   )
 })
 pa_summary <- do.call(rbind, pa_vars_by_cycle)
 kable(pa_summary, row.names = FALSE)
}
Table 2: Physical activity variables available per cycle
Cycle PA Variables PACDEE PAADVTRV PAYDVTTR PAADVWHO PAADVVIG
cchs2001_p 79 Yes No No No No
cchs2003_p 82 Yes No No No No
cchs2005_p 82 Yes No No No No
cchs2007_2008_p 88 Yes No No No No
cchs2009_2010_p 88 Yes No No No No
cchs2011_2012_p 88 Yes No No No No
cchs2013_2014_p 88 Yes No No No No
cchs2015_2016_p 101 No Yes Yes Yes Yes
cchs2017_2018_p 101 No Yes Yes Yes Yes
cchs2019_2020_p 101 No Yes Yes Yes Yes
cchs2022_p 41 No No No Yes Yes

Phase 2: Load staged worksheet updates

Code
# Load base worksheets from repo
repo_variable_details <- read.csv(
  file.path(project_root, "inst/extdata/variable_details.csv"),
  stringsAsFactors = FALSE
)

# Merge staged PA updates from L0-L4 analysis
staged_pa_path <- file.path(project_root, "ceps/cep-003-physical-activity/pa_variable_details_update.csv")
if (file.exists(staged_pa_path)) {
  staged_pa <- read.csv(staged_pa_path, stringsAsFactors = FALSE)

  # Variables that need FULL REPLACEMENT (fixes where all rows are in staged file)
  vars_to_replace <- c("PACFLEI", "PAC_4B", "PAC_4B_cont", "PAADVWHO")

  # Variables that are EXTENSIONS (only new cycle rows in staged file - keep existing)
  vars_to_extend <- c("PAADVTRV", "PAYDVTTR", "active_transport", "energy_exp")

  # For replacement variables: remove old rows entirely
  repo_variable_details <- repo_variable_details[
    !repo_variable_details$variable %in% vars_to_replace,
  ]

  # Add all staged rows (both replacements and extensions)
  repo_variable_details <- rbind(repo_variable_details, staged_pa)

  message("Replaced variables: ", paste(vars_to_replace, collapse = ", "))
  message("Extended variables: ", paste(vars_to_extend, collapse = ", "))
  message("Total staged rows added: ", nrow(staged_pa))
} else {
  message("No staged PA updates found at: ", staged_pa_path)
}

# Also merge older PACFLEI updates if they exist separately
staged_pacflei_path <- file.path(project_root, "ceps/cep-003-physical-activity/pacflei_variable_details_update.csv")
if (file.exists(staged_pacflei_path) && !"PACFLEI" %in% vars_to_replace) {
  staged_pacflei <- read.csv(staged_pacflei_path, stringsAsFactors = FALSE)
  repo_variable_details <- repo_variable_details[repo_variable_details$variable != "PACFLEI", ]
  repo_variable_details <- rbind(repo_variable_details, staged_pacflei)
  message("Merged staged PACFLEI updates (", nrow(staged_pacflei), " rows)")
}

Phase 3: Validate PACDEE (Daily energy expenditure)

Test the harmonized PACDEE variable across cycles.

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {

  pacdee_validation <- lapply(names(cchs_data), function(cycle) {
    df <- cchs_data[[cycle]]

    # Ground truth from source
    pacdee_variants <- c("PACADEE", "PACCDEE", "PACEDEE", "PACDEE")
    found_var <- intersect(pacdee_variants, names(df))

    if (length(found_var) == 0) {
      return(data.frame(
        Cycle = cycle,
        Ground_Truth_Mean = NA,
        Harmonized_Mean = NA,
        Match = "-",
        Status = "No source"
      ))
    }

    gt_values <- df[[found_var[1]]]
    gt_valid <- gt_values[!is.na(gt_values) & gt_values >= 0 & gt_values <= 43.5]
    gt_mean <- mean(gt_valid, na.rm = TRUE)

    # rec_with_table() harmonization
    harmonized_mean <- NA
    status <- "Not tested"

    tryCatch({
      result <- cchsflow::rec_with_table(
        data = df,
        variables = "PACDEE",
        database_name = cycle,
        variable_details = repo_variable_details,
        append_to_data = FALSE,
        notes = FALSE
      )

      if ("PACDEE" %in% names(result)) {
        rwt_values <- result$PACDEE
        rwt_valid <- rwt_values[!is.na(rwt_values) & rwt_values >= 0 & rwt_values <= 43.5]
        harmonized_mean <- mean(rwt_valid, na.rm = TRUE)
        status <- "OK"
      } else {
        status <- "No PACDEE output"
      }
    }, error = function(e) {
      status <<- paste("Error:", substr(e$message, 1, 30))
    })

    # Compare
    match_result <- if (!is.na(harmonized_mean) && !is.na(gt_mean)) {
      if (abs(harmonized_mean - gt_mean) < 0.01) "Yes" else "No"
    } else {
      "-"
    }

    data.frame(
      Cycle = cycle,
      Ground_Truth_Mean = round(gt_mean, 2),
      Harmonized_Mean = round(harmonized_mean, 2),
      Match = match_result,
      Status = status
    )
  })

  pacdee_val_summary <- do.call(rbind, pacdee_validation)
  kable(pacdee_val_summary, row.names = FALSE)
}
PACDEE: Source vs harmonized (rec_with_table)
Cycle Ground_Truth_Mean Harmonized_Mean Match Status
cchs2001_p 2.02 2.02 Yes OK
cchs2003_p 2.22 2.22 Yes OK
cchs2005_p 2.20 2.20 Yes OK
cchs2007_2008_p 2.14 2.14 Yes OK
cchs2009_2010_p 2.21 2.21 Yes OK
cchs2011_2012_p 2.32 2.32 Yes OK
cchs2013_2014_p 2.32 2.32 Yes OK
cchs2015_2016_p NA NA - No source
cchs2017_2018_p NA NA - No source
cchs2019_2020_p NA NA - No source
cchs2022_p NA NA - No source

Phase 4: Validate PACFLEI (Leisure physical activity index)

Test the harmonized PACFLEI variable with fixed dummyVariable naming.

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {

  pacflei_validation <- lapply(names(cchs_data), function(cycle) {
    df <- cchs_data[[cycle]]

    # Ground truth from source - era-dependent naming
    pacflei_variants <- c("PACAFLEI", "PACCFLEI", "PACEFLEI", "PACFLEI")
    found_var <- intersect(pacflei_variants, names(df))

    if (length(found_var) == 0) {
      return(data.frame(
        Cycle = cycle,
        Ground_Truth_Mean = NA,
        Harmonized_Mean = NA,
        Match = "-",
        Status = "No source"
      ))
    }

    gt_values <- df[[found_var[1]]]
    # PACFLEI is categorical (1-2), filter valid range
    gt_valid <- gt_values[!is.na(gt_values) & gt_values >= 1 & gt_values <= 2]
    gt_mean <- mean(gt_valid, na.rm = TRUE)

    # rec_with_table() harmonization
    harmonized_mean <- NA
    status <- "Not tested"

    tryCatch({
      result <- cchsflow::rec_with_table(
        data = df,
        variables = "PACFLEI",
        database_name = cycle,
        variable_details = repo_variable_details,
        append_to_data = FALSE,
        notes = FALSE
      )

      if ("PACFLEI" %in% names(result)) {
        rwt_values <- result$PACFLEI
        # Handle factor output
        if (is.factor(rwt_values)) {
          rwt_values <- as.numeric(as.character(rwt_values))
        }
        rwt_valid <- rwt_values[!is.na(rwt_values) & rwt_values >= 1 & rwt_values <= 2]
        harmonized_mean <- mean(rwt_valid, na.rm = TRUE)
        status <- "OK"
      } else {
        status <- "No PACFLEI output"
      }
    }, error = function(e) {
      status <<- paste("Error:", substr(e$message, 1, 40))
    })

    # Compare
    match_result <- if (!is.na(harmonized_mean) && !is.na(gt_mean)) {
      if (abs(harmonized_mean - gt_mean) < 0.01) "Yes" else "No"
    } else {
      "-"
    }

    data.frame(
      Cycle = cycle,
      Ground_Truth_Mean = round(gt_mean, 3),
      Harmonized_Mean = round(harmonized_mean, 3),
      Match = match_result,
      Status = status
    )
  })

  pacflei_val_summary <- do.call(rbind, pacflei_validation)
  kable(pacflei_val_summary, row.names = FALSE)
}
PACFLEI: Source vs harmonized (rec_with_table)
Cycle Ground_Truth_Mean Harmonized_Mean Match Status
cchs2001_p 1.108 1.108 Yes OK
cchs2003_p 1.086 1.086 Yes OK
cchs2005_p 1.084 1.084 Yes OK
cchs2007_2008_p 1.087 1.087 Yes OK
cchs2009_2010_p 1.086 1.086 Yes OK
cchs2011_2012_p 1.076 1.076 Yes OK
cchs2013_2014_p 1.072 1.072 Yes OK
cchs2015_2016_p NA NA - No source
cchs2017_2018_p NA NA - No source
cchs2019_2020_p NA NA - No source
cchs2022_p NA NA - No source

Phase 5: Validate energy_exp (Daily energy expenditure)

Test the harmonized energy_exp derived variable across all cycles (2001-2020).

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {

  energy_exp_validation <- lapply(names(cchs_data), function(cycle) {
    df <- cchs_data[[cycle]]

    # For pre-2015, ground truth is PACDEE variants
    # For 2015+, it's calculated from age-specific inputs
    is_pre_2015 <- grepl("2001|2003|2005|2007|2009|2010|2011|2012|2013|2014", cycle)

    if (is_pre_2015) {
      # Ground truth from PACDEE variants
      pacdee_variants <- c("PACADEE", "PACCDEE", "PACEDEE", "PACDEE")
      found_var <- intersect(pacdee_variants, names(df))

      if (length(found_var) == 0) {
        return(data.frame(
          Cycle = cycle,
          Era = "Pre-2015",
          Ground_Truth_Mean = NA,
          Harmonized_Mean = NA,
          Match = "-",
          Status = "No source"
        ))
      }

      gt_values <- df[[found_var[1]]]
      gt_valid <- gt_values[!is.na(gt_values) & gt_values >= 0 & gt_values <= 43.5]
      gt_mean <- mean(gt_valid, na.rm = TRUE)
    } else {
      # For 2015+, we need to check if input variables exist
      # energy_exp is derived from age-specific activity variables
      input_vars <- c("PAA_045", "PAA_050", "PAA_075", "PAA_080",
                      "PAADVDYS", "PAADVVIG", "PAYDVTOA", "PAYDVADL",
                      "PAYDVVIG", "PAYDVDYS")
      has_inputs <- sum(input_vars %in% names(df)) > 5  # Most inputs present

      if (!has_inputs) {
        return(data.frame(
          Cycle = cycle,
          Era = "Post-2015",
          Ground_Truth_Mean = NA,
          Harmonized_Mean = NA,
          Match = "-",
          Status = "Insufficient inputs"
        ))
      }

      # No direct ground truth for 2015+ energy_exp (it's calculated)
      gt_mean <- NA
    }

    # rec_with_table() harmonization
    harmonized_mean <- NA
    status <- "Not tested"

    tryCatch({
      result <- cchsflow::rec_with_table(
        data = df,
        variables = "energy_exp",
        database_name = cycle,
        variable_details = repo_variable_details,
        append_to_data = FALSE,
        notes = FALSE
      )

      if ("energy_exp" %in% names(result)) {
        harm_values <- result$energy_exp
        harm_valid <- harm_values[!is.na(harm_values) & harm_values >= 0 & harm_values <= 100]
        if (length(harm_valid) > 0) {
          harmonized_mean <- mean(harm_valid, na.rm = TRUE)
        }
        status <- "OK"
      } else {
        status <- "No energy_exp output"
      }
    }, error = function(e) {
      status <<- paste("Error:", substr(e$message, 1, 40))
    })

    # Compare (for pre-2015 only)
    match_result <- if (is_pre_2015 && !is.na(harmonized_mean) && !is.na(gt_mean)) {
      if (abs(harmonized_mean - gt_mean) < 0.1) "Yes" else "No"
    } else if (!is_pre_2015 && !is.na(harmonized_mean)) {
      "Calculated"
    } else {
      "-"
    }

    data.frame(
      Cycle = cycle,
      Era = ifelse(is_pre_2015, "Pre-2015", "Post-2015"),
      Ground_Truth_Mean = round(gt_mean, 2),
      Harmonized_Mean = round(harmonized_mean, 2),
      Match = match_result,
      Status = status
    )
  })

  energy_exp_val_summary <- do.call(rbind, energy_exp_validation)
  kable(energy_exp_val_summary, row.names = FALSE)
}
energy_exp: Source vs harmonized (rec_with_table)
Cycle Era Ground_Truth_Mean Harmonized_Mean Match Status
cchs2001_p Pre-2015 2.02 2.02 Yes OK
cchs2003_p Pre-2015 2.22 2.22 Yes OK
cchs2005_p Pre-2015 2.20 2.20 Yes OK
cchs2007_2008_p Pre-2015 2.14 2.14 Yes OK
cchs2009_2010_p Pre-2015 2.21 2.21 Yes OK
cchs2011_2012_p Pre-2015 2.32 2.32 Yes OK
cchs2013_2014_p Pre-2015 2.32 2.32 Yes OK
cchs2015_2016_p Post-2015 NA NA - No energy_exp output
cchs2017_2018_p Post-2015 NA NA - No energy_exp output
cchs2019_2020_p Post-2015 NA NA - No energy_exp output
cchs2022_p Post-2015 NA NA - Insufficient inputs

Phase 6: Validate L0-L4 extensions (PAADVTRV, PAYDVTTR, PAADVWHO)

Test the newly extended/added variables from L0-L4 analysis.

Active transport variables (PAADVTRV, PAYDVTTR)

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {
  # Focus on 2015+ cycles where these variables exist
  cycles_2015plus <- grep("2015|2017|2019|2022", names(cchs_data), value = TRUE)

  active_transport_vars <- c("PAADVTRV", "PAYDVTTR")

  at_results <- list()

  for (cycle in cycles_2015plus) {
    df <- cchs_data[[cycle]]

    for (var in active_transport_vars) {
      has_source <- var %in% names(df)

      gt_mean <- NA
      harmonized_mean <- NA
      status <- "No source"

      if (has_source) {
        # Ground truth
        gt_values <- df[[var]]
        # Filter out missing codes (99996-99999)
        gt_valid <- gt_values[!is.na(gt_values) & gt_values < 99996]
        if (length(gt_valid) > 0) {
          gt_mean <- mean(gt_valid, na.rm = TRUE)
        }

        # rec_with_table() test
        tryCatch({
          result <- cchsflow::rec_with_table(
            data = df,
            variables = var,
            database_name = cycle,
            variable_details = repo_variable_details,
            append_to_data = FALSE,
            notes = FALSE
          )

          if (var %in% names(result)) {
            rwt_values <- result[[var]]
            rwt_valid <- rwt_values[!is.na(rwt_values)]
            if (length(rwt_valid) > 0) {
              harmonized_mean <- mean(rwt_valid, na.rm = TRUE)
            }
            status <- "OK"
          } else {
            status <- "No output"
          }
        }, error = function(e) {
          status <<- paste("Error:", substr(e$message, 1, 30))
        })
      }

      # Compare
      match_result <- if (!is.na(harmonized_mean) && !is.na(gt_mean)) {
        if (abs(harmonized_mean - gt_mean) < 1) "Yes" else "No"  # Allow 1 min tolerance
      } else {
        "-"
      }

      at_results[[length(at_results) + 1]] <- data.frame(
        Cycle = cycle,
        Variable = var,
        GT_Mean = round(gt_mean, 1),
        Harmonized_Mean = round(harmonized_mean, 1),
        Match = match_result,
        Status = status
      )
    }
  }

  at_summary <- do.call(rbind, at_results)
  kable(at_summary, row.names = FALSE)
}
Active transport variables: rec_with_table() validation
Cycle Variable GT_Mean Harmonized_Mean Match Status
cchs2015_2016_p PAADVTRV 100.4 100.4 Yes OK
cchs2015_2016_p PAYDVTTR 191.9 191.9 Yes OK
cchs2017_2018_p PAADVTRV 93.2 93.2 Yes OK
cchs2017_2018_p PAYDVTTR 189.4 189.4 Yes OK
cchs2019_2020_p PAADVTRV 98.0 98.0 Yes OK
cchs2019_2020_p PAYDVTTR 164.9 164.9 Yes OK
cchs2022_p PAADVTRV NA NA - No source
cchs2022_p PAYDVTTR NA NA - No source

PAADVWHO (WHO physical activity classification)

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {
  cycles_2015plus <- grep("2015|2017|2019|2022", names(cchs_data), value = TRUE)

  who_results <- lapply(cycles_2015plus, function(cycle) {
    df <- cchs_data[[cycle]]

    has_source <- "PAADVWHO" %in% names(df)

    gt_mean <- NA
    harmonized_mean <- NA
    status <- "No source"
    n_valid <- NA

    if (has_source) {
      gt_values <- df[["PAADVWHO"]]
      # Valid categories are 1-4
      gt_valid <- gt_values[!is.na(gt_values) & gt_values >= 1 & gt_values <= 4]
      n_valid <- length(gt_valid)
      if (n_valid > 0) {
        gt_mean <- mean(gt_valid, na.rm = TRUE)
      }

      tryCatch({
        result <- cchsflow::rec_with_table(
          data = df,
          variables = "PAADVWHO",
          database_name = cycle,
          variable_details = repo_variable_details,
          append_to_data = FALSE,
          notes = FALSE
        )

        if ("PAADVWHO" %in% names(result)) {
          rwt_values <- result[["PAADVWHO"]]
          if (is.factor(rwt_values)) {
            rwt_values <- as.numeric(as.character(rwt_values))
          }
          rwt_valid <- rwt_values[!is.na(rwt_values) & rwt_values >= 1 & rwt_values <= 4]
          if (length(rwt_valid) > 0) {
            harmonized_mean <- mean(rwt_valid, na.rm = TRUE)
          }
          status <- "OK"
        } else {
          status <- "No output"
        }
      }, error = function(e) {
        status <<- paste("Error:", substr(e$message, 1, 30))
      })
    }

    match_result <- if (!is.na(harmonized_mean) && !is.na(gt_mean)) {
      if (abs(harmonized_mean - gt_mean) < 0.01) "Yes" else "No"
    } else {
      "-"
    }

    data.frame(
      Cycle = cycle,
      N_Valid = n_valid,
      GT_Mean = round(gt_mean, 2),
      Harmonized_Mean = round(harmonized_mean, 2),
      Match = match_result,
      Status = status
    )
  })

  who_summary <- do.call(rbind, who_results)
  kable(who_summary, row.names = FALSE)
}
PAADVWHO: Source vs harmonized (rec_with_table)
Cycle N_Valid GT_Mean Harmonized_Mean Match Status
cchs2015_2016_p 97646 2.26 2.26 Yes OK
cchs2017_2018_p 101668 2.32 2.32 Yes OK
cchs2019_2020_p 20210 2.42 2.42 Yes OK
cchs2022_p 1792 2.09 2.09 Yes OK

Phase 7: Validate 2015+ variables

Code
if (length(cchs_data) > 0 && exists("repo_variable_details")) {
  cycles_2015plus <- grep("2015|2017|2019|2022", names(cchs_data), value = TRUE)

  # All PR #157 + L0-L4 variables for 2015+
  vars_2015 <- c("PAA_045", "PAA_050", "PAA_075", "PAA_080",
                 "PAADVDYS", "PAADVVIG", "PAYDVTOA", "PAYDVADL",
                 "PAYDVVIG", "PAYDVDYS")

  all_2015_results <- list()

  for (cycle in cycles_2015plus) {
    df <- cchs_data[[cycle]]

    for (var in vars_2015) {
      # Check if source variable exists in data
      has_source <- var %in% names(df)

      gt_mean <- NA
      harmonized_mean <- NA
      status <- "No source"

      if (has_source) {
        # Ground truth
        gt_values <- df[[var]]
        gt_valid <- gt_values[!is.na(gt_values)]
        if (length(gt_valid) > 0) {
          gt_mean <- mean(gt_valid, na.rm = TRUE)
        }

        # rec_with_table() test
        tryCatch({
          result <- cchsflow::rec_with_table(
            data = df,
            variables = var,
            database_name = cycle,
            variable_details = repo_variable_details,
            append_to_data = FALSE,
            notes = FALSE
          )

          if (var %in% names(result)) {
            rwt_values <- result[[var]]
            rwt_valid <- rwt_values[!is.na(rwt_values)]
            if (length(rwt_valid) > 0) {
              harmonized_mean <- mean(rwt_valid, na.rm = TRUE)
            }
            status <- "OK"
          } else {
            status <- "No output"
          }
        }, error = function(e) {
          status <<- paste("Error:", substr(e$message, 1, 30))
        })
      }

      # Compare
      match_result <- if (!is.na(harmonized_mean) && !is.na(gt_mean)) {
        if (abs(harmonized_mean - gt_mean) < 0.01) "Yes" else "No"
      } else {
        "-"
      }

      all_2015_results[[length(all_2015_results) + 1]] <- data.frame(
        Cycle = cycle,
        Variable = var,
        GT_Mean = round(gt_mean, 2),
        Harmonized_Mean = round(harmonized_mean, 2),
        Match = match_result,
        Status = status
      )
    }
  }

  all_2015_summary <- do.call(rbind, all_2015_results)
  kable(all_2015_summary, row.names = FALSE)
}
All 2015+ variables: rec_with_table() validation
Cycle Variable GT_Mean Harmonized_Mean Match Status
cchs2015_2016_p PAA_045 1.52 1.52 Yes OK
cchs2015_2016_p PAA_050 10.11 10.11 Yes OK
cchs2015_2016_p PAA_075 2.85 2.85 Yes OK
cchs2015_2016_p PAA_080 7.18 7.18 Yes OK
cchs2015_2016_p PAADVDYS 3.62 3.62 Yes OK
cchs2015_2016_p PAADVVIG 42.02 42.02 Yes OK
cchs2015_2016_p PAYDVTOA 61.70 61.70 Yes OK
cchs2015_2016_p PAYDVADL 254.37 254.37 Yes OK
cchs2015_2016_p PAYDVVIG 100.91 99.97 No OK
cchs2015_2016_p PAYDVDYS 5.66 5.66 Yes OK
cchs2017_2018_p PAA_045 1.45 1.45 Yes OK
cchs2017_2018_p PAA_050 10.19 10.07 No OK
cchs2017_2018_p PAA_075 2.63 2.63 Yes OK
cchs2017_2018_p PAA_080 6.40 6.22 No OK
cchs2017_2018_p PAADVDYS 3.50 3.50 Yes OK
cchs2017_2018_p PAADVVIG 38.25 38.19 No OK
cchs2017_2018_p PAYDVTOA 55.11 55.11 Yes OK
cchs2017_2018_p PAYDVADL 239.06 239.06 Yes OK
cchs2017_2018_p PAYDVVIG 92.07 92.07 Yes OK
cchs2017_2018_p PAYDVDYS 5.48 5.48 Yes OK
cchs2019_2020_p PAA_045 1.32 1.31 Yes OK
cchs2019_2020_p PAA_050 10.99 10.99 Yes OK
cchs2019_2020_p PAA_075 2.29 2.29 Yes OK
cchs2019_2020_p PAA_080 7.25 7.25 Yes OK
cchs2019_2020_p PAADVDYS 3.39 3.39 Yes OK
cchs2019_2020_p PAADVVIG 28.86 28.86 Yes OK
cchs2019_2020_p PAYDVTOA 61.51 57.28 No OK
cchs2019_2020_p PAYDVADL 210.92 210.92 Yes OK
cchs2019_2020_p PAYDVVIG 81.87 81.87 Yes OK
cchs2019_2020_p PAYDVDYS 5.30 5.30 Yes OK
cchs2022_p PAA_045 NA NA - No source
cchs2022_p PAA_050 NA NA - No source
cchs2022_p PAA_075 NA NA - No source
cchs2022_p PAA_080 NA NA - No source
cchs2022_p PAADVDYS NA NA - No source
cchs2022_p PAADVVIG 52.92 52.92 Yes OK
cchs2022_p PAYDVTOA NA NA - No source
cchs2022_p PAYDVADL NA NA - No source
cchs2022_p PAYDVVIG NA NA - No source
cchs2022_p PAYDVDYS NA NA - No source

Summary by variable

Code
if (exists("all_2015_summary")) {
  var_summary <- all_2015_summary %>%
    group_by(Variable) %>%
    summarise(
      Cycles_Tested = sum(Status == "OK"),
      Cycles_Matched = sum(Match == "Yes", na.rm = TRUE),
      Cycles_Failed = sum(Status != "OK" & Status != "No source"),
      .groups = "drop"
    )

  kable(var_summary, row.names = FALSE)
}
2015+ variable validation summary
Variable Cycles_Tested Cycles_Matched Cycles_Failed
PAADVDYS 3 3 0
PAADVVIG 4 3 0
PAA_045 3 3 0
PAA_050 3 2 0
PAA_075 3 3 0
PAA_080 3 2 0
PAYDVADL 3 3 0
PAYDVDYS 3 3 0
PAYDVTOA 3 2 0
PAYDVVIG 3 2 0

Phase 8: Test summary

Code
message("\n", strrep("=", 60))
message("PHYSICAL ACTIVITY INTEGRATION TEST SUMMARY")
message(strrep("=", 60), "\n")

# Count results
n_cycles <- length(cchs_data)

message("CCHS PUMF cycles loaded: ", n_cycles)

# PACDEE summary
if (exists("pacdee_val_summary")) {
  n_pacdee_ok <- sum(pacdee_val_summary$Status == "OK")
  n_pacdee_match <- sum(pacdee_val_summary$Match == "Yes", na.rm = TRUE)
  message("\nPACDEE (pre-2015 energy expenditure):")
  message("  Cycles tested: ", n_pacdee_ok)
  message("  Cycles matched: ", n_pacdee_match)
}

# PACFLEI summary
if (exists("pacflei_val_summary")) {
  n_pacflei_ok <- sum(pacflei_val_summary$Status == "OK")
  n_pacflei_match <- sum(pacflei_val_summary$Match == "Yes", na.rm = TRUE)
  message("\nPACFLEI (pre-2015 leisure index):")
  message("  Cycles tested: ", n_pacflei_ok)
  message("  Cycles matched: ", n_pacflei_match)
}

# energy_exp summary
if (exists("energy_exp_val_summary")) {
  n_ee_ok <- sum(energy_exp_val_summary$Status == "OK")
  n_ee_pre2015_match <- sum(energy_exp_val_summary$Era == "Pre-2015" &
                             energy_exp_val_summary$Match == "Yes", na.rm = TRUE)
  n_ee_post2015_calc <- sum(energy_exp_val_summary$Era == "Post-2015" &
                             energy_exp_val_summary$Match == "Calculated", na.rm = TRUE)
  message("\nenergy_exp (derived energy expenditure):")
  message("  Total cycles tested: ", n_ee_ok)
  message("  Pre-2015 cycles matched (vs PACDEE): ", n_ee_pre2015_match)
  message("  Post-2015 cycles calculated: ", n_ee_post2015_calc)
}

# Active transport summary
if (exists("at_summary")) {
  n_at_ok <- sum(at_summary$Status == "OK")
  n_at_match <- sum(at_summary$Match == "Yes", na.rm = TRUE)
  message("\nActive transport (PAADVTRV, PAYDVTTR):")
  message("  Variable-cycle combinations tested: ", n_at_ok)
  message("  Variable-cycle combinations matched: ", n_at_match)
}

# PAADVWHO summary
if (exists("who_summary")) {
  n_who_ok <- sum(who_summary$Status == "OK")
  n_who_match <- sum(who_summary$Match == "Yes", na.rm = TRUE)
  message("\nPAADVWHO (WHO classification):")
  message("  Cycles tested: ", n_who_ok)
  message("  Cycles matched: ", n_who_match)
}

# 2015+ summary
if (exists("all_2015_summary")) {
  n_2015_ok <- sum(all_2015_summary$Status == "OK")
  n_2015_match <- sum(all_2015_summary$Match == "Yes", na.rm = TRUE)
  n_2015_error <- sum(grepl("Error", all_2015_summary$Status))
  message("\n2015+ variables (PAA_*, PAADVVIG, etc.):")
  message("  Variable-cycle combinations tested: ", n_2015_ok)
  message("  Variable-cycle combinations matched: ", n_2015_match)
  if (n_2015_error > 0) {
    message("  Errors: ", n_2015_error)
  }
}

message("\n", strrep("=", 60))

L0-L4 analysis findings

Based on this validation:

Extensions validated

Variable 2019-2020 Extension Status
PAADVTRV cchs2019_2020_p Tested
PAYDVTTR cchs2019_2020_p Tested
active_transport Uses PAADVTRV/PAYDVTTR Tested
energy_exp cchs2019_2020_p (all inputs already support) Tested

New variable validated

Variable Cycles Status
PAADVWHO 2015-2022 Tested

Fixes validated

Variable Fix Status
PACFLEI dummyVariable cat_cat6cat2 Tested
PAC_4B Label “walking” → “biking” Metadata only

Recommendations