## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, comment = "#>", echo = FALSE) library(knitr) # Load a scenario's $summary from simulations/results/.rds if present # (so the tables reflect the most recent run); otherwise fall back to the cached # values below. The simulations/ tree is not shipped with the package, so on a # clean install the cached values are shown. load_summary <- function(name) { candidates <- c( file.path("..", "simulations", "results", paste0(name, ".rds")), file.path("simulations", "results", paste0(name, ".rds")) ) for (p in candidates) { if (file.exists(p)) { s <- tryCatch(readRDS(p)$summary, error = function(e) NULL) if (!is.null(s)) return(s) } } NULL } show_cols <- function(df, cols, ...) { cols <- intersect(cols, names(df)) kable(df[, cols, drop = FALSE], row.names = FALSE, ...) } ## ----design-table------------------------------------------------------------- design <- data.frame( Regime = c("R1: perfect prediction (F=Y)", "R2: same-DGP draw", "R3: independent noise", "R4: LLM shift"), `Predicted F` = c( "predicted = observed (exact)", "fresh binary draw from the true model", "binary draw from scrambled item parameters", "binary draw from attenuated/shifted parameters"), `Role` = c("perfect predictor", "modest real signal", "uninformative LLM", "biased but informative LLM"), check.names = FALSE ) kable(design, caption = "Four predictor regimes (all binar responses).") ## ----lambda-table------------------------------------------------------------- lam <- load_summary("lambda_selection") if (is.null(lam)) { lam <- data.frame( label = c("R1: perfect (F=Y)", "R2: same-DGP draw", "R3: independent noise", "R4: LLM shift"), mean_risk = c(0.750, 0.119, 0.063, 0.105), median_risk = c(0.750, 0.108, 0.040, 0.104), prop_zero = c(0.00, 0.07, 0.35, 0.16), mean_ppi = c(0.750, 0.004, 0.000, 0.002) ) } show_cols(lam, c("label", "mean_risk", "median_risk", "prop_zero", "mean_ppi"), caption = "Selected lambda by regime (ability-risk tuning).") ## ----coverage-table----------------------------------------------------------- cov <- load_summary("coverage") if (is.null(cov)) { cov <- data.frame( label = c("R1 perfect (F=Y)", "R2 same-DGP draw", "R3 independent noise", "R4 LLM shift"), louis_cov_90 = c(0.909, 0.916, 0.914, 0.905), louis_cov_95 = c(0.955, 0.957, 0.961, 0.954), em_cov_90 = c(0.713, 0.727, 0.721, 0.726), em_cov_95 = c(0.787, 0.797, 0.795, 0.792), mean_se_ratio = c(1.626, 1.616, 1.651, 1.622) ) } show_cols(cov, c("label", "louis_cov_90", "louis_cov_95", "em_cov_90", "em_cov_95", "mean_se_ratio"), caption = paste("Item-parameter CI coverage, all four regimes", "(200 reps). Nominal targets 0.90 and 0.95.")) ## ----downstream-table--------------------------------------------------------- dwn <- load_summary("downstream") if (is.null(dwn)) { dwn <- data.frame( label = c("R1: perfect (F=Y)", "R2: same-DGP draw", "R3: independent noise", "R4: LLM shift"), mean_lambda = c(0.750, 0.119, 0.063, 0.105), rmse_human = c(1.4961, 1.5023, 1.5037, 1.5064), rmse_tuned = c(1.4923, 1.5005, 1.5030, 1.5046), mean_delta = c(-0.0038, -0.0019, -0.0006, -0.0018), delta_lo = c(-0.0073, -0.0027, -0.0009, -0.0030), delta_hi = c(-0.0003, -0.0011, -0.0003, -0.0007), prop_improve = c(0.48, 0.59, 0.50, 0.50), bias_a = c(0.010, 0.025, 0.035, 0.008) ) } show_cols(dwn, c("label", "mean_lambda", "rmse_human", "rmse_tuned", "mean_delta", "delta_lo", "delta_hi", "prop_improve", "bias_a"), caption = "Downstream ability-score RMSE.") ## ----crossfit-table----------------------------------------------------------- cf <- load_summary("crossfit") if (is.null(cf)) { cf <- data.frame( label = c("R1 perfect (F=Y)", "R2 same-DGP draw", "R4 LLM shift"), lambda_nocf = c(0.750, 0.119, 0.100), lambda_cf = c(0.858, 0.135, 0.115), bias_a_nocf = c(0.0104, 0.0250, 0.0374), bias_a_cf = c(0.0100, 0.0258, 0.0372), rmse_nocf = c(1.4923, 1.5005, 1.5027), rmse_cf = c(1.4927, 1.5005, 1.5029), cover_nocf = c(0.954, 0.954, 0.953), cover_cf = c(0.949, 0.956, 0.956) ) } show_cols(cf, c("label", "lambda_nocf", "lambda_cf", "bias_a_nocf", "bias_a_cf", "rmse_nocf", "rmse_cf", "cover_nocf", "cover_cf"), caption = "Cross-fitted vs non-cross-fitted tuning (100 reps, R1/R2/R4).") ## ----coverage-tuned-table----------------------------------------------------- ct <- load_summary("coverage_tuned") if (is.null(ct)) { ct <- data.frame( label = c("R1 perfect (F=Y)", "R2 same-DGP draw", "R3 independent noise", "R4 LLM shift"), lambda_sd = c(0.751, 0.116, 0.053, 0.103), lambda_xf = c(0.859, 0.135, 0.074, 0.130), fixed_95 = c(0.955, 0.957, 0.961, 0.954), samedata_95 = c(0.955, 0.956, 0.958, 0.958), crossfit_95 = c(0.953, 0.956, 0.958, 0.958), bias_a_fixed = c(0.0097, 0.0280, 0.0365, 0.0207), bias_a_samedata = c(0.0063, 0.0224, 0.0286, 0.0159), bias_a_crossfit = c(0.0051, 0.0222, 0.0289, 0.0166) ) } show_cols(ct, c("label", "lambda_sd", "lambda_xf", "fixed_95", "samedata_95", "crossfit_95"), caption = paste("95% coverage rates of the true item parameters at the", "fixed, same-data-tuned, and cross-fit-tuned λ (200 reps).", "lambda_sd / lambda_xf are the mean selected λ for each tuner.")) ## ----coverage-tuned-bias------------------------------------------------------ show_cols(ct, c("label", "bias_a_fixed", "bias_a_samedata", "bias_a_crossfit"), caption = paste("Mean discrimination bias E[a-hat - a] at each operating", "point. Same-data vs cross-fit differ by <= 0.001 except R1."))