## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set(collapse = TRUE, comment = "#>", echo = FALSE) library(knitr) # Fits at N = 100,000 take several seconds each, so this vignette CANNOT compute # live. Results are precomputed by data-raw/precompute_largeN.R, which writes # vignettes/largeN_results.rds. We load that if present, else fall back to the # cached numbers below (so the vignette always renders). load_largeN <- function() { for (p in c("largeN_results.rds", file.path("..", "vignettes", "largeN_results.rds"))) { if (file.exists(p)) { r <- tryCatch(readRDS(p), error = function(e) NULL) if (!is.null(r)) return(r) } } NULL } res <- load_largeN() if (is.null(res)) { lam <- seq(0, 1, by = 0.1) res <- list( n = 500, N = 100000, n_reps = 16, opt_lambda = 0.275, grid_argmin = 0.10, per_rep_opt = c(0.275, 0.226, 0.052, 0.005, 0.190, 0.086, 0.175, 0.186, 0.139, 0.173, 0.211, 0.170, 0.027, 0.119, 0.147, 0.131), naive_bias = c(a = -0.119, d = 0.248), human_bias = c(a = 0.000, d = 0.001), curve = data.frame( lambda = lam, bias_a = c(0.0002, 0.0008, 0.0019, 0.0034, 0.0053, 0.0077, 0.0105, 0.0139, 0.0178, 0.0222, 0.0273), bias_a_se = c(0.0190, 0.0187, 0.0185, 0.0186, 0.0187, 0.0190, 0.0195, 0.0200, 0.0207, 0.0215, 0.0225), bias_d = c(0.0011, 0.0008, 0.0006, 0.0004, 0.0003, 0.0003, 0.0003, 0.0005, 0.0007, 0.0010, 0.0014), bias_d_se = c(0.0130, 0.0120, 0.0111, 0.0103, 0.0098, 0.0095, 0.0094, 0.0096, 0.0101, 0.0108, 0.0116), risk = c(0.0389, 0.0381, 0.0382, 0.0391, 0.0409, 0.0435, 0.0470, 0.0514, 0.0568, 0.0631, 0.0705), risk_se = c(0.0014, 0.0013, 0.0013, 0.0014, 0.0015, 0.0016, 0.0018, 0.0021, 0.0025, 0.0029, 0.0034) ) ) } ## ----note-precompute, echo = FALSE, results = "asis"-------------------------- cat(sprintf( "_All numbers are precomputed (`data-raw/precompute_largeN.R`): n = %d, N = %d, %d Monte Carlo replications._", res$n, res$N, res$n_reps)) ## ----naive-line, echo = FALSE, results = "asis"------------------------------- cat(sprintf( "Averaged over the replications, the naive estimator's item-parameter bias is **%+.3f** in the slopes and **%+.3f** in the intercepts — essentially the LLM's shift (−0.1·a, +0.25). Because `N = 100000`, that wrong answer is estimated *very precisely* (a tiny standard error); more LLM data only sharpens the bias.", res$naive_bias[["a"]], res$naive_bias[["d"]])) ## ----bias-curve, echo = FALSE, fig.width = 7, fig.height = 3.3, fig.alt = "Item-parameter bias of the mixed-subjects estimator is flat near zero across all lambda, far from the naive pooled bias shown as dashed reference lines."---- if (requireNamespace("ggplot2", quietly = TRUE)) { library(ggplot2) cv <- res$curve bd <- rbind( data.frame(lambda = cv$lambda, bias = cv$bias_a, se = cv$bias_a_se, parameter = "discrimination (a)"), data.frame(lambda = cv$lambda, bias = cv$bias_d, se = cv$bias_d_se, parameter = "difficulty (d)") ) refs <- data.frame( parameter = c("discrimination (a)", "difficulty (d)"), naive = c(res$naive_bias[["a"]], res$naive_bias[["d"]]) ) ggplot(bd, aes(lambda, bias)) + geom_hline(yintercept = 0, colour = "grey70") + geom_hline(data = refs, aes(yintercept = naive), linetype = 2, colour = "firebrick") + geom_ribbon(aes(ymin = bias - 2 * se, ymax = bias + 2 * se), alpha = 0.15) + geom_line() + geom_point(size = 1.6) + facet_wrap(~ parameter) + labs(x = expression(lambda), y = "item-parameter bias", title = "Bias is flat at ~0 for every λ (dashed red = naive pooled bias)") + theme_minimal() } ## ----risk-curve, echo = FALSE, fig.width = 5.4, fig.height = 3.3, fig.alt = "Model-based ability-score risk as a function of lambda, with a shallow minimum near the optimized lambda."---- if (requireNamespace("ggplot2", quietly = TRUE)) { cv <- res$curve vline <- cv$lambda[which.min(cv$risk)] # minimum of the (averaged) risk curve rug <- data.frame(lambda = res$per_rep_opt) ggplot(cv, aes(lambda, risk)) + geom_ribbon(aes(ymin = risk - 2 * risk_se, ymax = risk + 2 * risk_se), alpha = 0.15) + geom_line() + geom_point(size = 1.6) + geom_vline(xintercept = vline, linetype = 3, colour = "steelblue") + geom_rug(data = rug, aes(x = lambda), inherit.aes = FALSE, sides = "b", colour = "firebrick", alpha = 0.6) + annotate("text", x = vline, y = max(cv$risk), hjust = -0.1, label = sprintf("avg-curve min ~ λ = %.1f", vline), colour = "steelblue", size = 3.2) + labs(x = expression(lambda), y = "model-based ability-score risk", title = "Efficiency curve (shallow); red ticks = per-dataset optima") + theme_minimal() } ## ----efficiency-line, echo = FALSE, results = "asis"-------------------------- cv <- res$curve argmin <- cv$lambda[which.min(cv$risk)] gain <- (cv$risk[cv$lambda == 0] - min(cv$risk)) / cv$risk[cv$lambda == 0] cat(sprintf( paste0("For this weakly-informative LLM the averaged risk curve is shallow ", "and rises for larger λ: leaning on a poorly-correlated predictor *adds* ", "measurement error to latent ability. Its minimum sits near λ ≈ %.1f, only", "about %.0f%% below the λ = 0 (human-only) risk — almost no efficiency to be", "gained. Because the curve is so flat, each individual dataset's optimum ", "scatters around this value (the red ticks); see the next section. Every point", "on the curve is unbiased."), argmin, 100 * gain)) ## ----tune-line, echo = FALSE, results = "asis"-------------------------------- pr <- res$per_rep_opt cat(sprintf( paste0("The optimizer returns the minimizer of this dataset's risk surface. ", "Here, λ = %.2f. Every dataset has its own (noisy) risk surface, so its ", "optimal λ varies. Across the %d replications the per-dataset optimum ", "averaged %.2f and ranged [%.1f, %.1f], scattering around the minimum of the ", "*averaged* curve (≈ %.1f). (These are not the same point — the minimum of the ", "average risk is not the average of the per-dataset minima.) The scatter is wide ", "here because the surface is shallow; informative predictions sharpens it."), res$opt_lambda, res$n_reps, mean(pr), min(pr), max(pr), res$grid_argmin))