## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", echo = FALSE)
library(knitr)

# Fits at N = 100,000 take several seconds each, so this vignette CANNOT compute
# live. Results are precomputed by data-raw/precompute_largeN.R, which writes
# vignettes/largeN_results.rds. We load that if present, else fall back to the
# cached numbers below (so the vignette always renders).
load_largeN <- function() {
  for (p in c("largeN_results.rds",
              file.path("..", "vignettes", "largeN_results.rds"))) {
    if (file.exists(p)) {
      r <- tryCatch(readRDS(p), error = function(e) NULL)
      if (!is.null(r)) return(r)
    }
  }
  NULL
}

res <- load_largeN()
if (is.null(res)) {
  lam <- seq(0, 1, by = 0.1)
  res <- list(
    n = 500, N = 100000, n_reps = 16,
    opt_lambda = 0.275, grid_argmin = 0.10,
    per_rep_opt = c(0.275, 0.226, 0.052, 0.005, 0.190, 0.086, 0.175, 0.186,
                    0.139, 0.173, 0.211, 0.170, 0.027, 0.119, 0.147, 0.131),
    naive_bias = c(a = -0.119, d = 0.248),
    human_bias = c(a = 0.000, d = 0.001),
    curve = data.frame(
      lambda    = lam,
      bias_a    = c(0.0002, 0.0008, 0.0019, 0.0034, 0.0053, 0.0077, 0.0105, 0.0139, 0.0178, 0.0222, 0.0273),
      bias_a_se = c(0.0190, 0.0187, 0.0185, 0.0186, 0.0187, 0.0190, 0.0195, 0.0200, 0.0207, 0.0215, 0.0225),
      bias_d    = c(0.0011, 0.0008, 0.0006, 0.0004, 0.0003, 0.0003, 0.0003, 0.0005, 0.0007, 0.0010, 0.0014),
      bias_d_se = c(0.0130, 0.0120, 0.0111, 0.0103, 0.0098, 0.0095, 0.0094, 0.0096, 0.0101, 0.0108, 0.0116),
      risk      = c(0.0389, 0.0381, 0.0382, 0.0391, 0.0409, 0.0435, 0.0470, 0.0514, 0.0568, 0.0631, 0.0705),
      risk_se   = c(0.0014, 0.0013, 0.0013, 0.0014, 0.0015, 0.0016, 0.0018, 0.0021, 0.0025, 0.0029, 0.0034)
    )
  )
}

## ----note-precompute, echo = FALSE, results = "asis"--------------------------
cat(sprintf(
  "_All numbers are precomputed (`data-raw/precompute_largeN.R`): n = %d, N = %d, %d Monte Carlo replications._",
  res$n, res$N, res$n_reps))

## ----naive-line, echo = FALSE, results = "asis"-------------------------------
cat(sprintf(
  "Averaged over the replications, the naive estimator's item-parameter bias is **%+.3f** in the slopes and **%+.3f** in the intercepts — essentially the LLM's shift (−0.1·a, +0.25). Because `N = 100000`, that wrong answer is estimated *very precisely* (a tiny standard error); more LLM data only sharpens the bias.",
  res$naive_bias[["a"]], res$naive_bias[["d"]]))

## ----bias-curve, echo = FALSE, fig.width = 7, fig.height = 3.3, fig.alt = "Item-parameter bias of the mixed-subjects estimator is flat near zero across all lambda, far from the naive pooled bias shown as dashed reference lines."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  library(ggplot2)
  cv <- res$curve
  bd <- rbind(
    data.frame(lambda = cv$lambda, bias = cv$bias_a, se = cv$bias_a_se,
               parameter = "discrimination (a)"),
    data.frame(lambda = cv$lambda, bias = cv$bias_d, se = cv$bias_d_se,
               parameter = "difficulty (d)")
  )
  refs <- data.frame(
    parameter = c("discrimination (a)", "difficulty (d)"),
    naive = c(res$naive_bias[["a"]], res$naive_bias[["d"]])
  )
  ggplot(bd, aes(lambda, bias)) +
    geom_hline(yintercept = 0, colour = "grey70") +
    geom_hline(data = refs, aes(yintercept = naive), linetype = 2, colour = "firebrick") +
    geom_ribbon(aes(ymin = bias - 2 * se, ymax = bias + 2 * se), alpha = 0.15) +
    geom_line() + geom_point(size = 1.6) +
    facet_wrap(~ parameter) +
    labs(x = expression(lambda), y = "item-parameter bias",
         title = "Bias is flat at ~0 for every λ (dashed red = naive pooled bias)") +
    theme_minimal()
}

## ----risk-curve, echo = FALSE, fig.width = 5.4, fig.height = 3.3, fig.alt = "Model-based ability-score risk as a function of lambda, with a shallow minimum near the optimized lambda."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  cv <- res$curve
  vline <- cv$lambda[which.min(cv$risk)]   # minimum of the (averaged) risk curve
  rug <- data.frame(lambda = res$per_rep_opt)
  ggplot(cv, aes(lambda, risk)) +
    geom_ribbon(aes(ymin = risk - 2 * risk_se, ymax = risk + 2 * risk_se), alpha = 0.15) +
    geom_line() + geom_point(size = 1.6) +
    geom_vline(xintercept = vline, linetype = 3, colour = "steelblue") +
    geom_rug(data = rug, aes(x = lambda), inherit.aes = FALSE, sides = "b",
             colour = "firebrick", alpha = 0.6) +
    annotate("text", x = vline, y = max(cv$risk), hjust = -0.1,
             label = sprintf("avg-curve min ~ λ = %.1f", vline),
             colour = "steelblue", size = 3.2) +
    labs(x = expression(lambda), y = "model-based ability-score risk",
         title = "Efficiency curve (shallow); red ticks = per-dataset optima") +
    theme_minimal()
}

## ----efficiency-line, echo = FALSE, results = "asis"--------------------------
cv <- res$curve
argmin <- cv$lambda[which.min(cv$risk)]
gain   <- (cv$risk[cv$lambda == 0] - min(cv$risk)) / cv$risk[cv$lambda == 0]
cat(sprintf(
  paste0("For this weakly-informative LLM the averaged risk curve is shallow ",
  "and rises for larger λ: leaning on a poorly-correlated predictor *adds* ",
  "measurement error to latent ability. Its minimum sits near λ ≈ %.1f, only",
  "about %.0f%% below the λ = 0 (human-only) risk — almost no efficiency to be",
  "gained. Because the curve is so flat, each individual dataset's optimum ",
  "scatters around this value (the red ticks); see the next section. Every point",
  "on the curve is unbiased."), argmin, 100 * gain))

## ----tune-line, echo = FALSE, results = "asis"--------------------------------
pr <- res$per_rep_opt
cat(sprintf(
  paste0("The optimizer returns the minimizer of this dataset's risk surface. ",
  "Here, λ = %.2f. Every dataset has its own (noisy) risk surface, so its  ",
  "optimal λ varies. Across the %d replications the per-dataset optimum ",
  "averaged %.2f and ranged [%.1f, %.1f], scattering around the minimum of the ",
  "*averaged* curve (≈ %.1f). (These are not the same point — the minimum of the ",
  "average risk is not the average of the per-dataset minima.) The scatter is wide ",
  "here because the surface is shallow; informative predictions sharpens it."),
  res$opt_lambda, res$n_reps, mean(pr), min(pr), max(pr), res$grid_argmin))