## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4 ) ## ----echo = FALSE, results = "asis"------------------------------------------- cat( "$$L_o^{\\mathrm{marg}}(\\gamma) + \\lambda\\bigl[", "L_g^{\\mathrm{marg}}(\\gamma) - L_p^{\\mathrm{marg}}(\\gamma)\\bigr]$$" ) ## ----simulate----------------------------------------------------------------- library(mixedsubjectsirt) library(ggplot2) set.seed(242424) n_human <- 400 n_generated <- 1200 n_items <- 8 true_pars <- data.frame( item = paste0("Item", seq_len(n_items)), a = seq(0.8, 1.6, length.out = n_items), d = seq(-1.1, 1.1, length.out = n_items) ) true_pars$b <- -true_pars$d / true_pars$a theta_human <- rnorm(n_human) observed <- simulate_2pl(theta_human, true_pars) # Strongly informative predictions. On the n labeled subjects, predictions # match human responses (the F = Y benchmark from the simulation study), and # N additional unlabeled responses are drawn from the same 2PL. This is # the "good predictor" regime, where the method should lean on unlabeled data. # (Further down we show the complementary case of an uninformative predictor, # where the criterion instead drives lambda to 0.) predicted <- observed generated <- simulate_2pl(rnorm(n_generated), true_pars) ## ----human-baseline----------------------------------------------------------- human_start <- fit_2pl(observed, technical = list(NCYCLES = 500)) ## ----mml-fit------------------------------------------------------------------ mixed_mml <- fit_mixed_subjects_mml( observed = observed, predicted = predicted, generated = generated, lambda = 0.5, initial_pars = human_start$pars ) mixed_mml ## ----mml-pars----------------------------------------------------------------- mixed_mml$item_pars ## ----tune-lambda-------------------------------------------------------------- ability_tuned <- tune_lambda_ability_risk( observed = observed, predicted = predicted, generated = generated, target_resp = observed, initial_pars = human_start$pars, fit_fn = fit_mixed_subjects_mml, control = list(maxit = 200) ) ability_tuned$best_lambda ## ----crossfit-tune------------------------------------------------------------ cf_tuned <- tune_lambda_ability_risk_crossfit( observed = observed, predicted = predicted, generated = generated, initial_pars = human_start$pars, fit_fn = fit_mixed_subjects_mml, final_fit_fn = fit_mixed_subjects_mml, n_splits = 2, # the standard PPI sample split (also the default) control = list(maxit = 200) ) cf_tuned$lambda_by_split # one tuned lambda per held-out fold cf_tuned$lambda_final # fold-size-weighted scalar used for the final fit ## ----covariance--------------------------------------------------------------- Sigma <- vcov(cf_tuned$final_fit) dim(Sigma) # 2J × 2J ## ----compare------------------------------------------------------------------ human_only <- fit_mixed_subjects_mml( observed = observed, predicted = predicted, generated = generated, lambda = 0, initial_pars = human_start$pars ) estimates <- rbind( data.frame(estimator = "human only", human_only$item_pars), data.frame(estimator = "MML lambda = 0.5", mixed_mml$item_pars), data.frame(estimator = "MML ability-risk", ability_tuned$best_fit$item_pars) ) estimates$true_b <- true_pars$b[match(estimates$item, true_pars$item)] estimates$true_a <- true_pars$a[match(estimates$item, true_pars$item)] ggplot(estimates, aes(true_b, b, colour = estimator)) + geom_abline(slope = 1, intercept = 0, linewidth = 0.4) + geom_point(size = 2) + labs(x = "True difficulty", y = "Estimated difficulty", colour = NULL) + theme_minimal() ## ----uninformative-predictor-------------------------------------------------- set.seed(242424) bad_pars <- true_pars bad_pars$a <- pmax(0.05, abs(rnorm(n_items, 0, 0.1))) # near-zero slopes bad_pars$d <- rnorm(n_items, 0, 2) # difficulties unrelated to truth predicted_bad <- simulate_2pl(theta_human, bad_pars) generated_bad <- simulate_2pl(rnorm(n_generated), bad_pars) tuned_bad <- tune_lambda_ability_risk( observed = observed, predicted = predicted_bad, generated = generated_bad, target_resp = observed, initial_pars = human_start$pars, fit_fn = fit_mixed_subjects_mml, control = list(maxit = 200) ) tuned_bad$best_lambda # expect ~0: the useless LLM is correctly ignored