#' Cross-validated Dantzig estimator with missing covariates
#'
#' @importFrom graphics legend abline matplot par
#'
#' @description
#' Performs K-fold cross-validation for the Dantzig selector in linear regression
#' models with missing covariates. The method optionally incorporates unlabelled
#' covariate data to improve estimation of second-moment matrices. This function is based on
#' Section 3 of \insertCite{RisebrowSSLR;textual}{LRMiss}.
#'
#' @usage
#' cv_dantzig_missing(
#'   X, y, X_unlabeled = NULL,
#'   lambdas = NULL, nlambda = 30, lambda_min_ratio = 1e-3,
#'   K = 5, standardise = TRUE, gurobi = FALSE,
#'   seed = 123, fold_ids = NULL, verbose = TRUE,
#'   plot_path = TRUE
#' )
#'
#' @param X Labelled covariates.
#' @param y Response variables for the labelled data.
#' @param X_unlabeled Optional unlabeled covariates.
#' @param lambdas Optional sequence of regularisation parameters.
#' @param nlambda Number of lambdas if \code{lambdas} is not supplied.
#' @param lambda_min_ratio Smallest lambda as a fraction of the largest.
#' @param K Number of cross-validation folds.
#' @param standardise Logical; if TRUE covariates are standardised.
#' @param gurobi Logical; if TRUE uses Gurobi to solve the linear programs.
#' @param seed Random seed for fold assignment.
#' @param fold_ids Optional fold assignments for labelled or combined data.
#' @param verbose Logical; print progress messages.
#' @param plot_path Logical; if TRUE computes and plots the solution path.
#'
#' @details
#' For each candidate value of the regularisation parameter, the Dantzig selector
#' is fitted using moment estimates computed from the training folds. Prediction
#' performance is assessed on held-out folds via the maximum absolute moment
#' mismatch. The tuning parameter is selected using both the minimum mean
#' cross-validation score and the one-standard-error (1-SE) rule.
#'
#' @return A named list with the following components:
#' \describe{
#'   \item{lambdas}{Numeric vector of tuning parameters used.}
#'   \item{cv_scores_matrix}{Numeric matrix of cross-validation scores (folds × lambdas).}
#'   \item{mean_scores}{Mean CV score for each lambda.}
#'   \item{se_scores}{Standard error of CV scores for each lambda.}
#'   \item{lambda_min_mean}{Lambda minimising mean CV score.}
#'   \item{lambda_1se}{Lambda chosen by the 1-SE rule.}
#'   \item{beta_path}{Optional coefficient path matrix (present if \code{plot_path=TRUE}).}
#'   \item{design_colnames}{Optional design column names (matching \code{beta_path} rows).}
#'   \item{beta_est}{Optional saved coefficient vector from full-data path.}
#'   \item{intercept_est}{Optional saved intercept corresponding to \code{beta_est}.}
#' }
#'
#' @examples
#' set.seed(1)
#' n <- 50; p <- 5
#' X <- matrix(rnorm(n * p), n, p)
#' y <- X[, 1] + 0.5 * X[, 2] + rnorm(n)
#' X_unlabeled <- matrix(rnorm(100 * p), 100, p)
#'
#' cv_fit <- cv_dantzig_missing(
#'   X = X,
#'   y = y,
#'   X_unlabeled = X_unlabeled,
#'   K = 5,
#'   nlambda = 20
#' )
#'
#' cv_fit$lambda_1se
#' @references
#' \insertRef{RisebrowSSLR}{LRMiss}
#' @export
cv_dantzig_missing <- function(X, y, X_unlabeled = NULL,
                              lambdas = NULL, nlambda = 30, lambda_min_ratio = 1e-3,
                              K = 5, standardise = TRUE, gurobi = FALSE,
                              seed = 123, fold_ids = NULL, verbose = TRUE,
                              plot_path = TRUE) {
  # CV for Dantzig selector that uses unlabeled data for Sigma exactly as dantzig_missing*
  # X: labelled design (data.frame / matrix)
  # y: labelled responses (length n_labeled)
  # X_unlabeled: optional matrix/data.frame of unlabeled covariates (can be NULL)
  #
  # Returns: list with CV results, lambda choices, and if plot_path=TRUE also beta_path and beta_est (lambda.1se)

  if (!requireNamespace("fastDummies", quietly = TRUE)) stop("Package 'fastDummies' is required")
  if (!requireNamespace("Rglpk", quietly = TRUE)) stop("Package 'Rglpk' is required")
  if (gurobi && !requireNamespace("gurobi", quietly = TRUE)) stop("gurobi = TRUE but package 'gurobi' is not available.")

  set.seed(seed)

  # ---------- 0: basic dimensions ----------
  X_labeled_df <- as.data.frame(X)
  n_labeled <- nrow(X_labeled_df)
  if (length(y) != n_labeled) stop("Length of y must equal number of rows in X (labelled).")

  if (!is.null(X_unlabeled)) {
    X_unlabeled_df <- as.data.frame(X_unlabeled)
    X_combined_df <- rbind(X_labeled_df, X_unlabeled_df)
    n_unlabeled <- nrow(X_unlabeled_df)
  } else {
    X_combined_df <- X_labeled_df
    n_unlabeled <- 0
  }
  n_total <- nrow(X_combined_df)

  # ---------- 1: ensure factor encoding consistent (combined) ----------
  X_combined_df[] <- lapply(X_combined_df, function(col) if (is.character(col)) factor(col) else col)

  if (any(sapply(X_combined_df, function(col) is.factor(col) || is.character(col)))) {
    X_enc <- fastDummies::dummy_cols(
      X_combined_df,
      remove_first_dummy = TRUE,
      remove_selected_columns = TRUE,
      ignore_na = TRUE
    )
    X_enc[] <- lapply(X_enc, as.numeric)
  } else {
    X_enc <- X_combined_df
  }

  X_mat_all <- data.matrix(X_enc)   # rows: labeled (1..n_labeled), then unlabeled (n_labeled+1..n_total)
  p <- ncol(X_mat_all)

  # ---------- 2: folds: balanced assignment for labelled and unlabeled ----------
  # If user supplies fold_ids, accept length n_labeled (assign unlabeled balanced) or n_total (use as-is)
  if (is.null(fold_ids)) {
    # Balanced fold assignment: use rep(1:K, length.out = n) then shuffle to avoid ordering effects
    labelled_folds <- sample(rep(1:K, length.out = n_labeled))
    if (n_unlabeled > 0) {
      unlabeled_folds <- sample(rep(1:K, length.out = n_unlabeled))
      fold_ids_full <- c(labelled_folds, unlabeled_folds)
    } else {
      fold_ids_full <- labelled_folds
    }
  } else {
    if (!is.numeric(fold_ids)) stop("fold_ids must be numeric integer-like vector")
    if (length(fold_ids) == n_labeled) {
      # user supplied folds for labelled rows only -> assign unlabeled in a balanced way
      labelled_folds <- as.integer(fold_ids)
      if (n_unlabeled > 0) {
        unlabeled_folds <- sample(rep(1:K, length.out = n_unlabeled))
        fold_ids_full <- c(labelled_folds, unlabeled_folds)
      } else fold_ids_full <- labelled_folds
    } else if (length(fold_ids) == n_total) {
      fold_ids_full <- as.integer(fold_ids)
    } else {
      stop("fold_ids must have length equal to number of labelled rows or total rows (labelled + unlabeled).")
    }
    # normalize fold ids into 1:K if needed
    uniq_f <- sort(unique(fold_ids_full))
    if (any(uniq_f < 1) || any(uniq_f > K)) {
      # map unique values to 1:K in order
      newmap <- setNames(rep(1:K, length.out = length(uniq_f)), uniq_f)
      fold_ids_full <- as.integer(newmap[as.character(fold_ids_full)])
    }
  }
  fold_ids <- as.integer(fold_ids_full)

  # verify each fold contains at least one labelled row
  labelled_in_fold <- sapply(1:K, function(k) sum(fold_ids[seq_len(n_labeled)] == k))
  if (any(labelled_in_fold == 0)) {
    stop("One or more folds contain no labelled rows. Re-specify K or provide fold_ids ensuring at least one labelled row per fold.")
  }

  # ---------- 3: helper to compute gamma (works on an Xsub aligned with y_sub) ----------
  compute_gamma <- function(Xsub, ysub) {
    p_local <- ncol(Xsub)
    gamma <- numeric(p_local)
    for (j in seq_len(p_local)) {
      colj <- Xsub[, j]
      ok <- !is.na(colj) & !is.na(ysub)
      if (any(ok)) gamma[j] <- mean(colj[ok] * ysub[ok]) else gamma[j] <- 0
    }
    gamma
  }

  # ---------- 4: helper to solve Dantzig LP from Sigma/gamma ----------
  solve_dantzig_from_moments <- function(Sigma_hat, gamma_hat, lambda, gurobi = FALSE) {
    p_loc <- ncol(Sigma_hat)
    Objective <- c(rep(1, p_loc), rep(0, p_loc))
    I <- diag(1, p_loc)
    Z <- matrix(0, p_loc, p_loc)
    mat1 <- cbind(I, I); mat2 <- cbind(I, -I); mat3 <- cbind(I, Z)
    mat4 <- cbind(Z, Sigma_hat); mat5 <- cbind(Z, -Sigma_hat)
    OverallMatrix <- rbind(mat1, mat2, mat3, mat4, mat5)
    rhs <- c(rep(0, p_loc), rep(0, p_loc), rep(0, p_loc), -lambda + gamma_hat, -lambda - gamma_hat)
    dir <- rep(">=", length(rhs))

    if (!gurobi) {
      bounds <- list(
        lower = list(ind = seq_len(2 * p_loc), val = rep(-Inf, 2 * p_loc)),
        upper = list(ind = seq_len(2 * p_loc), val = rep( Inf, 2 * p_loc))
      )
      res <- Rglpk::Rglpk_solve_LP(obj = Objective, mat = OverallMatrix, dir = dir, rhs = rhs,
                                   bounds = bounds, max = FALSE)
      sol <- res$solution
    } else {
      gurobi_model <- list(A = -OverallMatrix, rhs = -rhs, sense = rep("<", length(rhs)),
                           obj = Objective, modelsense = "min",
                           lb = rep(-1e20, 2*p_loc), ub = rep(1e20, 2*p_loc))
      params <- list(OutputFlag = 0)
      res <- gurobi::gurobi(gurobi_model, params)
      sol <- res$x
    }
    beta_hat <- sol[(p_loc + 1):(2 * p_loc)]
    beta_hat[abs(beta_hat) < 1e-12] <- 0
    beta_hat
  }

  # ---------- 5: build lambda grid (data-driven using combined data for encoding; gamma from labelled rows) ----------
  if (is.null(lambdas)) {
    if (standardise) {
      mu_full  <- apply(X_mat_all, 2, function(col) mean(col, na.rm = TRUE))
      sdv_full <- apply(X_mat_all, 2, function(col) stats::sd(col, na.rm = TRUE))
      sdv_full[sdv_full == 0 | is.na(sdv_full)] <- 1
      X_all_for_gamma <- sweep(sweep(X_mat_all, 2, mu_full, "-"), 2, sdv_full, "/")
    } else {
      X_all_for_gamma <- X_mat_all
    }
    gamma_full <- compute_gamma(X_all_for_gamma[1:n_labeled, , drop = FALSE], y)
    gamma_norm <- max(abs(gamma_full), na.rm = TRUE)
    if (!is.finite(gamma_norm) || gamma_norm <= 0) gamma_norm <- 1e-6
    lambda_max <- gamma_norm
    lambda_min <- max(lambda_min_ratio * gamma_norm, .Machine$double.eps)
    lambdas <- exp(seq(log(lambda_max), log(lambda_min), length.out = nlambda))
    lambdas <- sort(lambdas)
  } else {
    if (!is.numeric(lambdas)) stop("`lambdas` must be numeric")
    lambdas <- sort(unique(as.numeric(lambdas)))
  }

  # ---------- 6: cross-validation loop (folds defined on combined data; Sigma uses combined fold rows; gamma uses labelled subset) ----------
  cv_scores <- matrix(NA, nrow = K, ncol = length(lambdas),
                      dimnames = list(paste0("fold", 1:K), paste0("lam_", signif(lambdas, 3))))

  for (k in seq_len(K)) {
    if (verbose) message("Processing fold ", k, "/", K)

    # indices in combined dataset assigned to test/train
    test_comb_idx <- which(fold_ids == k)
    train_comb_idx <- setdiff(seq_len(n_total), test_comb_idx)

    # labeled indices in train/test (subset of 1:n_labeled)
    train_label_idx <- intersect(train_comb_idx, seq_len(n_labeled))
    test_label_idx  <- intersect(test_comb_idx,  seq_len(n_labeled))

    if (length(train_label_idx) == 0 || length(test_label_idx) == 0) {
      stop(sprintf("Fold %d has zero labelled rows in train or test. Recreate folds so each fold has labelled rows.", k))
    }

    # Combined matrices for Sigma (train/test include both labeled and unlabeled rows as assigned by fold_ids)
    X_train_comb_raw <- X_mat_all[train_comb_idx, , drop = FALSE]
    X_test_comb_raw  <- X_mat_all[test_comb_idx,  , drop = FALSE]

    y_train <- y[train_label_idx]
    y_test  <- y[test_label_idx]

    # Standardise using training combined (train labeled + train unlabeled) if requested
    if (standardise) {
      mu <- apply(X_train_comb_raw, 2, function(col) mean(col, na.rm = TRUE))
      sdv <- apply(X_train_comb_raw, 2, function(col) stats::sd(col, na.rm = TRUE))
      sdv[sdv == 0 | is.na(sdv)] <- 1
      X_train_comb_std <- sweep(sweep(X_train_comb_raw, 2, mu, "-"), 2, sdv, "/")
      X_test_comb_std  <- sweep(sweep(X_test_comb_raw,  2, mu, "-"), 2, sdv, "/")
      # label-only standardized subsets for gamma computation
      pos_train_label_in_comb <- match(train_label_idx, train_comb_idx)
      pos_test_label_in_comb  <- match(test_label_idx,  test_comb_idx)
      X_train_label_std <- X_train_comb_std[pos_train_label_in_comb, , drop = FALSE]
      X_test_label_std  <- X_test_comb_std[pos_test_label_in_comb, , drop = FALSE]
    } else {
      X_train_comb_std <- X_train_comb_raw
      X_test_comb_std  <- X_test_comb_raw
      X_train_label_std <- X_mat_all[train_label_idx, , drop = FALSE]
      X_test_label_std  <- X_mat_all[test_label_idx,  , drop = FALSE]
    }

    # Compute Sigma and gamma for train and test:
    Sigma_train <- estimate_cov_raw(X_train_comb_std)
    gamma_train <- compute_gamma(X_train_label_std, y_train)
    Sigma_test  <- estimate_cov_raw(X_test_comb_std)
    gamma_test  <- compute_gamma(X_test_label_std,  y_test)

    # Evaluate for each lambda: fit on train moments (which used combined data), test on test moments
    for (li in seq_along(lambdas)) {
      lam <- lambdas[li]
      beta_hat_train <- solve_dantzig_from_moments(Sigma_train, gamma_train, lam, gurobi = gurobi)
      vec_diff <- as.vector(Sigma_test %*% beta_hat_train - gamma_test)
      cv_scores[k, li] <- max(abs(vec_diff), na.rm = TRUE)
    }
  }

  mean_scores <- colMeans(cv_scores, na.rm = TRUE)
  se_scores <- apply(cv_scores, 2, function(x) sd(x, na.rm = TRUE)/sqrt(K))

  lambda_min_mean <- lambdas[which.min(mean_scores)]
  lambda_1se <- max(lambdas[mean_scores <= min(mean_scores) + se_scores[which.min(mean_scores)]])

  if (verbose) {
    message("Lambda.min (min mean CV error): ", lambda_min_mean)
    message("Lambda.1se (default choice): ", lambda_1se)
  }

  result <- list(
    lambdas = lambdas,
    cv_scores_matrix = cv_scores,
    mean_scores = mean_scores,
    se_scores = se_scores,
    lambda_min_mean = lambda_min_mean,
    lambda_1se = lambda_1se
  )

  # ---------- 7: solution path on full combined data and save beta_est for lambda.1se ----------
  if (plot_path) {
    if (verbose) message("Computing full-data solution path for each lambda (this may take time)...")
    beta_path <- matrix(0, nrow = p, ncol = length(lambdas))
    rownames(beta_path) <- colnames(X_mat_all)
    colnames(beta_path) <- paste0("lam_", signif(lambdas, 6))
    intercepts <- rep(NA_real_, length(lambdas))

    for (i in seq_along(lambdas)) {
      lam <- lambdas[i]
      fit_full <- dantzig_missing(X_labeled = X_labeled_df, y = y, X_unlabeled = X_unlabeled,
                                  lambda = lam, gurobi = gurobi, standardise = standardise)
      beta_path[, i] <- as.numeric(fit_full$beta_hat)
      if (!is.null(fit_full$intercept)) intercepts[i] <- as.numeric(fit_full$intercept)
    }

    # Determine index for lambda.1se
    idx_min <- which.min(abs(lambdas - lambda_min_mean))

    beta_est <- beta_path[, idx_min]
    names(beta_est) <- rownames(beta_path)

    intercept_est <- if (!all(is.na(intercepts))) intercepts[idx_min] else NULL


    # plot
    op <- par(no.readonly = TRUE); on.exit(par(op), add = TRUE)
    matplot(log(lambdas), t(beta_path), type = "l", lty = 1, col = 1:p,
            xlab = "log(lambda)", ylab = "Coefficients", main = "Dantzig solution path")

    # vertical lines for lambda.min and lambda.1se
    abline(v = log(lambda_min_mean), col = "blue", lwd = 2, lty = 2)
    abline(v = log(lambda_1se), col = "red", lwd = 2, lty = 2)
    legend("topright", legend = c("lambda.min","lambda.1se"), col = c("blue","red"), lty = 2, lwd = 2)

    result$beta_path <- beta_path
    result$design_colnames <- colnames(X_mat_all)
    # add the saved estimate for lambda.1se
    result$beta_est <- beta_est
    result$intercept_est <- intercept_est
  }

  result
}
