#' Detect and analyze groups in a scientific network
#'
#' This function identifies and analyzes groups (communities) within scientific networks
#' created from articles and patents data. It can apply different clustering algorithms
#' to detect technological trajectories and emerging scientific fields.
#'
#' @param comps A list containing network components, typically generated by
#'   \code{\link{sniff_components}}(). Its \code{network} element must be a
#'   \code{tbl_graph} or \code{igraph} object with 'component' and 'PY'
#'   (publication year) vertex attributes. Nodes are matched back to the network
#'   through their 'name' vertex attribute.
#' @param min_group_size Minimum size for a detected group to be included in results
#'   (default = 10). Must be a single number >= 1. The threshold is applied to the
#'   groups detected inside clustered components; components that are kept but
#'   not clustered are returned as single groups regardless of their size.
#' @param keep_component Character vector specifying which network components to
#'   process (default = "c1"). Can include multiple components.
#' @param cluster_component Character vector specifying which components should be
#'   clustered (default = "c1"). Kept components not listed here are treated
#'   as single groups. Entries that are not also in \code{keep_component} are
#'   ignored.
#' @param algorithm Community detection algorithm to use (default = "fast_greedy").
#'   A single string, one of: "louvain", "walktrap", "edge_betweenness",
#'   "fast_greedy", or "leiden".
#' @param seed Random seed for reproducible results (default = 888L). Only used
#'   by the "louvain" and "leiden" algorithms, for which \code{set.seed(seed)}
#'   is called right before clustering.
#'
#' @return A list with three elements:
#' \itemize{
#'   \item \code{aggregate}: A data frame with one row per group and the columns
#'     \code{group} (group name), \code{quantity_papers} (number of nodes) and
#'     \code{average_age} (mean of \code{PY} within the group)
#'   \item \code{network}: The input network restricted to the nodes that belong
#'     to a retained group, with \code{group}, \code{quantity_papers} and
#'     \code{average_age} added as node attributes
#'   \item \code{pubs_by_year}: Publication counts by group and year, with the
#'     columns \code{group}, \code{year} and \code{publications}
#' }
#'
#' @details The function first validates the input network, then applies the specified
#' clustering algorithm to detect communities within the network. It calculates
#' statistics for each detected group and returns the results along with the
#' augmented network. The function can handle multiple network components
#' simultaneously, applying clustering only to specified components.
#'
#' Groups detected inside a clustered component are named
#' \code{<component>g<rank>}, where rank 1 is the largest group of that
#' component. Any error raised while grouping is re-thrown with the prefix
#' "Error in grouping network: ".
#'
#' @examples
#' \dontrun{
#' # Assuming 'comps' is output from sniff_components()
#' groups <- sniff_groups(comps,
#'   min_group_size = 15,
#'   algorithm = "leiden",
#'   seed = 888L
#' )
#'
#' # Access group statistics
#' groups$aggregate
#' groups$network
#' groups$pubs_by_year
#' }
#'
#' @seealso \code{\link{sniff_components}()} for creating the input network components
#' @rdname sniff_groups
#' @export
#' @importFrom igraph vertex_attr_names cluster_louvain as_undirected cluster_walktrap cluster_edge_betweenness cluster_fast_greedy cluster_leiden V
#' @importFrom tidygraph as_tbl_graph activate
#' @importFrom dplyr filter as_tibble group_by summarise n arrange desc mutate select left_join tally rename ungroup
#' @importFrom purrr map set_names map_dfr
#' @importFrom tibble as_tibble
sniff_groups <- function(
  comps,
  min_group_size = 10,
  keep_component = c("c1"),
  cluster_component = c("c1"),
  algorithm = "fast_greedy",
  seed = 888L) {
  # Input validation ----
  if (!is.list(comps)) {
    stop(
      "Input must be a list generated by sniff_components() function",
      call. = FALSE
    )
  }

  net <- comps$network

  if (!inherits(net, c("tbl_graph", "igraph"))) {
    stop("Input must be a network object (tbl_graph or igraph)", call. = FALSE)
  }

  # Scalar arguments are checked for length and NA explicitly so that a vector
  # or missing value yields our message instead of a cryptic `if`/`||` error.
  if (!is.numeric(min_group_size) || length(min_group_size) != 1L ||
    is.na(min_group_size) || min_group_size < 1) {
    stop("min_group_size must be a positive integer", call. = FALSE)
  }

  valid_algorithms <- c(
    "louvain", "walktrap", "edge_betweenness", "fast_greedy", "leiden"
  )
  if (!is.character(algorithm) || length(algorithm) != 1L ||
    !algorithm %in% valid_algorithms) {
    stop(
      "algorithm must be one of: ", paste(valid_algorithms, collapse = ", "),
      call. = FALSE
    )
  }

  # is.numeric() is already TRUE for integers, so one test is enough.
  if (!is.numeric(seed)) {
    stop("seed must be a numeric or integer value", call. = FALSE)
  }

  vertex_attrs <- igraph::vertex_attr_names(net)

  if (!"component" %in% vertex_attrs) {
    stop(
      "Network must contain 'component' vertex attribute, generated by sniff_components()",
      call. = FALSE
    )
  }

  if (!"PY" %in% vertex_attrs) {
    stop(
      "Network must contain 'PY' (publication year) vertex attribute, generated by sniff_components()",
      call. = FALSE
    )
  }

  # `nodes` is only ever used unevaluated inside tidygraph::activate(); the
  # NULL binding keeps R CMD check from reporting an undefined global.
  nodes <- NULL

  # Local helpers ----

  # Node table of a graph as a tibble.
  node_table <- function(graph) {
    graph |>
      tidygraph::as_tbl_graph() |>
      tidygraph::activate(nodes) |>
      dplyr::as_tibble()
  }

  # Run the selected community detection algorithm on one component.
  detect_communities <- function(graph) {
    switch(algorithm,
      "louvain" = {
        set.seed(seed)
        igraph::cluster_louvain(igraph::as_undirected(graph))
      },
      "walktrap" = igraph::cluster_walktrap(graph),
      "edge_betweenness" = igraph::cluster_edge_betweenness(graph),
      "fast_greedy" = igraph::cluster_fast_greedy(igraph::as_undirected(graph)),
      "leiden" = {
        set.seed(seed)
        # NOTE(review): called with igraph's default arguments; igraph
        # documents "CPM" as the default objective function, which can give a
        # very different partition from modularity -- confirm this is intended.
        igraph::cluster_leiden(igraph::as_undirected(graph))
      }
    )
  }

  # Cluster one component; return per-group statistics and node -> group ids.
  summarise_clustered <- function(component_net) {
    communities <- detect_communities(component_net)
    igraph::V(component_net)$group <- communities$membership
    node_tbl <- node_table(component_net)

    # Rank groups by size *before* dropping the small ones, so the surviving
    # names are <component>g1, <component>g2, ... in decreasing size.
    ranked <- node_tbl |>
      dplyr::group_by(.data$group) |>
      dplyr::summarise(
        quantity_papers = dplyr::n(),
        average_age = mean(.data$PY, na.rm = TRUE),
        component = unique(.data$component),
        .groups = "drop"
      ) |>
      dplyr::arrange(.data$component, dplyr::desc(.data$quantity_papers)) |>
      dplyr::mutate(
        group_old = .data$group, # original membership id, used as join key
        # seq_len() rather than 1:n(): stays valid when there are zero groups.
        group_new = paste0(.data$component, "g", seq_len(dplyr::n()))
      ) |>
      dplyr::filter(.data$quantity_papers >= min_group_size)

    group_lookup <- dplyr::select(ranked, "group_old", "group_new")

    # Nodes whose group was filtered out get NA from the join and are dropped.
    node_groups <- node_tbl |>
      dplyr::left_join(group_lookup, by = c("group" = "group_old")) |>
      dplyr::filter(!is.na(.data$group_new)) |>
      dplyr::select("name", group = "group_new")

    group_stats <- dplyr::select(
      ranked,
      group = "group_new", "quantity_papers", "average_age"
    )

    list(aggregate = group_stats, ids = node_groups)
  }

  # Treat one whole component as a single group named after the component.
  summarise_whole <- function(component_net) {
    node_tbl <- node_table(component_net)

    group_stats <- node_tbl |>
      dplyr::group_by(.data$component) |>
      dplyr::summarise(
        quantity_papers = dplyr::n(),
        average_age = mean(.data$PY, na.rm = TRUE),
        .groups = "drop"
      ) |>
      dplyr::select(group = "component", "quantity_papers", "average_age")

    node_groups <- dplyr::select(node_tbl, "name", group = "component")

    list(aggregate = group_stats, ids = node_groups)
  }

  # Grouping ----
  tryCatch(
    {
      # Keep only the requested components; PY is coerced to numeric so that
      # the per-group mean can be computed.
      kept_net <- net |>
        tidygraph::as_tbl_graph() |>
        tidygraph::activate(nodes) |>
        dplyr::mutate(PY = as.numeric(.data$PY)) |>
        dplyr::filter(.data$component %in% keep_component)

      # One sub-network per kept component, named by component id.
      comp <- purrr::map(keep_component, \(comp_id) {
        kept_net |>
          tidygraph::activate(nodes) |>
          dplyr::filter(.data$component == comp_id)
      }) |>
        purrr::set_names(keep_component)

      to_cluster <- comp[intersect(keep_component, cluster_component)]
      no_cluster <- comp[setdiff(keep_component, cluster_component)]

      # Mapping over an empty list gives an empty list, so no special case is
      # needed when every kept component is clustered (or none is).
      all_results <- c(
        purrr::map(to_cluster, summarise_clustered),
        purrr::map(no_cluster, summarise_whole)
      )

      aggregates <- purrr::map_dfr(all_results, "aggregate")
      ids <- purrr::map_dfr(all_results, "ids")

      # Attach group id and group statistics to the original network and keep
      # only the nodes that ended up in a retained group.
      net_groups <- net |>
        tidygraph::as_tbl_graph() |>
        tidygraph::activate(nodes) |>
        dplyr::left_join(ids, by = "name") |>
        dplyr::left_join(aggregates, by = "group") |>
        dplyr::filter(.data$group %in% aggregates$group)

      # Publication counts per group and year.
      groups_year <- net_groups |>
        tidygraph::activate(nodes) |>
        tibble::as_tibble() |>
        dplyr::filter(!is.na(.data$group)) |>
        dplyr::group_by(.data$group, .data$PY) |>
        dplyr::tally(sort = FALSE, name = "publications") |>
        dplyr::arrange(.data$PY, .data$group) |>
        dplyr::rename(year = "PY") |>
        dplyr::ungroup()

      list(
        aggregate = aggregates,
        network = net_groups,
        pubs_by_year = groups_year
      )
    },
    error = function(e) {
      stop("Error in grouping network: ", conditionMessage(e), call. = FALSE)
    }
  )
}
