MSCA

Background

This library implements basic tools to conduct unsupervised learning (clustering) of instances - such as patients - that are described by multiple censored time-to-event endpoints. It has been developed for situations where events are not associated with a change in state - as they are in the social sciences, where events mark a change in status from one state to another - but instead have an additive impact, such as multiple long-term conditions in patients. This short vignette (in progress) briefly describes how to conduct a full analysis using a toy dataset.

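Unsupervised analysis workflows based on distances / dissimilarities between the analysed instances are conducted through the following steps:

1. Build a state matrix for each instance from the time-stamped records.
2. Compute the pairwise distance / dissimilarity between instances.
3. Cluster the instances from the distance matrix to derive a typology.
4. Describe the resulting clusters.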

The main purpose of the proposed tools is to compute the Jaccard distance between patients on multiple censored time-to-event indicators. As a result, patients having similar trajectories are expected to be clustered together, whereas patients with divergent health trajectories are likely to be assigned to different clusters.

In the first section we will show how to construct censored state matrices from time-stamped records (electronic health records) using simulated electronic health records. In section 2, we will show how to compute patient dissimilarities and derive a simple typology. In section 3 we will illustrate the use of the CLARA procedure in this setting when a larger set of patients (> 15000) has to be analysed.

From electronic health records to state matrices

Load data and compute individual patient state matrices

library(MSCA)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Load the toy EHR dataset and preview its first six records
data("EHR")
head(EHR, n = 6)
#> # A tibble: 6 × 3
#>   link_id  reg      aos
#>   <chr>    <chr>  <dbl>
#> 1 K_610836 ltc_30  83.0
#> 2 K_739086 ltc_22  73.0
#> 3 K_661210 ltc_15  79.9
#> 4 K_866970 ltc_31  30.6
#> 5 K_270151 ltc_31  39.6
#> 6 K_243582 ltc_16  63.5
# Total number of records in the dataset
nrow(EHR)
#> [1] 4856

Our toy dataset is composed of 4856 records covering 35 long-term conditions and two absorbing states (death or censoring).

# Number of records per state (conditions plus the two absorbing states)
EHR %>%
  count(reg)
#> # A tibble: 37 × 2
#>    reg        n
#>    <chr>  <int>
#>  1 cens    1225
#>  2 death    927
#>  3 ltc_1     88
#>  4 ltc_10   171
#>  5 ltc_11    21
#>  6 ltc_12    83
#>  7 ltc_13   122
#>  8 ltc_14    11
#>  9 ltc_15    91
#> 10 ltc_16   259
#> # ℹ 27 more rows

The function make_state_matrices() is needed to obtain the individual patient state matrices:

# Build the censored state matrices from the long-format records.
# id / ltc / aos name the EHR columns holding the patient identifier, the
# recorded state and its time stamp; fail_code / cens_code are the labels of
# the two absorbing states found in `reg`.
s_mat <- make_state_matrices(
  data = EHR,
  id = "link_id",
  ltc = "reg",
  aos = "aos",
  l = 111, # NOTE(review): presumably the upper bound of the time grid - confirm
  fail_code = "death",
  cens_code = "cens"
)

# Dimensions of the resulting matrix (rows, columns)
dim(s_mat)
#> [1] 4144 2152

Compute the Jaccard distance between patients

The use of fast_jaccard_dist() speeds up the computation of the Jaccard distance between patients.

library( cluster )
library( fastcluster )
#> 
#> Attaching package: 'fastcluster'
#> The following object is masked from 'package:stats':
#> 
#>     hclust
# Pairwise Jaccard distance between patients (as.dist = TRUE so the result
# can be passed directly to hclust)
d_mat <- fast_jaccard_dist(s_mat, as.dist = TRUE)

# Ward hierarchical clustering. With fastcluster attached, hclust() is the
# fastcluster implementation, which masks stats::hclust.
h_mat <- hclust(d = d_mat, method = "ward.D2")
h_mat
#> 
#> Call:
#> hclust(d = d_mat, method = "ward.D2")
#> 
#> Cluster method   : ward.D2 
#> Number of objects: 2152

# Derive a typology by cutting the dendrogram into 8 clusters
ct_mat_8 <- cutree(tree = h_mat, k = 8)

# Cluster sizes
table(ct_mat_8)
#> ct_mat_8
#>    1    2    3    4    5    6    7    8 
#> 1172  116  242  129  157   91  136  109

Analyse clusters and get sequences statistics

Once a typology has been defined, it becomes interesting to obtain basic sequence statistics by cluster. To do so, a little data manipulation is needed:

# Lookup table: one row per patient id with its cluster label ("cl_<k>")
df1 <- data.frame(
  link_id = names(ct_mat_8),
  cl = paste0("cl_", ct_mat_8)
)
head(df1)
#>    link_id   cl
#> 1  K_10030 cl_1
#> 2 K_101275 cl_1
#> 3  K_10227 cl_1
#> 4 K_102385 cl_1
#> 5 K_102612 cl_1
#> 6 K_103518 cl_1

# Attach each patient's cluster label to the primary records.
# The join key is given explicitly so the merge does not depend on whichever
# column names the two tables happen to share, and no "Joining with" message
# is emitted.
EHR_cl <- EHR %>%
  left_join(df1, by = "link_id")

# Extract the event sequences observed within each cluster
dt_seq <- get_cluster_sequences(
  dt = EHR_cl,
  cl_col = "cl",
  id_col = "link_id",
  event_col = "reg",
  k = 2 # NOTE(review): presumably the sequence length (pairs) - confirm
)

# Per-cluster sequence statistics; only the sequence-frequency threshold is
# active here, the other two thresholds are set to 0
sequence_stats(
  seq_list = dt_seq$sequences,
  min_seq_freq = 0.03,
  min_conditional_prob = 0,
  min_relative_risk = 0
)
#> $cl_7
#> # A tibble: 2 × 6
#>   from  to    seq_count seq_freq conditional_prob relative_risk
#>   <chr> <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_6 cens         82    0.383            0.536          1.15
#> 2 ltc_6 death        54    0.252            0.353          1.03
#> 
#> $cl_3
#> # A tibble: 2 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_22 cens        133    0.382            0.498          1.10
#> 2 ltc_22 death       109    0.313            0.408          1.06
#> 
#> $cl_4
#> # A tibble: 2 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_31 cens         90    0.370            0.584          1.12
#> 2 ltc_31 death        39    0.160            0.253          1.12
#> 
#> $cl_2
#> # A tibble: 2 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_16 death        64    0.358            0.504          1.07
#> 2 ltc_16 cens         52    0.291            0.409          1.18
#> 
#> $cl_1
#> # A tibble: 5 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_10 cens         76   0.0433            0.422         0.968
#> 2 ltc_10 death        76   0.0433            0.422         1.10 
#> 3 ltc_13 cens         69   0.0393            0.535         1.23 
#> 4 ltc_23 cens         65   0.0371            0.546         1.25 
#> 5 ltc_12 cens         53   0.0302            0.582         1.34 
#> 
#> $cl_8
#> # A tibble: 2 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_16 cens         83    0.464            0.654          1.08
#> 2 ltc_16 death        26    0.145            0.205          1.11
#> 
#> $cl_5
#> # A tibble: 2 × 6
#>   from  to    seq_count seq_freq conditional_prob relative_risk
#>   <chr> <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_7 cens         93    0.355            0.505          1.08
#> 2 ltc_7 death        64    0.244            0.348          1.13
#> 
#> $cl_6
#> # A tibble: 3 × 6
#>   from   to    seq_count seq_freq conditional_prob relative_risk
#>   <chr>  <chr>     <int>    <dbl>            <dbl>         <dbl>
#> 1 ltc_18 cens         63     0.36            0.573         1.17 
#> 2 ltc_18 death        28     0.16            0.255         1.09 
#> 3 ltc_22 cens          7     0.04            0.467         0.950