Download a copy of the vignette to follow along here: label_propagation.Rmd
In this vignette, we will walk through label propagation in the
metasnf package. Code from this vignette is largely taken
from the end of the less
simple example vignette.
The label propagation procedure can be used to predict cluster membership for new, unlabeled observations based on their similarity to previously labeled observations. These unlabeled observations could be a held out test set from your original sample or a new sample entirely.
The process involves the following steps:
There is a lot of room for flexibility in how steps 1 and 2 are conducted. SNF is not necessary at any part of the process. For example, step one could be done by assigning clusters in your training set manually or by a simple clustering method like k-means. Step two could be done just by calculating the euclidean distances across all the training and testing observations for a small subset of features. The features used to calculate the similarities in step 2 don’t necessarily need to be the same ones used to derive the cluster solution in the training set either.
All that aside, we show here a simple approach that involves
assigning the clusters by a call to batch_snf, assembling a
data list that has the training and testing set observations, and
feeding the results into a simple label propagating function,
label_propagate.
library(metasnf)
# Function to identify obervations with complete data
uids_with_complete_obs <- get_complete_uids(
    list(
        cort_t,
        cort_sa,
        subc_v,
        income,
        pubertal,
        anxiety,
        depress
    ),
    uid = "unique_id"
)
# Dataframe assigning 80% of observations to train and 20% to test
train_test_split <- train_test_assign(
    train_frac = 0.8,
    uids = uids_with_complete_obs
)
# Pulling the training and testing observations specifically
train_obs <- train_test_split$"train"
test_obs <- train_test_split$"test"
# Partition a training set
train_cort_t <- cort_t[cort_t$"unique_id" %in% train_obs, ]
train_cort_sa <- cort_sa[cort_sa$"unique_id" %in% train_obs, ]
train_subc_v <- subc_v[subc_v$"unique_id" %in% train_obs, ]
train_income <- income[income$"unique_id" %in% train_obs, ]
train_pubertal <- pubertal[pubertal$"unique_id" %in% train_obs, ]
train_anxiety <- anxiety[anxiety$"unique_id" %in% train_obs, ]
train_depress <- depress[depress$"unique_id" %in% train_obs, ]
# Partition a test set
test_cort_t <- cort_t[cort_t$"unique_id" %in% test_obs, ]
test_cort_sa <- cort_sa[cort_sa$"unique_id" %in% test_obs, ]
test_subc_v <- subc_v[subc_v$"unique_id" %in% test_obs, ]
test_income <- income[income$"unique_id" %in% test_obs, ]
test_pubertal <- pubertal[pubertal$"unique_id" %in% test_obs, ]
test_anxiety <- anxiety[anxiety$"unique_id" %in% test_obs, ]
test_depress <- depress[depress$"unique_id" %in% test_obs, ]
# Find cluster solutions in the training set
train_dl <- data_list(
    list(train_cort_t, "cort_t", "neuroimaging", "continuous"),
    list(train_cort_sa, "cortical_sa", "neuroimaging", "continuous"),
    list(train_subc_v, "subc_v", "neuroimaging", "continuous"),
    list(train_income, "household_income", "demographics", "continuous"),
    list(train_pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "unique_id"
)
# We'll pick a solution that has good separation over our target features
train_target_dl <- data_list(
    list(train_anxiety, "anxiety", "behaviour", "ordinal"),
    list(train_depress, "depressed", "behaviour", "ordinal"),
    uid = "unique_id"
)
set.seed(42)
sc <- snf_config(
    train_dl,
    n_solutions = 5,
    min_k = 10,
    max_k = 30
)
#> ℹ No distance functions specified. Using defaults.
#> ℹ No clustering functions specified. Using defaults.
train_sol_df <- batch_snf(
    train_dl,
    sc,
    return_sim_mats = TRUE
)
ext_sol_df <- extend_solutions(
    train_sol_df,
    train_target_dl
)
# Determining solution with the lowest minimum p-value
lowest_min_pval <- min(ext_sol_df$"min_pval")
which(ext_sol_df$"min_pval" == lowest_min_pval)
#> [1] 1
top_row <- ext_sol_df[1, ]
# Propagate that solution to the observations in the test set
# data list below has both training and testing observations
full_dl <- data_list(
    list(cort_t, "cort_t", "neuroimaging", "continuous"),
    list(cort_sa, "cort_sa", "neuroimaging", "continuous"),
    list(subc_v, "subc_v", "neuroimaging", "continuous"),
    list(income, "household_income", "demographics", "continuous"),
    list(pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "unique_id"
)
# Use the solutions data frame from the training observations and the data list from
# the training and testing observations to propagate labels to the test observations
propagated_labels <- label_propagate(top_row, full_dl)
head(propagated_labels)
#>                    uid     group 1
#> 1 uid_NDAR_INV0567T2Y9 clustered 1
#> 2 uid_NDAR_INV0J4PYA5F clustered 2
#> 3 uid_NDAR_INV10OMKVLE clustered 1
#> 4 uid_NDAR_INV15FPCW4O clustered 1
#> 5 uid_NDAR_INV19NB4RJK clustered 1
#> 6 uid_NDAR_INV1HLGR738 clustered 1
tail(propagated_labels)
#>                     uid       group 1
#> 82 uid_NDAR_INVG5CI7XK4 unclustered 1
#> 83 uid_NDAR_INVGDBYXWV4 unclustered 1
#> 84 uid_NDAR_INVHEUWA52I unclustered 2
#> 85 uid_NDAR_INVK9ULDQA2 unclustered 1
#> 86 uid_NDAR_INVKYH529RD unclustered 1
#> 87 uid_NDAR_INVLDQH8ATK unclustered 1You could, if you wanted, see how all of your clustering solutions propagate to the test set, but that would mean reusing your test set and removing much of the protection against overfitting provided by this procedure.
propagated_labels_all <- label_propagate(ext_sol_df, full_dl)
head(propagated_labels_all)
#>                    uid     group 1 2 3 4  5
#> 1 uid_NDAR_INV0567T2Y9 clustered 1 1 5 1 10
#> 2 uid_NDAR_INV0J4PYA5F clustered 2 1 5 1  3
#> 3 uid_NDAR_INV10OMKVLE clustered 1 1 3 2  5
#> 4 uid_NDAR_INV15FPCW4O clustered 1 1 4 1  4
#> 5 uid_NDAR_INV19NB4RJK clustered 1 1 8 2  9
#> 6 uid_NDAR_INV1HLGR738 clustered 1 2 8 1  9
tail(propagated_labels_all)
#>                     uid       group 1 2 3 4 5
#> 82 uid_NDAR_INVG5CI7XK4 unclustered 1 1 2 1 2
#> 83 uid_NDAR_INVGDBYXWV4 unclustered 1 1 4 1 4
#> 84 uid_NDAR_INVHEUWA52I unclustered 2 1 1 2 1
#> 85 uid_NDAR_INVK9ULDQA2 unclustered 1 1 1 1 1
#> 86 uid_NDAR_INVKYH529RD unclustered 1 1 7 1 7
#> 87 uid_NDAR_INVLDQH8ATK unclustered 1 1 6 2 8