This vignette demonstrates how to:
We’ll use two datasets:
# Packages required for this vignette
pkgs <- c(
"rpart", "e1071", "dplyr", "tidyr", "ggplot2",
"rsample", "gridExtra", "kableExtra", "palmerpenguins"
)
# Load each package quietly if available
invisible(lapply(pkgs, function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
stop(sprintf("Package '%s' is required to run this vignette.", pkg))
}
}))
library(svmodt)The Palmer Penguins dataset contains measurements of three penguin species from Antarctica. We’ll build a classifier to distinguish between Adelie and Chinstrap penguins.
# Adelie vs Chinstrap
penguins_data <- palmerpenguins::penguins |>
dplyr::filter(species %in% c("Adelie", "Chinstrap")) |>
dplyr::select(
species, bill_length_mm, bill_depth_mm,
flipper_length_mm, body_mass_g
) |>
na.omit() |>
dplyr::mutate(species = droplevels(species))
set.seed(234)
split_data <- rsample::initial_split(penguins_data, prop = 0.8, strata = species)
train_penguins <- rsample::training(split_data)
test_penguins <- rsample::testing(split_data)# Print tree structure
print(tree_penguins,
show_probabilities = TRUE,
show_feature_info = TRUE
)
#> [Node] depth = 1 | n = 174 | features = [bill_length_mm,flipper_length_mm] | penalty = +
#> |- Positive branch (distance >= 0):
#> | [Node] depth = 2 | n = 121 | features = [bill_length_mm,bill_depth_mm]
#> | |- Positive branch (distance >= 0):
#> | | [Leaf] predict = Adelie | n = 120 | probs = [Adelie = 0.975, Chinstrap = 0.025] | features = [bill_length_mm,bill_depth_mm]
#> | `- Negative branch (distance < 0):
#> | [Leaf] predict = Chinstrap | n = 1 | probs = [Adelie = 0, Chinstrap = 1]
#> `- Negative branch (distance < 0):
#> [Node] depth = 2 | n = 53 | features = [bill_length_mm,bill_depth_mm]
#> |- Positive branch (distance >= 0):
#> | [Leaf] predict = Adelie | n = 2 | probs = [Adelie = 1, Chinstrap = 0]
#> `- Negative branch (distance < 0):
#> [Leaf] predict = Chinstrap | n = 51 | probs = [Adelie = 0.02, Chinstrap = 0.98] | features = [bill_length_mm,bill_depth_mm]# Predict classes only
predictions <- predict(tree_penguins, test_penguins)
# Predict with probabilities
predictions_prob <- predict(tree_penguins, test_penguins,
return_probs = TRUE
)
# View first few predictions
head(data.frame(
Actual = test_penguins$species,
Predicted = predictions_prob$predictions,
Prob_Adelie = round(predictions_prob$probabilities[, "Adelie"], 3),
Prob_Chinstrap = round(predictions_prob$probabilities[, "Chinstrap"], 3)
), 10) |>
kableExtra::kable(align = "lccc", format = "html", caption = "SVMODT Class Predictions with Associated Probabilites on Palmerpenguins dataset") |>
kableExtra::kable_styling(position = "center", full_width = FALSE)| Actual | Predicted | Prob_Adelie | Prob_Chinstrap |
|---|---|---|---|
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
| Adelie | Adelie | 0.975 | 0.025 |
Table 1 shows the predicted value along with the associated probabilities of the first 10 observations of the palmerpenguins dataset. The prediction function returns the majority class at each node along the class proportion. In this table we can observe that all the observations have been correctly predicted.
# Visualize tree decision boundaries
viz_penguins <- plot(tree_penguins,
data = train_penguins,
response = "species",
plot.type = "boundary",
max_depth = 3)
#> $depth_1_RootFigure 1: SVMODT Split at Root Node (Depth = 1)
#>
#> $depth_2_Root_L
Figure 2: SVMODT Split at Root Node (Depth = 1)
#>
#> $depth_2_Root_R
Figure 3: SVMODT Split at Root Node (Depth = 1)
Figure 4: SVMODT Split at Root Node (Depth = 1)
Figure ?? depicts the root node of the SVM-based oblique decision tree, where the first split is performed using the features bill_length_mm and flipper_length_mm. The figure illustrates how the linear SVM at the root node divides the dataset into two branches, guiding samples toward subsequent child nodes.
gridExtra::grid.arrange(viz_penguins$plots$depth_2_Root_L, viz_penguins$plots$depth_2_Root_R, ncol = 2)Figure 2: SVMODT Split at Child Node (Depth = 2)
Figure 2 shows a node at depth 2 of the tree, where the SVM uses bill_length_mm and bill_depth_mm to further partition the data. The decision hyper-plane and sample positions are plotted, highlighting how oblique splits can capture multivariate relationships that univariate thresholds cannot.
trace_path(tree_penguins, test_penguins, sample_idx = 1)
#> === Tracing Prediction Path ===
#> Sample 1 :
#> species = 1
#> bill_length_mm = 38.9
#> bill_depth_mm = 17.8
#> flipper_length_mm = 181
#> body_mass_g = 3625
#>
#> [Node 1 ] features = bill_length_mm,flipper_length_mm
#> SVM decision value: 2.0018
#> -> Going LEFT (decision > 0)
#> [Node 2 ] features = bill_length_mm,bill_depth_mm
#> SVM decision value: 2.5287
#> -> Going LEFT (decision > 0)
#> [FINAL] Predict Adelie (n = 120 )
#> Path taken: LEFT -> LEFT
#>
#> Final prediction: AdelieThe Wisconsin Breast Cancer dataset contains features computed from digitized images of fine needle aspirate (FNA) of breast masses. The task is to classify tumors as Benign (B) or Malignant (M).
svmodt promotes feature diversity by penalizing previously used features in ancestor nodes.
The package also allows user to either randomize or decrease the number of features in child nodes.
# Decrease features with depth
tree_decrease <- svm_split(
data = train_wdbc,
response = "diagnosis",
max_depth = 5,
max_features = 10,
max_features_strategy = "decrease",
max_features_decrease_rate = 0.7,
verbose = FALSE
)
# Random feature selection
tree_random <- svm_split(
data = train_wdbc,
response = "diagnosis",
max_depth = 4,
max_features_strategy = "random",
max_features_random_range = c(0.3, 0.8),
verbose = FALSE
)We can also add custom class weights for our class labels in the model using domain expertise. This enables us to minimize classification of false negatives by assigning correct weights based on domain expertise.
The Wine dataset contains chemical measurements for 178 wine samples from three Italian cultivars. This demonstrates SVMODT’s multiclass capability via one-vs-rest splitting at each node.
set.seed(234)
wine$class <- as.factor(wine$class)
split_wine <- rsample::initial_split(wine, prop = 0.8, strata = class)
train_wine <- rsample::training(split_wine)
test_wine <- rsample::testing(split_wine)At each internal node, SVMODT iterates over all present classes, fits a binary one-vs-rest SVM, and selects the split that maximally reduces entropy. This continues recursively until max_depth is reached or min_impurity_decrease prevents uninformative splits.
Figure 5: SVMODT Decision Surface on Wine Dataset
Figure 5 displays multiclass decision surface on the Wine dataset across two features in the tree.
print(tree_wine,
show_probabilities = FALSE,
show_feature_info = TRUE,
show_penalties = TRUE
)
#> [Node] depth = 1 | n = 141 | split: 1 vs rest | impurity = 0.7237 | features = [flavanoids,proline,od280_od315,color_intensity,hue] | penalty = +
#> |- Positive branch (distance >= 0):
#> | [Node] depth = 2 | n = 49 | features = [color_intensity,alcohol,malic_acid,ash,alcalinity_of_ash]
#> | |- Positive branch (distance >= 0):
#> | | [Leaf] predict = 1 | n = 47
#> | `- Negative branch (distance < 0):
#> | [Leaf] predict = 2 | n = 2
#> `- Negative branch (distance < 0):
#> [Node] depth = 2 | n = 92 | features = [color_intensity,malic_acid,alcohol,flavanoids,od280_od315] | penalty = !
#> |- Positive branch (distance >= 0):
#> | [Node] depth = 3 | n = 54 | features = [alcohol,malic_acid,ash,alcalinity_of_ash,magnesium]
#> | |- Positive branch (distance >= 0):
#> | | [Leaf] predict = 2 | n = 50
#> | `- Negative branch (distance < 0):
#> | [Leaf] predict = 2 | n = 4
#> `- Negative branch (distance < 0):
#> [Node] depth = 3 | n = 38 | features = [alcohol,malic_acid,ash,alcalinity_of_ash,magnesium]
#> |- Positive branch (distance >= 0):
#> | [Leaf] predict = 3 | n = 4
#> `- Negative branch (distance < 0):
#> [Leaf] predict = 3 | n = 34Each internal node shows the class used for the one-vs-rest binary split (split: X vs rest), the features selected at that node, and whether feature penalization was active (penalty = !).
preds_wine <- predict(tree_wine, newdata = test_wine)
acc_wine <- mean(preds_wine == test_wine$class)
cat("Test accuracy:", round(acc_wine, 4), "\n")
#> Test accuracy: 0.9459
conf_mat <- table(Predicted = preds_wine, Actual = test_wine$class)
print(conf_mat)
#> Actual
#> Predicted 1 2 3
#> 1 10 0 0
#> 2 2 15 0
#> 3 0 0 10# Show how the first test observation is routed through the tree
trace_path(tree_wine, test_wine, sample_idx = 1)
#> === Tracing Prediction Path ===
#> Sample 1 :
#> class = 1
#> alcohol = 13.2
#> malic_acid = 1.78
#> ash = 2.14
#> alcalinity_of_ash = 11.2
#> magnesium = 100
#> total_phenols = 2.65
#> flavanoids = 2.76
#> nonflavanoid_phenols = 0.26
#> proanthocyanins = 1.28
#> color_intensity = 4.38
#> hue = 1.05
#> od280_od315 = 3.4
#> proline = 1050
#>
#> [Node 1 ] features = flavanoids,proline,od280_od315,color_intensity,hue
#> SVM decision value: 2.2295
#> -> Going LEFT (decision > 0)
#> [Node 2 ] features = color_intensity,alcohol,malic_acid,ash,alcalinity_of_ash
#> SVM decision value: 2.1619
#> -> Going LEFT (decision > 0)
#> [FINAL] Predict 1 (n = 47 )
#> Path taken: LEFT -> LEFT
#>
#> Final prediction: 1Now, we will compare the performance of our SVMODT model with other classification models such as a Linear SVM and a Decision Tree. For our svmodt tree we are using a tree-depth of 2 with mutual feature selection and penalization. For the decision tree we are setting the cost-complexity parameter as 0.01. Lastly, for the linear SVM model we have not done any hyperparameter tuning and using the default parameter of C = 1.
# RPART decision tree
tree_rpart <- rpart::rpart(diagnosis ~ .,
data = train_wdbc,
control = rpart::rpart.control(cp = 0.01)
)
pred_rpart <- predict(tree_rpart, test_wdbc, type = "class")
tree_wdbc <- svm_split(
data = train_wdbc,
response = "diagnosis",
max_depth = 2,
feature_method = "mutual",
penalize_used_features = TRUE
)
# Standard SVM
model_svm <- e1071::svm(diagnosis ~ ., data = train_wdbc, probability = TRUE)
pred_svm <- predict(model_svm, test_wdbc)
# Get SVMODT predictions
pred_svmodt <- predict(tree_wdbc, test_wdbc)
# Compare accuracies
results <- data.frame(
Model = c("SVMODT", "RPART", "Linear SVM"),
Accuracy = c(
mean(pred_svmodt == test_wdbc$diagnosis),
mean(pred_rpart == test_wdbc$diagnosis),
mean(pred_svm == test_wdbc$diagnosis)
)
)
results |>
kableExtra::kable(
align = "lc", format = "html", digits = 4,
caption = "Comparing Test set Accuracy of SVMODT model with a Linear SVM and a Decision Tree"
) |>
kableExtra::kable_styling(position = "center", full_width = FALSE)| Model | Accuracy |
|---|---|
| SVMODT | 0.9652 |
| RPART | 0.8870 |
| Linear SVM | 0.9565 |
From Table 2 we can observe that our SVMODT model has outperformed both the Linear SVM model and the Rpart Decision tree on the test.