## ----include = FALSE, warning=FALSE-------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message = FALSE, warning = FALSE----------------------------------
library(mums2)
library(tidyverse)
library(networkD3)

## ----setup_dt, message = FALSE, warning = FALSE, include = FALSE--------------
Sys.setenv("OMP_THREAD_LIMIT" = 1)
data.table::setDTthreads(threads = 1)

## ----import_data--------------------------------------------------------------
data <- import_all_data(
  peak_table = mums2_example("boryillus_peaktable.csv"), 
  metadata = mums2_example("boryillus_metadata.csv"), 
  format = "Progenesis")

## ----peak_table, R.options=list(max.print=10)---------------------------------
read_csv(mums2_example("boryillus_peaktable.csv"), skip = 2)

## ----metadata, R.options=list(max.print=10)-----------------------------------
read_csv(mums2_example("boryillus_metadata.csv"))

## ----filter_data,  R.options=list(max.print=10)-------------------------------
filtered_data <- data |>
  filter_peak_table(filter_mispicked_ions_params()) |>
  filter_peak_table(filter_cv_params(cv_threshold = 0.2)) |>
  filter_peak_table(filter_group_params(group_threshold = 0.1,
                                            "Blanks")) |>
  filter_peak_table(filter_insource_ions_params())

filtered_data

## ----show_rt_of_filtered_data, R.options=list(max.print=5)--------------------
get_peak_table(filtered_data)$rt

## ----convert_rt_to_minutes_or_seconds, R.options=list(max.print=5)------------
filtered_data <- change_rt_to_seconds_or_minute(
  mpactr_object = filtered_data, rt_type = "seconds"
)

filtered_data

## ----R.options=list(max.print=5)----------------------------------------------
filtered_data <- change_rt_to_seconds_or_minute(
  mpactr_object = filtered_data, rt_type = "minutes"
)

filtered_data

## ----preprocess_ms2_data------------------------------------------------------
matched_data <- ms2_ms1_compare(
  ms2_files = mums2_example("botryllus_v2.gnps.mgf"),
  mpactr_object = filtered_data, mz_tolerance = 5, rt_tolerance = 6)

head(get_ms2_matches(matched_data))

head(get_ms1_data(matched_data))

get_ms2_peaks_data(matched_data)[1]

get_samples(matched_data)

## ----chemical_formula_prediction, R.options=list(max.print=10)----------------
data_small <- import_all_data(
  peak_table = mums2_example("botryllus_pt_small.csv"), 
  metadata = mums2_example("boryillus_metadata.csv"), 
  format = "None") |> 
    filter_peak_table(filter_mispicked_ions_params()) |>
    filter_peak_table(filter_cv_params(cv_threshold = 0.05)) |>
    filter_peak_table(filter_group_params(group_threshold = 0.1,
                                              "Blanks")) |>
    filter_peak_table(filter_insource_ions_params())

matched_data_small <- ms2_ms1_compare(
  ms2_files = mums2_example("botryllus_v2.gnps.mgf"),
  mpactr_object = data_small, mz_tolerance = .5, rt_tolerance = 6)


matched_data_small <- compute_molecular_formulas(
  mass_data = matched_data_small, parent_ppm = 3, number_of_threads = 2)

get_molecular_formula_preds(matched_data_small)

## ----annotations--------------------------------------------------------------
reference_db <- read_msp(msp_file = mums2_example("massbank_example_data.msp"))

annotations <- annotate_ms2(
  mass_data = matched_data, reference = reference_db,
  scoring_params = modified_cosine_params(0.5), ppm = 1000,
  min_score =  0.1, chemical_min_score = 0, number_of_threads = 2)

annotations[1:5,]

## ----scoring_distance, R.options=list(max.print=10)---------------------------
dist <- dist_ms2(
  data = matched_data, cutoff = 0.3, precursor_threshold = -1,
  score_params = modified_cosine_params(0.5), min_peaks = 0, number_of_threads = 2)

dist

## ----clustering, R.options=list(max.print=10)---------------------------------
cluster_results <- cluster_data(
  distance_df = dist, ms2_match_data = matched_data, cutoff = 0.3,
  cluster_method = "opticlust")

cluster_results

## ----community_object, R.options=list(max.print=10)---------------------------
community_w_omus <- create_community_matrix_object(cluster_results)
get_community_matrix(community_w_omus)

## ----community_object_no_omus, R.options=list(max.print=10)-------------------
community_wo_omus <- create_community_matrix_object(data = matched_data)
get_community_matrix(community_wo_omus)

## ----omu_annotations----------------------------------------------------------
annotations <- annotate_ms2(
  mass_data = matched_data, reference = reference_db,
  scoring_params = modified_cosine_params(0.5), ppm = 1000, min_score = 0.7,
  chemical_min_score = 0, cluster_data = cluster_results, number_of_threads = 2)

annotations[1:5,]

## -----------------------------------------------------------------------------
get_community_matrix(community_w_omus) |>
  rowSums() |>
  sort()

## ----alpha_summary, R.options=list(max.print=15)------------------------------
alpha_summary(
  community_object = community_w_omus, size = 40000, threshold = 200,
              diversity_index = c("simpson", "shannon", "richness"),
              subsample = TRUE, number_of_threads = 2) |>
  head()

## ----dist_shared, R.options=list(max.print=10)--------------------------------
bray_no_omus <- dist_shared(community_object = community_wo_omus, size = 40000,
                           threshold = 200, diversity_index = "bray", number_of_threads =  2)
bray_no_omus

bray_w_omus <- dist_shared(community_object = community_w_omus, size = 40000,
                          threshold = 200, diversity_index = "bray", number_of_threads =  2)
bray_w_omus

## ----pcoa, fig.width=10, fig.height=10, fig.fullwidth=TRUE--------------------
pcoa <- cmdscale(bray_w_omus, k = 2, eig = T, add = T)

variance <- round(pcoa$eig*100/sum(pcoa$eig), 1)
colnames(pcoa$points) <- c("pcoa_1", "pcoa_2")

as_tibble(pcoa$points, rownames = "sample") |>
  inner_join(get_metadata(filtered_data), by = c("sample" = "injection")) |>
  ggplot(aes(x=pcoa_1, y = pcoa_2, color = biological_group)) +
    geom_point(size = 5.5) +
    labs(
      x = paste0("PCoA 1 (", variance[1], "%)"),
      y = paste0("PCoA 2 (", variance[2], "%)"),
      color = "Sample type") +
    theme(
      legend.text = element_text(size = 10), 
      legend.title = element_text(size = 15)
    )

## ----hierarchical_clustering, fig.width=10, fig.height=10, fig.fullwidth=TRUE----
old_par <- par(no.readonly = TRUE)
hclust_result <- hclust(bray_w_omus, "average")
par(cex=0.6, mar=c(5, 8, 4, 1))
plot(hclust_result)
par(old_par)

## ----network_plot-------------------------------------------------------------
distance_df_annotations <- annotations[, c("query_ms2_id", "name", "score")]

simpleNetwork(
  distance_df_annotations, height="100px", width="100px", zoom = TRUE)

## ----group_average------------------------------------------------------------
  matched_avg <- convert_to_group_averages(
  matched_data = matched_data, mpactr_object = filtered_data)

dist_avg <- dist_ms2(
  data = matched_avg, cutoff = 0.3, precursor_thresh = 2,
  score_params = modified_cosine_params(0.5), min_peaks = 0, number_of_threads = 2)

cluster_results_avg <- cluster_data(
  distance_df = dist, ms2_match_data = matched_avg, cutoff = 0.3,
  cluster_method = "opticlust")

annotations_avg <- annotate_ms2(
  mass_data = matched_avg,
  reference = reference_db, scoring_params = modified_cosine_params(0.5), 
  ppm = 1000, min_score = 0.6, chemical_min_score = 0,
  cluster_data = cluster_results_avg, number_of_threads = 2)

community_object_avg <- create_community_matrix_object(
  data = cluster_results_avg)
head(get_community_matrix(community_object = community_object_avg), 3)

## ----samples------------------------------------------------------------------
# Normal
head(get_samples(matched_data))

# Group Average
head(get_samples(matched_avg))

# The items in the injection have been converted into the corresponding sample code.
get_metadata(filtered_data)[, c("injection", "sample_code")]

## ----combined_df--------------------------------------------------------------
# For normal data
generate_a_combined_table(
  matched_data = matched_data, annotations = annotations, 
  cluster_data = cluster_results) |> 
  head(n = 3)

# For group averaged data
generate_a_combined_table(
  matched_data = matched_avg, annotations = annotations_avg,
  cluster_data = cluster_results_avg) |>
  head(n = 3)

## ----heat_map, fig.width=10, fig.height=10, fig.fullwidth=TRUE----------------
dist_shared(community_object_avg, 40000, 200, "bray", number_of_threads = 2) |> 
  as.matrix() |>
  as_tibble(rownames = "A") |>
  pivot_longer(-A, names_to = "B", values_to = "distance") |>
  ggplot(aes(x = A, y = B, fill = distance)) +
  geom_tile() +
  scale_fill_gradient(low = "#FFFFFF", high="#FF0000") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    panel.background = element_blank()
  )

## ----box_plot, fig.width=10, fig.height=10, fig.fullwidth=TRUE----------------
metadata <- get_metadata(filtered_data)

alpha <- alpha_summary(community_object_avg, 40000, 200, "simpson", number_of_threads = 2)

inner_join(alpha, metadata, by = c("samples" = "sample_code")) |>
  ggplot(aes(x = biological_group, y = simpson)) +
  geom_boxplot(outliers = FALSE, fill = NA, color = "gray") +
  geom_jitter(width = 0.3)  + 
  scale_y_continuous(limits = c(0, NA)) +
  theme_classic()

