In this vignette we will see how to use the
mockDrugUtilisation() function to create mock data. This
function is predominantly used in this package’s unit testing.
For example, one could use the default parameters to create a mock cdm reference like so:
cdm <- mockDrugUtilisation()
This will then populate several omop tables (for example,
person, concept and
visit_occurrence) and two cohorts in the cdm reference.
cdm$person |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ gender_concept_id <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth <int> 2008, 2000, 1970, 2003, 1956, 1986, 1986, 1983, 1…
#> $ day_of_birth <int> 5, 21, 26, 11, 20, 20, 13, 9, 11, 1
#> $ birth_datetime <date> 2008-12-05, 2000-11-21, 1970-11-26, 2003-02-11, 1…
#> $ race_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ location_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ provider_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ care_site_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
#> $ month_of_birth <int> 12, 11, 11, 2, 4, 1, 2, 12, 3, 5
cdm$person |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
cdm$concept |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 10
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ concept_id <dbl> 8505, 8507, 8532, 8576, 8587, 8718, 9202, 9551, 9655,…
#> $ concept_name <chr> "hour", "MALE", "FEMALE", "milligram", "milliliter", …
#> $ domain_id <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ vocabulary_id <chr> "UCUM", "Gender", "Gender", "UCUM", "UCUM", "UCUM", "…
#> $ concept_class_id <chr> "Unit", "Gender", "Gender", "Unit", "Unit", "Unit", "…
#> $ standard_concept <chr> "S", "S", "S", "S", "S", "S", "S", "S", "S", NA, "S",…
#> $ concept_code <chr> "h", "M", "F", "mg", "mL", "[iU]", "OP", "10*-3.eq", …
#> $ valid_start_date <chr> "01/01/1970", "01/01/1970", "01/01/1970", "01/01/1970…
#> $ valid_end_date <chr> "31/12/2099", "31/12/2099", "31/12/2099", "31/12/2099…
#> $ invalid_reason <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
cdm$concept |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 38
cdm$visit_occurrence |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id <int> 2, 2, 3, 3, 3, 3, 4, 6, 7, 8, 10, 1, 1, 2, 2, 3,…
#> $ visit_concept_id <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date <date> 2022-06-10, 2022-05-25, 1997-05-27, 1984-10-29,…
#> $ visit_end_date <date> 2022-06-11, 2022-05-28, 2000-04-19, 2001-03-20,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
cdm$visit_occurrence |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 47
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 3, 2, 1, 3, 3, 1, 3, 2, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2020-09-20, 2022-05-21, 2010-02-10, 2022-01-26, 2…
#> $ cohort_end_date <date> 2021-03-18, 2022-06-05, 2010-07-21, 2022-04-28, 2…
cdm$cohort1 |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
cdm$cohort2 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 1, 1, 2, 1, 2, 3, 1, 3, 1, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2021-02-10, 2022-06-05, 1991-01-13, 2021-02-09, 2…
#> $ cohort_end_date <date> 2021-02-12, 2022-06-07, 2009-08-28, 2021-07-19, 2…
cdm$cohort2 |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 10
The user can also set the seed to control the randomness within the data.
We now observe that cohort1 has been changed as a result
of this seed:
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <int> 2, 1, 2, 1, 1, 3, 1, 3, 2, 1
#> $ subject_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#> $ cohort_start_date <date> 2018-06-14, 2019-04-10, 2020-01-28, 2010-07-09, 2…
#> $ cohort_end_date <date> 2018-08-10, 2019-11-19, 2020-02-02, 2015-04-24, 2…
The user can then create mock data in two ways: one is to set the
numberIndividual parameter and the other is to customise
the tables.
An example of setting numberIndividual is as follows:
cdm <- mockDrugUtilisation(numberIndividual = 100)
This will ensure that each of person,
observation_period, cohort1 and
cohort2 will have 100 rows.
cdm$person |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 11
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ person_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
#> $ gender_concept_id <dbl> 8507, 8532, 8507, 8507, 8532, 8507, 8507, 8507, 8…
#> $ year_of_birth <int> 1977, 1997, 1982, 1994, 1970, 1980, 1966, 1997, 2…
#> $ day_of_birth <int> 26, 22, 23, 22, 1, 13, 27, 10, 15, 21, 2, 12, 4, …
#> $ birth_datetime <date> 1977-04-26, 1997-12-22, 1982-04-23, 1994-08-22, …
#> $ race_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ ethnicity_concept_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ location_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ provider_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ care_site_id <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ month_of_birth <int> 4, 12, 4, 8, 5, 3, 1, 2, 1, 10, 3, 5, 2, 12, 3, 4…
cdm$person |>
dplyr::tally()
#> # Source: SQL [1 x 1]
#> # Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> n
#> <dbl>
#> 1 100
As a consequence of this, other tables such as
visit_occurrence, condition_occurrence and
drug_strength will have more rows compared to the mock data
produced using default settings.
cdm$visit_occurrence |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 6
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ visit_occurrence_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
#> $ person_id <int> 1, 1, 1, 1, 2, 2, 2, 4, 4, 5, 5, 5, 6, 7, 7, 7, …
#> $ visit_concept_id <dbl> 9202, 9202, 9202, 9202, 9202, 9202, 9202, 9202, …
#> $ visit_start_date <date> 1989-11-05, 1988-01-17, 1986-10-19, 1996-12-22,…
#> $ visit_end_date <date> 2000-03-28, 1993-12-19, 1994-11-13, 1998-03-13,…
#> $ visit_type_concept_id <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
As we saw previously, the omop tables are automatically populated in
mockDrugUtilisation(). However, the user can customise
these tables. For example, to customise the drug_exposure
table, one could do the following:
cdm <- mockDrugUtilisation(
drug_exposure = dplyr::tibble(
drug_exposure_id = 1:3,
person_id = c(1, 1, 1),
drug_concept_id = c(2, 3, 4),
drug_exposure_start_date = as.Date(c(
"2000-01-01", "2000-01-10", "2000-02-20"
)),
drug_exposure_end_date = as.Date(c(
"2000-02-10", "2000-03-01", "2000-02-20"
)),
quantity = c(41, 52, 1),
drug_type_concept_id = 0
)
)
cdm$drug_exposure |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 7
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ drug_exposure_id <int> 1, 2, 3
#> $ person_id <dbl> 1, 1, 1
#> $ drug_concept_id <dbl> 2, 3, 4
#> $ drug_exposure_start_date <date> 2000-01-01, 2000-01-10, 2000-02-20
#> $ drug_exposure_end_date <date> 2000-02-10, 2000-03-01, 2000-02-20
#> $ quantity <dbl> 41, 52, 1
#> $ drug_type_concept_id <dbl> 0, 0, 0
However, one needs to be aware that a customised omop table implicitly depends on other omop tables.
One could also modify other omop tables including
person, concept,
concept_ancestor, drug_strength,
observation_period, condition_occurrence,
observation, and concept_relationship.
In a similar fashion, cohort tables can also be customised.
cdm <- mockDrugUtilisation(
observation_period = dplyr::tibble(
observation_period_id = 1,
person_id = 1:2,
observation_period_start_date = as.Date("1900-01-01"),
observation_period_end_date = as.Date("2100-01-01"),
period_type_concept_id = 0
),
cohort1 = dplyr::tibble(
cohort_definition_id = 1,
subject_id = c(1, 1, 2),
cohort_start_date = as.Date(c("2000-01-01", "2001-01-01", "2000-01-01")),
cohort_end_date = as.Date(c("2000-03-01", "2001-03-01", "2000-03-01"))
)
)
cdm$cohort1 |>
dplyr::glimpse()
#> Rows: ??
#> Columns: 4
#> Database: DuckDB v1.0.0 [root@Darwin 23.4.0:R 4.4.1/:memory:]
#> $ cohort_definition_id <dbl> 1, 1, 1
#> $ subject_id <dbl> 1, 1, 2
#> $ cohort_start_date <date> 2000-01-01, 2001-01-01, 2000-01-01
#> $ cohort_end_date <date> 2000-03-01, 2001-03-01, 2000-03-01