Here we illustrate some example use cases that involve parsing codemeta data.
library(jsonld)
library(jsonlite)
library(magrittr)
library(codemetar)
library(purrr)
library(dplyr)
library(printr)
library(tibble)We start with a simple example from the codemeta.json file of codemetar itself. First, we’ll just generate a copy of the codemeta record for the package:
write_codemeta("codemetar", "codemeta.json")

We then digest this input using a JSON-LD “frame.” While not strictly necessary, this helps ensure the data matches the format we expect, even if the original file had errors or missing data. (See the vignette “Validating in JSON-LD” in this package and the official JSON-LD docs for details.) The codemetar package includes a reasonably explicit frame to get us started:
## Path to the JSON-LD frame bundled with the codemetar package.
frame <- system.file("schema/frame_schema.json", package = "codemetar")
## Frame the freshly written record, parse it without simplification
## (nested lists are kept as lists), and keep the first node of "@graph".
meta <-
  jsonld_frame("codemeta.json", frame) %>%
  fromJSON(simplifyVector = FALSE) %>%
  getElement("@graph") %>%
  getElement(1)
Construct a citation
## Build the citation for the package described by `meta`.
##
## `bibentry()` expects `author` to be a single `person` vector. lapply()
## yields a plain list of one-element `person` objects, so the pieces are
## combined with c() before being handed over; passing the bare list makes
## bibentry() treat it as an argument that needs recycling.
authors <- do.call(
  c,
  lapply(meta$author, function(author) {
    ## Spell the property names out in full instead of relying on the
    ## partial matching that `$` performs on lists.
    person(
      given = author[["givenName"]],
      family = author[["familyName"]],
      email = author[["email"]],
      role = "aut"
    )
  })
)

## Fall back to the current year when the record has no publication date.
year <- meta$datePublished
if (is.null(year)) {
  year <- format(Sys.Date(), "%Y")
}

## NOTE(review): `meta$URL` is NULL whenever the record has no property named
## exactly "URL"; confirm which property should supply the url field.
bibitem <-
  bibentry(
    bibtype = "Manual",
    title = meta$name,
    author = authors,
    year = year,
    note = paste0("R package version ", meta$version),
    url = meta$URL,
    key = meta$identifier
  )
Warning in bibentry(bibtype = "Manual", title = meta$name, author =
authors, : Not all arguments are of the same length, the following need to
be recycled: author
cat(format(bibitem, "bibtex"))@Manual{codemetar,
title = {codemetar: Generate 'CodeMeta' Metadata for R Packages},
year = {2018},
note = {R package version 0.1.3},
}
bibitem(2018). _codemetar: Generate 'CodeMeta' Metadata for R Packages_.
R package version 0.1.3.
The ropensci corpus consists of a list of codemeta files for all packages provided by the rOpenSci project, <ropensci.org>. This provides a good test-case for how a large collection of codemeta files can be manipulated to help us get a better picture of the corpus.
## Download the rOpenSci corpus file into the working directory as
## "ropensci.json"; the later chunks read it from that path.
download.file("https://github.com/codemeta/codemetar/raw/master/inst/notebook/ropensci.json",
"ropensci.json")As before, it is helpful, though not essential, to start off by framing the input data.
## Frame the downloaded corpus with the frame file shipped in codemetar,
## parse the result with simplifyVector = FALSE so records stay nested lists,
## and keep the "@graph" element as `corpus`.
frame <- system.file("schema/frame_schema.json", package="codemetar")
corpus <-
jsonld_frame("ropensci.json", frame) %>%
fromJSON(simplifyVector = FALSE) %>%
getElement("@graph") We’re now ready to start exploring. As usual, functions from purrr prove very useful for iterating through large JSON files. First, we look at some basic summary data:
## map() tolerates records without a name (they come back as NULL), so
## extract first, drop the NULLs, then flatten to a character vector
pkgs <- corpus %>%
  map("name") %>%
  compact() %>%
  as.character()
## restrict the corpus to records that carry a package identifier
keep <- map_lgl(corpus, function(record) length(record$identifier) > 0)
corpus <- corpus[keep]
## on the filtered corpus the names can be pulled out directly
all_pkgs <- map_chr(corpus, "name")
head(all_pkgs)[1] "AntWeb: programmatic interface to the AntWeb"
[2] "aRxiv: Interface to the arXiv API"
[3] "chromer: Interface to Chromosome Counts Database API"
[4] "ckanr: Client for the Comprehensive Knowledge Archive Network ('CKAN') 'API'"
[5] "dashboard: A package status dashboard"
[6] "ggit: Git Graphics"
## Number of unique maintainers, counted by distinct family name
map_chr(corpus, c("maintainer", "familyName")) %>% unique() %>% length()[1] 61
## Packages per maintainer family name, largest count first (mostly Scott).
## The one-column tibble is built explicitly because calling as_tibble() on
## a bare vector is discouraged by tibble (>= 2.0.0).
map_chr(corpus, c("maintainer", "familyName")) %>%
  tibble(value = .) %>%
  count(value, sort = TRUE)
| value | n |
|---|---|
| Chamberlain | 105 |
| Ooms | 12 |
| Mullen | 8 |
| Ram | 8 |
| Boettiger | 6 |
| Salmon | 5 |
| FitzJohn | 4 |
| Hart | 2 |
| Leeper | 2 |
| Marwick | 2 |
| Müller | 2 |
| Padgham | 2 |
| South | 2 |
| Varela | 2 |
| Vitolo | 2 |
| Arnold | 1 |
| Attali | 1 |
| Banbury | 1 |
| Becker | 1 |
| Bengtsson | 1 |
| Braginsky | 1 |
| Broman | 1 |
| Bryan | 1 |
| Dallas | 1 |
| Drost | 1 |
| Fischetti | 1 |
| Ghahraman | 1 |
| Goring | 1 |
| Harrison | 1 |
| Hughes | 1 |
| Jahn | 1 |
| Jones | 1 |
| Keyes | 1 |
| Krah | 1 |
| Lehtomaki | 1 |
| Lovelace | 1 |
| Lundstrom | 1 |
| McGlinn | 1 |
| McVey | 1 |
| Meissner | 1 |
| Michonneau | 1 |
| Moroz | 1 |
| Otegui | 1 |
| Pardo | 1 |
| Pennell | 1 |
| Poelen | 1 |
| Robinson | 1 |
| Ross | 1 |
| Rowlingson | 1 |
| Scott | 1 |
| Seers | 1 |
| Shotwell | 1 |
| Sievert | 1 |
| Sparks | 1 |
| Stachelek | 1 |
| Szöcs | 1 |
| Widgren | 1 |
| Wiggin | 1 |
| Winter | 1 |
| de Queiroz | 1 |
| hackathoners | 1 |
## Number of packages (n) having a given number of authors (value).
## The one-column tibble is built explicitly because calling as_tibble() on
## a bare vector is discouraged by tibble (>= 2.0.0).
map_int(corpus, function(r) length(r$author)) %>%
  tibble(value = .) %>%
  count(value, sort = TRUE)
| value | n |
|---|---|
| 1 | 146 |
| 2 | 30 |
| 3 | 17 |
| 4 | 8 |
| 5 | 5 |
| 7 | 3 |
| 13 | 1 |
## Contributors isn't used as much: number of packages (n) having a given
## number of contributor entries (value).
## The one-column tibble is built explicitly because calling as_tibble() on
## a bare vector is discouraged by tibble (>= 2.0.0).
map_int(corpus, function(r) length(r$contributor)) %>%
  tibble(value = .) %>%
  count(value, sort = TRUE)
| value | n |
|---|---|
| 0 | 178 |
| 2 | 13 |
| 4 | 9 |
| 3 | 7 |
| 5 | 1 |
| 6 | 1 |
| 8 | 1 |
Numbers (n) of packages with a total of (value) dependencies:
## Number of packages (n) having a given number of softwareRequirements
## entries (value).
## The one-column tibble is built explicitly because calling as_tibble() on
## a bare vector is discouraged by tibble (>= 2.0.0).
map_int(corpus, function(r) length(r$softwareRequirements)) %>%
  tibble(value = .) %>%
  count(value, sort = TRUE)
| value | n |
|---|---|
| 4 | 39 |
| 5 | 35 |
| 2 | 25 |
| 3 | 25 |
| 7 | 19 |
| 6 | 16 |
| 8 | 13 |
| 9 | 8 |
| 12 | 7 |
| 10 | 6 |
| 11 | 6 |
| 13 | 3 |
| 0 | 2 |
| 14 | 1 |
| 17 | 1 |
| 18 | 1 |
| 21 | 1 |
| 22 | 1 |
| 23 | 1 |
Which dependencies are used most frequently?
## Build one row per (package identifier, dependency name) pair.
##
## `softwareRequirements` comes in three shapes:
##   * absent or empty                 -> a single NA row for the package
##   * one unboxed requirement (a list with a "name" element)
##   * a list of requirements, each with a "name" element
dep_df <- map_df(corpus, function(x) {
  reqs <- x$softwareRequirements
  if (length(reqs) == 0) {
    ## No requirements
    dep <- NA_character_
  } else if ("name" %in% names(reqs)) {
    ## single, unboxed dep: take the requirement's own name, not the name of
    ## the package that declares it
    dep <- reqs[["name"]]
  } else if ("name" %in% names(reqs[[1]])) {
    dep <- map_chr(reqs, "name")
  } else {
    ## requirements present, but in a shape that carries no name
    dep <- NA_character_
  }
  tibble(identifier = x$identifier, dep = dep)
})

## How many packages list each dependency, most used first.
dep_df %>%
  count(dep, sort = TRUE)
| dep | n |
|---|---|
| jsonlite | 99 |
| httr | 92 |
| R | 66 |
| tibble | 46 |
| dplyr | 43 |
| methods | 37 |
| xml2 | 37 |
| data.table | 35 |
| utils | 35 |
| crul | 31 |
| plyr | 29 |
| XML | 25 |
| magrittr | 24 |
| sp | 22 |
| stringr | 21 |
| curl | 18 |
| ggplot2 | 18 |
| lazyeval | 17 |
| stats | 17 |
| R6 | 14 |
| lubridate | 14 |
| rappdirs | 13 |
| RCurl | 12 |
| assertthat | 12 |
| digest | 12 |
| readr | 11 |
| rgdal | 10 |
| whisker | 10 |
| scales | 9 |
| ape | 8 |
| raster | 8 |
| tidyr | 8 |
| Rcpp | 7 |
| reshape2 | 7 |
| rvest | 7 |
| V8 | 6 |
| rgeos | 6 |
| hoardr | 5 |
| rjson | 5 |
| taxize | 5 |
| tools | 5 |
| R(>=3.2.1) | 4 |
| git2r | 4 |
| maps | 4 |
| oai | 4 |
| openssl | 4 |
| solrium | 4 |
| urltools | 4 |
| RColorBrewer | 3 |
| foreach | 3 |
| knitr | 3 |
| leaflet | 3 |
| maptools | 3 |
| memoise | 3 |
| mime | 3 |
| pdftools | 3 |
| purrr | 3 |
| rgbif | 3 |
| rmarkdown | 3 |
| shiny | 3 |
| spocc | 3 |
| stringi | 3 |
| uuid | 3 |
| wicket | 3 |
| yaml | 3 |
| Biostrings | 2 |
| MASS | 2 |
| R.cache | 2 |
| R.utils | 2 |
| base64enc | 2 |
| bibtex | 2 |
| crayon | 2 |
| devtools | 2 |
| downloader | 2 |
| fauxpas | 2 |
| gdata | 2 |
| gistr | 2 |
| graphics | 2 |
| grid | 2 |
| htmltools | 2 |
| htmlwidgets | 2 |
| httpcode | 2 |
| igraph | 2 |
| jqr | 2 |
| miniUI | 2 |
| ncdf4 | 2 |
| png | 2 |
| rcrossref | 2 |
| rentrez | 2 |
| reshape | 2 |
| rmapshaper | 2 |
| rplos | 2 |
| rvertnet | 2 |
| shinyjs | 2 |
| storr | 2 |
| tm | 2 |
| NA | 2 |
| Biobase | 1 |
| BiocGenerics | 1 |
| DBI | 1 |
| DT(>=0.1) | 1 |
| EML | 1 |
| GenomeInfoDb | 1 |
| GenomicFeatures | 1 |
| GenomicRanges(>=1.23.24) | 1 |
| Hmisc | 1 |
| IRanges | 1 |
| Matrix | 1 |
| RApiSerialize | 1 |
| RJSONIO | 1 |
| RMySQL | 1 |
| RPostgreSQL | 1 |
| RSQLite | 1 |
| Rmpfr | 1 |
| S4Vectors | 1 |
| SSOAP | 1 |
| SnowballC | 1 |
| USAboundariesData: Datasets for the ‘USAboundaries’ package | 1 |
| VariantAnnotation | 1 |
| WikidataR | 1 |
| aRxiv | 1 |
| analogue | 1 |
| antiword: Extract Text from Microsoft Word Documents | 1 |
| apipkgen: Package Generator for HTTP API Wrapper Packages | 1 |
| appl: Approximate POMDP Planning Software | 1 |
| binman | 1 |
| biomaRt | 1 |
| bold | 1 |
| caTools | 1 |
| ckanr | 1 |
| cld2: Google’s Compact Language Detector 2 | 1 |
| countrycode | 1 |
| cranlogs | 1 |
| crminer | 1 |
| crosstalk | 1 |
| dirdf: Extracts Metadata from Directory and File Names | 1 |
| doParallel | 1 |
| elastic | 1 |
| fastmatch | 1 |
| foreign | 1 |
| functionMap | 1 |
| genderdata: Historical Datasets for Predicting Gender from Names | 1 |
| geoaxe | 1 |
| geojson | 1 |
| geojsonrewind: Fix ‘GeoJSON’ Winding Direction | 1 |
| geonames | 1 |
| geoops: ‘GeoJSON’ Manipulation Operations | 1 |
| geosphere | 1 |
| getPass | 1 |
| ggm | 1 |
| ggmap | 1 |
| ggthemes | 1 |
| grDevices | 1 |
| graphql | 1 |
| gridExtra | 1 |
| gtools | 1 |
| hash | 1 |
| hexbin | 1 |
| historydata: Data Sets for Historians | 1 |
| httpuv | 1 |
| isdparser | 1 |
| jsonvalidate | 1 |
| jsonvalidate: Validate ‘JSON’ | 1 |
| leafletR | 1 |
| loggr | 1 |
| mapproj | 1 |
| markdown | 1 |
| memisc | 1 |
| miniUI(>=0.1.1) | 1 |
| nabor | 1 |
| natserv | 1 |
| openxlsx | 1 |
| osmar | 1 |
| outliers | 1 |
| pdftools: Text Extraction and Rendering of PDF Documents | 1 |
| phytools | 1 |
| plotly | 1 |
| plumber | 1 |
| progress | 1 |
| protolite | 1 |
| qlcMatrix | 1 |
| rJava | 1 |
| rapport | 1 |
| rbhl | 1 |
| rbison | 1 |
| rebird | 1 |
| redland | 1 |
| redux | 1 |
| remotes | 1 |
| ridigbio | 1 |
| ritis | 1 |
| rlist | 1 |
| rncl | 1 |
| rnoaa | 1 |
| rnrfa | 1 |
| rotl | 1 |
| rowr | 1 |
| rredis | 1 |
| rredlist | 1 |
| rstudioapi(>=0.5) | 1 |
| rtracklayer | 1 |
| rworldmap | 1 |
| rzmq: R Bindings for ZeroMQ | 1 |
| scrapeR | 1 |
| selectr | 1 |
| sf | 1 |
| shiny(>=0.13.2) | 1 |
| snow | 1 |
| spatstat | 1 |
| stringdist | 1 |
| sys | 1 |
| tabulizerjars | 1 |
| testthat | 1 |
| tif: Text Interchange Format | 1 |
| viridisLite | 1 |
| wdman(>=0.2.2) | 1 |
| wellknown | 1 |
| wicket: Utilities to Handle WKT Spatial Data | 1 |
| wikitaxa | 1 |
| withr | 1 |
| worrms | 1 |
| xslt: XSLT 1.0 Transformations | 1 |
| zoo | 1 |
An alternate approach uses a frame instead of purrr functions to subset the data. Note that this gets all Depends and Suggests (really, every SoftwareApplication type mentioned).
## Inline JSON-LD frame: uses the codemeta context and lists only the `name`
## property.
## NOTE(review): "@explicit" is given as the string "true" rather than a JSON
## boolean -- confirm the framing processor treats it as intended.
dep_frame <- '{
"@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
"@explicit": "true",
"name": {}
}'
## Frame the corpus with it, parse with jsonlite's default simplification
## (the dplyr verbs below operate on the "@graph" element), keep the rows
## typed SoftwareApplication, and count how often each name occurs.
jsonld_frame("ropensci.json", dep_frame) %>%
fromJSON() %>%
getElement("@graph") %>%
filter(type == "SoftwareApplication") %>%
group_by(name) %>%
tally(sort = TRUE)| name | n |
|---|---|
| testthat | 168 |
| knitr | 122 |
| jsonlite | 105 |
| httr | 96 |
| roxygen2 | 92 |
| R | 72 |
| rmarkdown | 68 |
| covr | 52 |
| dplyr | 49 |
| tibble | 48 |
| xml2 | 41 |
| methods | 38 |
| utils | 37 |
| data.table | 36 |
| ggplot2 | 36 |
| crul | 33 |
| plyr | 32 |
| magrittr | 28 |
| sp | 26 |
| XML | 25 |
| curl | 21 |
| stringr | 21 |
| lazyeval | 18 |
| stats | 18 |
| lubridate | 16 |
| R6 | 14 |
| readr | 14 |
| rgdal | 14 |
| rappdirs | 13 |
| RCurl | 12 |
| assertthat | 12 |
| devtools | 12 |
| digest | 12 |
| raster | 12 |
| scales | 12 |
| Rcpp | 11 |
| whisker | 11 |
| leaflet | 10 |
| rgeos | 10 |
| taxize | 10 |
| tidyr | 10 |
| reshape2 | 9 |
| V8 | 8 |
| ape | 8 |
| maps | 8 |
| maptools | 7 |
| purrr | 7 |
| rvest | 7 |
| pdftools | 6 |
| rgbif | 6 |
| shiny | 6 |
| ggmap | 5 |
| git2r | 5 |
| hoardr | 5 |
| ncdf4 | 5 |
| png | 5 |
| rjson | 5 |
| tools | 5 |
| R(>=3.2.1) | 4 |
| RSQLite | 4 |
| oai | 4 |
| openssl | 4 |
| rcrossref | 4 |
| sf | 4 |
| solrium | 4 |
| urltools | 4 |
| uuid | 4 |
| yaml | 4 |
| DBI | 3 |
| MASS | 3 |
| R.utils | 3 |
| RColorBrewer | 3 |
| fauxpas | 3 |
| foreach | 3 |
| gdata | 3 |
| gistr | 3 |
| graphics | 3 |
| lintr | 3 |
| memoise | 3 |
| mime | 3 |
| miniUI | 3 |
| rentrez | 3 |
| rmapshaper | 3 |
| rvertnet | 3 |
| rworldmap | 3 |
| spocc | 3 |
| stringi | 3 |
| wicket | 3 |
| Biostrings | 2 |
| GGally | 2 |
| Matrix | 2 |
| R.cache | 2 |
| RcppRedis | 2 |
| base64enc | 2 |
| bibtex | 2 |
| broom | 2 |
| crayon | 2 |
| downloader | 2 |
| elastic | 2 |
| geiger | 2 |
| getPass | 2 |
| ggthemes | 2 |
| grDevices | 2 |
| grid | 2 |
| gridExtra | 2 |
| htmltools | 2 |
| htmlwidgets | 2 |
| httpcode | 2 |
| igraph | 2 |
| jqr | 2 |
| jsonvalidate | 2 |
| listviewer | 2 |
| mapproj | 2 |
| phylobase | 2 |
| phytools | 2 |
| readxl | 2 |
| remotes | 2 |
| reshape | 2 |
| rplos | 2 |
| shinyjs | 2 |
| storr | 2 |
| sys | 2 |
| tm | 2 |
| viridis | 2 |
| webp | 2 |
| zoo | 2 |
| Biobase | 1 |
| BiocGenerics | 1 |
| Cairo | 1 |
| DT(>=0.1) | 1 |
| EML | 1 |
| GSODR | 1 |
| GenomeInfoDb | 1 |
| GenomicFeatures | 1 |
| GenomicRanges(>=1.23.24) | 1 |
| Hmisc | 1 |
| IRanges | 1 |
| IRdisplay | 1 |
| MCMCglmm | 1 |
| RApiSerialize | 1 |
| RJSONIO | 1 |
| RMySQL | 1 |
| RNeXML | 1 |
| RPostgreSQL | 1 |
| RSclient | 1 |
| RSelenium | 1 |
| RUnit | 1 |
| Rcompression | 1 |
| RedisAPI | 1 |
| Rmpfr | 1 |
| Rserve | 1 |
| S4Vectors | 1 |
| SSOAP | 1 |
| SnowballC | 1 |
| Suggests:testthat | 1 |
| Sxslt | 1 |
| USAboundaries | 1 |
| USAboundariesData | 1 |
| VariantAnnotation | 1 |
| WikidataR | 1 |
| XMLSchema | 1 |
| aRxiv | 1 |
| akima | 1 |
| analogue | 1 |
| binman | 1 |
| biomaRt | 1 |
| bold | 1 |
| caTools | 1 |
| ckanr | 1 |
| corrplot | 1 |
| countrycode | 1 |
| cranlogs | 1 |
| crminer | 1 |
| crosstalk | 1 |
| dendextend | 1 |
| doParallel | 1 |
| dplyr(>=0.3.0.2) | 1 |
| etseed | 1 |
| fastmatch | 1 |
| fields | 1 |
| forecast | 1 |
| foreign | 1 |
| fulltext | 1 |
| functionMap | 1 |
| genderdata | 1 |
| geoaxe | 1 |
| geojson | 1 |
| geojsonio | 1 |
| geojsonlint | 1 |
| geonames | 1 |
| geosphere | 1 |
| ggalt | 1 |
| ggm | 1 |
| graphql | 1 |
| gtools | 1 |
| hash | 1 |
| hexbin | 1 |
| historydata | 1 |
| httpuv | 1 |
| isdparser | 1 |
| janeaustenr | 1 |
| jpeg | 1 |
| knitcitations | 1 |
| leafletR | 1 |
| loggr | 1 |
| magick | 1 |
| mapdata | 1 |
| markdown | 1 |
| memisc | 1 |
| miniUI(>=0.1.1) | 1 |
| mongolite | 1 |
| nabor | 1 |
| natserv | 1 |
| openair | 1 |
| openxlsx | 1 |
| osmar | 1 |
| outliers | 1 |
| pander | 1 |
| parallel | 1 |
| plot3D | 1 |
| plotKML | 1 |
| plotly | 1 |
| plumber | 1 |
| progress | 1 |
| protolite | 1 |
| purrrlyr | 1 |
| qlcMatrix | 1 |
| rJava | 1 |
| rapport | 1 |
| rbhl | 1 |
| rbison | 1 |
| rcdk | 1 |
| readtext | 1 |
| rebird | 1 |
| redland | 1 |
| redux | 1 |
| reeack | 1 |
| rfigshare | 1 |
| ridigbio | 1 |
| rinat | 1 |
| ritis | 1 |
| rlist | 1 |
| rnaturalearthdata | 1 |
| rnaturalearthhires | 1 |
| rncl | 1 |
| rnoaa | 1 |
| rnrfa | 1 |
| ropenaq | 1 |
| rotl | 1 |
| rowr | 1 |
| rrdf | 1 |
| rredis | 1 |
| rredlist | 1 |
| rrlite | 1 |
| rstudioapi(>=0.5) | 1 |
| rsvg | 1 |
| rtracklayer | 1 |
| sangerseqR | 1 |
| scrapeR | 1 |
| selectr | 1 |
| seqinr | 1 |
| shiny(>=0.13.2) | 1 |
| snow | 1 |
| sofa | 1 |
| spacetime | 1 |
| spatstat | 1 |
| stringdist | 1 |
| tabulizerjars | 1 |
| testthat(>=0.7) | 1 |
| tidytext | 1 |
| tidyverse | 1 |
| tiff | 1 |
| tmap | 1 |
| vegan | 1 |
| viridisLite | 1 |
| wdman(>=0.2.2) | 1 |
| weathermetrics | 1 |
| webmockr | 1 |
| webshot | 1 |
| wellknown | 1 |
| wikitaxa | 1 |
| withr | 1 |
| wordcloud2 | 1 |
| worrms | 1 |
| xtable | 1 |
| xts | 1 |
# summarise(count(name))