## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(ecocomDP)

## ----eval=FALSE---------------------------------------------------------------
# # -----------------------------------------------------------------------------
# # This function converts source dataset "knb-lter-hfr.118" (archived in the EDI
# # Data Repository) to ecocomDP dataset "edi.193" (also archived in EDI)
# #
# # Arguments:
# #
# # path        Where the ecocomDP tables will be written
# # source_id   Identifier of the source dataset
# # derived_id  Identifier of the derived dataset
# # url         The URL by which the derived tables and metadata can be accessed
# #             by a data repository. This argument is used when automating the
# #             repository publication step, but not used when manually
# #             publishing.
# #
# # Value:
# #
# # tables      (.csv) ecocomDP tables
# # metadata    (.xml) EML metadata for tables
# #
# # Details:
# #             This function facilitates automated updates to the derived
# #             "edi.193" whenever new data are added to the source
# #             "knb-lter-hfr.118". The framework executing this maintenance
# #             routine is hosted on a remote server and jumps into action
# #             whenever an update notification is received for
# #             "knb-lter-hfr.118". The maintenance routine parses the
# #             notification to get the arguments to create_ecocomDP().
# #
# # Landing page to source dataset "knb-lter-hfr.118":
# # https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-hfr&identifier=118
# # Landing page to derived dataset "edi.193":
# # https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=193
# # -----------------------------------------------------------------------------
#
# # Libraries used by this function
#
# library(ecocomDP)
# library(xml2)
# library(magrittr)
# library(data.table)
# library(lubridate)
# library(tidyr)
# library(dplyr)
# library(EDIutils)       # remotes::install_github("EDIorg/EDIutils")
# library(taxonomyCleanr) # remotes::install_github("EDIorg/taxonomyCleanr")
#
# create_ecocomDP <- function(path,
#                             source_id,
#                             derived_id,
#                             url = NULL) {
#
#   # Read source dataset -------------------------------------------------------
#
#   # The source dataset is about ant communities and their functional traits
#   # changing in response to an invasive species. Observations are made across
#   # habitat types within the Harvard Experimental Forest. The dataset consists
#   # of a primary table listing abundances at sites through time, and an
#   # ancillary table listing physical and functional traits of observed species.
#
#   # Read the source dataset from EDI
#
#   eml <- EDIutils::api_read_metadata(source_id)
#   data <- EDIutils::read_tables(
#     eml = eml,
#     strip.white = TRUE,
#     na.strings = "",
#     convert.missing.value = TRUE,
#     add.units = TRUE)
#
#   ants <- data$`hf118-01-ants.csv`
#   traits <- data$`hf118-02-functional-traits.csv`
#   ants$date <- ymd(ants$date)
#
#   # Join and flatten the source dataset ---------------------------------------
#
#   # Joining all source data and relevant metadata into one big flat table
#   # simplifies parsing into ecocomDP tables and facilitates referential
#   # integrity in the process.
#
#   # Remove duplicate data from the ancillary table and join on species "code"
#
#   traits <- traits %>% select(-genus, -species)
#   traits <- traits %>% rename(code = species.code)
#   wide <- left_join(ants, traits, by = "code")
#
#   # Convert wide format to "flat" format. This is the wide form but gathered on
#   # core observation variables, which are often > 1 in source datasets. This
#   # "flat" table is the "widest" ecocomDP datasets can be consistently returned
#   # to by the ecocomDP::flatten_data() function, and is the input format
#   # required by the "create table" helpers we'll meet shortly. This dataset
#   # only has one core observation variable, "abundance", so gathering really
#   # only entails a change of column names.
#
#   wide <- wide %>% rename(value_abundance = abundance)
#   flat <- pivot_longer(
#     wide,
#     cols = matches("abundance"),
#     names_to = c(".value", "variable_name"),
#     names_sep = '\\_')
#
#   # We're now in a good place to begin adding columns of the ecocomDP tables
#   # we can create from this source dataset. We'll begin with the observation
#   # table.
#
#   # Add columns for the observation table -------------------------------------
#
#   # The frequency and timing of surveys (events) varied throughout the history
#   # of this dataset and are uniquely identifiable by grouping sample dates by
#   # year and month.
#
#   flat$event_id <- flat %>% group_by(month = floor_date(flat$date, "month"),
#                                      year) %>% group_indices()
#   flat <- flat %>% arrange(event_id)
#
#   # Observations are made in plots, which are nested in blocks. Unique
#   # combinations of these form a location.
#
#   flat$location_id <- flat %>% group_by(plot, block) %>% group_indices()
#
#   # Each row of the flattened source dataset represents an observation of taxa
#   # abundance and should have a unique ID for reference
#
#   flat$observation_id <- seq(nrow(flat))
#
#   # Add columns for the location table ----------------------------------------
#
#   # Ideally, the source dataset would include latitude, longitude, and
#   # elevation for each location_id, but all we have are coordinates for the
#   # area encompassing all sampling locations. The best we can do here is use
#   # the middle of the bounding box and mean of the bounding elevations.
#
#   geocov <- xml_find_all(eml, ".//geographicCoverage")
#   north <- xml_double(xml_find_all(geocov, './/northBoundingCoordinate'))
#   east <- xml_double(xml_find_all(geocov, './/eastBoundingCoordinate'))
#   south <- xml_double(xml_find_all(geocov, './/southBoundingCoordinate'))
#   west <- xml_double(xml_find_all(geocov, './/westBoundingCoordinate'))
#   elev_max <- xml_double(xml_find_all(geocov, './/altitudeMaximum'))
#   elev_min <- xml_double(xml_find_all(geocov, './/altitudeMinimum'))
#
#   flat$latitude <- mean(c(north, south))
#   flat$longitude <- mean(c(east, west))
#   flat$elevation <- mean(c(elev_max, elev_min))
#
#   # Add columns for the taxon table -------------------------------------------
#
#   # Taxonomic entities of this dataset are comprised of unique genus and
#   # species pairs
#
#   flat <- flat %>%
#     mutate(taxon_name = trimws(paste(genus, species))) %>%
#     select(-genus, -species)
#
#   flat$taxon_id <- flat %>% group_by(taxon_name) %>% group_indices()
#
#   # While not required, resolving taxonomic entities to an authority system
#   # improves the discoverability and interoperability of the ecocomDP dataset.
#   # We can resolve taxa by sending names through taxonomyCleanr for direct
#   # matches against the Integrated Taxonomic Information System
#   # (ITIS; https://www.itis.gov/).
#
#   taxa_resolved <- taxonomyCleanr::resolve_sci_taxa(
#     x = unique(flat$taxon_name),
#     data.sources = 3)
#
#   taxa_resolved <- taxa_resolved %>%
#     select(taxa, rank, authority, authority_id) %>%
#     rename(taxon_rank = rank,
#            taxon_name = taxa,
#            authority_system = authority,
#            authority_taxon_id = authority_id)
#
#   flat <- left_join(flat, taxa_resolved, by = "taxon_name")
#
#   # Add columns for the dataset_summary table ---------------------------------
#
#   dates <- flat$date %>% stats::na.omit() %>% sort()
#
#   # Use the calc_*() helper functions for consistency
#
#   flat$package_id <- derived_id
#   flat$original_package_id <- source_id
#   flat$length_of_survey_years <- ecocomDP::calc_length_of_survey_years(dates)
#   flat$number_of_years_sampled <- ecocomDP::calc_number_of_years_sampled(dates)
#   flat$std_dev_interval_betw_years <-
#     ecocomDP::calc_std_dev_interval_betw_years(dates)
#   flat$max_num_taxa <- length(unique(flat$taxon_name))
#   flat$geo_extent_bounding_box_m2 <-
#     ecocomDP::calc_geo_extent_bounding_box_m2(west, east, north, south)
#
#   # Odds and ends -------------------------------------------------------------
#
#   # Rename source columns with an ecocomDP equivalent (date to datetime)
#
#   flat <- flat %>% rename(datetime = date)
#
#   # The hard work is done! The flat table contains all the source data and
#   # more! We can now use the "create" functions to parse this table into the
#   # ecocomDP tables.
#
#   # Parse flat into ecocomDP tables -------------------------------------------
#
#   # Each ecocomDP table has an associated "create" function. Begin with the
#   # core required tables.
#
#   observation <- ecocomDP::create_observation(
#     L0_flat = flat,
#     observation_id = "observation_id",
#     event_id = "event_id",
#     package_id = "package_id",
#     location_id = "location_id",
#     datetime = "datetime",
#     taxon_id = "taxon_id",
#     variable_name = "variable_name",
#     value = "value",
#     unit = "unit")
#
#   location <- ecocomDP::create_location(
#     L0_flat = flat,
#     location_id = "location_id",
#     location_name = c("block", "plot"),
#     latitude = "latitude",
#     longitude = "longitude",
#     elevation = "elevation")
#
#   taxon <- ecocomDP::create_taxon(
#     L0_flat = flat,
#     taxon_id = "taxon_id",
#     taxon_rank = "taxon_rank",
#     taxon_name = "taxon_name",
#     authority_system = "authority_system",
#     authority_taxon_id = "authority_taxon_id")
#
#   dataset_summary <- ecocomDP::create_dataset_summary(
#     L0_flat = flat,
#     package_id = "package_id",
#     original_package_id = "original_package_id",
#     length_of_survey_years = "length_of_survey_years",
#     number_of_years_sampled = "number_of_years_sampled",
#     std_dev_interval_betw_years = "std_dev_interval_betw_years",
#     max_num_taxa = "max_num_taxa",
#     geo_extent_bounding_box_m2 = "geo_extent_bounding_box_m2")
#
#   # Create the ancillary ecocomDP tables. These are optional, but should be
#   # included if possible.
#
#   observation_ancillary <- ecocomDP::create_observation_ancillary(
#     L0_flat = flat,
#     observation_id = "observation_id",
#     variable_name = c("trap.type", "trap.num", "moose.cage"))
#
#   location_ancillary <- ecocomDP::create_location_ancillary(
#     L0_flat = flat,
#     location_id = "location_id",
#     variable_name = "treatment")
#
#   taxon_ancillary <- ecocomDP::create_taxon_ancillary(
#     L0_flat = flat,
#     taxon_id = "taxon_id",
#     variable_name = c(
#       "subfamily", "hl", "rel", "rll", "colony.size",
#       "feeding.preference", "nest.substrate", "primary.habitat",
#       "secondary.habitat", "seed.disperser", "slavemaker.sp",
#       "behavior", "biogeographic.affinity", "source"),
#     unit = c("unit_hl", "unit_rel", "unit_rll"))
#
#   # Create the variable_mapping table. This is optional but highly recommended
#   # as it provides unambiguous definitions to variables and facilitates
#   # integration with other ecocomDP datasets.
#
#   variable_mapping <- ecocomDP::create_variable_mapping(
#     observation = observation,
#     observation_ancillary = observation_ancillary,
#     location_ancillary = location_ancillary,
#     taxon_ancillary = taxon_ancillary)
#
#   i <- variable_mapping$variable_name == 'abundance'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/individualCount'
#   variable_mapping$mapped_label[i] <- 'individualCount'
#
#   i <- variable_mapping$variable_name == 'treatment'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00000506'
#   variable_mapping$mapped_label[i] <- 'Manipulative experiment'
#
#   i <- variable_mapping$variable_name == 'trap.type'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00001591'
#   variable_mapping$mapped_label[i] <- 'type of trap'
#
#   i <- variable_mapping$variable_name == 'hl'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#   variable_mapping$mapped_label[i] <- 'measurementType'
#
#   i <- variable_mapping$variable_name == 'rel'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#   variable_mapping$mapped_label[i] <- 'measurementType'
#
#   i <- variable_mapping$variable_name == 'rll'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/measurementType'
#   variable_mapping$mapped_label[i] <- 'measurementType'
#
#   i <- variable_mapping$variable_name == 'colony.size'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00000311'
#   variable_mapping$mapped_label[i] <- 'Population'
#
#   i <- variable_mapping$variable_name == 'feeding.preference'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#   variable_mapping$mapped_label[i] <- 'behavior'
#
#   i <- variable_mapping$variable_name == 'primary.habitat'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#   variable_mapping$mapped_label[i] <- 'type of habitat'
#
#   i <- variable_mapping$variable_name == 'secondary.habitat'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#   variable_mapping$mapped_label[i] <- 'type of habitat'
#
#   i <- variable_mapping$variable_name == 'seed.disperser'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#   variable_mapping$mapped_label[i] <- 'behavior'
#
#   i <- variable_mapping$variable_name == 'slavemaker.sp'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#   variable_mapping$mapped_label[i] <- 'behavior'
#
#   i <- variable_mapping$variable_name == 'behavior'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://rs.tdwg.org/dwc/terms/behavior'
#   variable_mapping$mapped_label[i] <- 'behavior'
#
#   i <- variable_mapping$variable_name == 'biogeographic.affinity'
#   variable_mapping$mapped_system[i] <- 'The Ecosystem Ontology'
#   variable_mapping$mapped_id[i] <- 'http://purl.dataone.org/odo/ECSO_00002736'
#   variable_mapping$mapped_label[i] <- 'type of habitat'
#
#   i <- variable_mapping$variable_name == 'source'
#   variable_mapping$mapped_system[i] <- 'Darwin Core'
#   variable_mapping$mapped_id[i] <- 'http://purl.org/dc/terms/references'
#   variable_mapping$mapped_label[i] <- 'references'
#
#   # Write tables to file
#
#   ecocomDP::write_tables(
#     path = path,
#     observation = observation,
#     location = location,
#     taxon = taxon,
#     dataset_summary = dataset_summary,
#     observation_ancillary = observation_ancillary,
#     location_ancillary = location_ancillary,
#     taxon_ancillary = taxon_ancillary,
#     variable_mapping = variable_mapping)
#
#   # Validate tables -----------------------------------------------------------
#
#   # Validation checks ensure the derived set of tables complies with the
#   # ecocomDP model. Any issues reported at this point should be addressed in
#   # the lines of code above, the tables rewritten, and another round of
#   # validation run to be certain the fix worked.
#
#   issues <- ecocomDP::validate_data(path = path)
#
#   # Create metadata -----------------------------------------------------------
#
#   # Before publishing the derived ecocomDP dataset, we need to describe it. The
#   # create_eml() function does this all for us. It knows the structure of the
#   # ecocomDP model and applies standardized table descriptions and mixes in
#   # important elements of the source dataset metadata for purposes of
#   # communication and provenance tracking.
#
#   # Convert "dataset level keywords" listed in the source to "dataset level
#   # annotations" in the derived. The predicate "is about" is used, which
#   # results in an annotation that reads "This dataset is about 'species
#   # abundance'", "This dataset is about an ecological 'Community'", etc. All
#   # source datasets involving a human induced manipulative experiment, not a
#   # natural disturbance/experiment, should include the "Manipulative
#   # experiment" annotation below to enable searching on this term.
#
#   dataset_annotations <- c(
#     `species abundance` =
#       "http://purl.dataone.org/odo/ECSO_00001688",
#     Community =
#       "http://purl.dataone.org/odo/ECSO_00000310",
#     `Manipulative experiment` =
#       "http://purl.dataone.org/odo/ECSO_00000506",
#     `level of ecological disturbance` =
#       "http://purl.dataone.org/odo/ECSO_00002588",
#     `type of ecological disturbance` =
#       "http://purl.dataone.org/odo/ECSO_00002589")
#
#   # Add contact information for the author of this script and dataset
#
#   additional_contact <- data.frame(
#     givenName = 'Colin',
#     surName = 'Smith',
#     organizationName = 'Environmental Data Initiative',
#     electronicMailAddress = 'ecocomdp@gmail.com',
#     stringsAsFactors = FALSE)
#
#   # Create EML metadata
#
#   eml <- ecocomDP::create_eml(
#     path = path,
#     source_id = source_id,
#     derived_id = derived_id,
#     is_about = dataset_annotations,
#     script = "create_ecocomDP.R",
#     script_description =
#       "A function for converting knb-lter-hfr.118 to ecocomDP",
#     contact = additional_contact,
#     user_id = 'ecocomdp',
#     user_domain = 'EDI',
#     basis_of_record = "HumanObservation")
#
# }

## ----eval=FALSE---------------------------------------------------------------
#
# # Create directory for tables and metadata
#
# mypath <- paste0(tempdir(), "/edi_193")
# dir.create(mypath)
#
# # Create ecocomDP dataset "edi.193.5" from source dataset "knb-lter-hfr.118.33"
#
# create_ecocomDP(
#   path = mypath,
#   source_id = "knb-lter-hfr.118.33",
#   derived_id = "edi.193.5")
# #> Retrieving EML for data package knb-lter-hfr.118.33
# #> [0%] Downloaded 0 bytes...
# #> [0%] Downloaded 0 bytes...
# #>
# #> Searching ITIS for "Aphaenogaster picea"
# #> Searching ITIS for "Camponotus novaeboracensis"
# #> Searching ITIS for "Aphaenogaster fulva"
# #> Searching ITIS for "Temnothorax longispinosus"
# #> Searching ITIS for "Stenemma impar"
# #> Searching ITIS for "Stenemma diecki"
# #> Searching ITIS for "Camponotus pennsylvanicus"
# #> Searching ITIS for "Lasius americanus"
# #> Searching ITIS for "Myrmica punctiventris"
# #> Searching ITIS for "Lasius nearcticus"
# #> Searching ITIS for "Formica subaenescens"
# #> Searching ITIS for "Lasius umbratus"
# #> Searching ITIS for "Formica subsericea"
# #> Searching ITIS for "Formica aserva"
# #> Searching ITIS for "Formica neogagates"
# #> Searching ITIS for "Camponotus nearcticus"
# #> Searching ITIS for "Ponera pennsylvanica"
# #> Searching ITIS for "Stenamma brevicorne"
# #> Searching ITIS for "Lasius claviger"
# #> Searching ITIS for "Stenamma impar"
# #> Searching ITIS for "Temnothorax lognispinosus"
# #> Searching ITIS for "Stenamma diecki"
# #> Searching ITIS for "Camponotus herculeanus"
# #> Searching ITIS for "Lasius speculiventris"
# #> Searching ITIS for "Stenamma schmitti"
# #> Searching ITIS for "Lasius neoniger"
# #> Searching ITIS for "Camponotus pennsylvanica"
# #> Searching ITIS for "Tapinoma sessile"
# #> Searching ITIS for "Myrmica AF-smi"
# #> Searching ITIS for "Formica neorufibarbis"
# #> Searching ITIS for "Myrmica incompleta"
# #> Searching ITIS for "Formica argentea"
# #> Searching ITIS for "Myrmica AF-scu"
# #> Searching ITIS for "Formica dolosa"
# #> Searching ITIS for "Formica subintegra"
# #> Searching ITIS for "Formica incerta"
# #> Searching ITIS for "Myrmica nearctica"
# #> Searching ITIS for "Formica pergandei"
# #> Searching ITIS for "Formica lasioides"
# #> Searching ITIS for "Myrmica pinetorum"
# #> Searching ITIS for "Leptothorax canadensis"
# #> Searching ITIS for "Myrmica detritinodis"
# #> Searching ITIS for "Myrmecina americana"
# #> Searching ITIS for "Crematogaster lineolata"
# #> Searching ITIS for "Lasius interjectus"
# #> Searching ITIS for "Camponotus chromaiodes"
# #> Searching ITIS for "Formica pallidefulva"
# #> Searching ITIS for "Temnothorax ambiguus"
# #> Searching ITIS for "Lasius subglaber"
# #> Searching ITIS for "Formica rubicunda"
# #> Searching ITIS for "Lasius brevicornis"
# #> Searching ITIS for "Lasius aphidicolus"
# #> Searching ITIS for "Formica integra"
# #>
# #> Writing tables to file:
# #>   observation
# #>   location
# #>   taxon
# #>   dataset_summary
# #>   observation_ancillary
# #>   location_ancillary
# #>   taxon_ancillary
# #>   variable_mapping
# #>
# #> Validating edi_193:
# #>   Required tables
# #>   Column names
# #>   Required columns
# #>   Column classes
# #>   Datetime formats
# #>   Primary keys
# #>   Composite keys
# #>   Referential integrity
# #>   Latitude and longitude format
# #>   Latitude and longitude range
# #>   Elevation
# #>   variable_mapping
# #>
# #> Creating EML for derived data package (edi.193.5)
# #> Reading EML of L0 data package knb-lter-hfr.118.33
# #> Creating EML of L1 data package edi.193.5
# #> Updating:
# #>
# #>
# #>
# #>
# #> <pubDate>
# #> <keywordSet>
# #> <contact>
# #> <methods>
# #> <dataTable>
# #> <otherEntity>
# #> <annotations>
# #> </eml>
# #> Writing EML
# #> Validating EML
# #> Validation passed :)
# #> Done.
#
# # The working directory contains a valid set of ecocomDP tables and metadata,
# # which is ready for upload to EDI (or any other EML based repository)
# dir(mypath)
# #>  [1] "create_ecocomDP.R"
# #>  [2] "dataset_summary.csv"
# #>  [3] "edi.193.5.xml"
# #>  [4] "location.csv"
# #>  [5] "location_ancillary.csv"
# #>  [6] "observation.csv"
# #>  [7] "observation_ancillary.csv"
# #>  [8] "taxon.csv"
# #>  [9] "taxon_ancillary.csv"
# #> [10] "variable_mapping.csv"
#