Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding new occ_search terms #730

Merged
merged 41 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
55c3bec
occ_search search term sex
May 7, 2024
c9a6e50
occ_search search term dwcaExtension
May 7, 2024
efe8dba
occ_search search term gbifId
May 7, 2024
d0ae97e
occ_search search term gbifRegion
May 7, 2024
ded438f
occ_search search term projectId
May 7, 2024
b4e4e63
occ_search search term programme
May 7, 2024
846a1e8
occ_search search term preparations
May 7, 2024
62d3103
occ_search search term datasetId
May 7, 2024
a1fd1c5
occ_search search term datasetName
May 7, 2024
5f2f158
occ_search search term publishedByGbifRegion
May 7, 2024
f57657f
occ_search search term island
May 7, 2024
0569e7f
occ_search search term islandGroup
May 7, 2024
84c57c1
occ_search search term recordedByID
May 7, 2024
7e631a1
occ_search search term taxonId
May 7, 2024
6ba8a26
occ_search search term taxonConceptId
May 7, 2024
178b2e0
occ_search search term taxonConceptId
May 7, 2024
0f08fe8
occ_search search term acceptedTaxonKey
May 7, 2024
f72952b
occ_search search term collectionKey
May 7, 2024
ce76e73
occ_search search term institutionKey
May 7, 2024
ea50cea
occ_search search term otherCatalogNumbers
May 7, 2024
d089ff1
occ_search search term georeferencedBy
May 7, 2024
66db636
occ_search search term installationKey
May 7, 2024
2c31fa4
occ_search search term hostingOrganizationKey
May 7, 2024
61d16b9
occ_search search term crawlId
May 7, 2024
8124a4a
occ_search search term modified
May 7, 2024
23c6b92
occ_search search term higherGeography
May 7, 2024
f27a322
occ_search search term parentEventId
May 7, 2024
34ccb6e
occ_search search term samplingProtocol
May 7, 2024
7a8a284
occ_search search term sampleSizeUnit
May 7, 2024
f8b1126
occ_search search term pathway
May 7, 2024
99bbbe9
occ_search search term gadmLevel2Gid
May 8, 2024
3ce7f56
occ_search search term fossil terms like bed
May 8, 2024
f26b5e7
occ_search search term associatedSequences
May 8, 2024
b968e1f
occ_search search term isSequenced
May 8, 2024
e746d8d
occ_search search term endDayOfYear
May 8, 2024
51bbca3
occ_search search term geoDistance
May 8, 2024
4a55147
updating occ_count
May 8, 2024
087945a
adding tests for geoDistance
May 8, 2024
b855070
adding tests for geoDistance
May 8, 2024
1a0afda
updating occ_data docs
May 8, 2024
3ce8549
updating occ_search docs
May 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions R/occ_count.r
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,61 @@ occ_count <- function(...,occurrenceStatus="PRESENT", curlopts = list()) {
# Search terms forwarded by name: every value is read from the element of
# `args` that has exactly the same name as the argument it is passed to.
lifeStage = args$lifeStage,
isInCluster = args$isInCluster,
distanceFromCentroidInMeters = args$distanceFromCentroidInMeters,
geoDistance = args$geoDistance,
sex = args$sex,
dwcaExtension = args$dwcaExtension,
gbifId = args$gbifId,
gbifRegion = args$gbifRegion,
projectId = args$projectId,
programme = args$programme,
preparations = args$preparations,
datasetId = args$datasetId,
datasetName = args$datasetName,
publishedByGbifRegion = args$publishedByGbifRegion,
island = args$island,
islandGroup = args$islandGroup,
taxonId = args$taxonId,
taxonConceptId = args$taxonConceptId,
taxonomicStatus = args$taxonomicStatus,
acceptedTaxonKey = args$acceptedTaxonKey,
collectionKey = args$collectionKey,
institutionKey = args$institutionKey,
otherCatalogNumbers = args$otherCatalogNumbers,
georeferencedBy = args$georeferencedBy,
installationKey = args$installationKey,
hostingOrganizationKey = args$hostingOrganizationKey,
crawlId = args$crawlId,
modified = args$modified,
higherGeography = args$higherGeography,
fieldNumber = args$fieldNumber,
parentEventId = args$parentEventId,
samplingProtocol = args$samplingProtocol,
sampleSizeUnit = args$sampleSizeUnit,
pathway = args$pathway,
gadmLevel0Gid = args$gadmLevel0Gid,
gadmLevel1Gid = args$gadmLevel1Gid,
gadmLevel2Gid = args$gadmLevel2Gid,
gadmLevel3Gid = args$gadmLevel3Gid,
earliestEonOrLowestEonothem = args$earliestEonOrLowestEonothem,
latestEonOrHighestEonothem = args$latestEonOrHighestEonothem,
earliestEraOrLowestErathem = args$earliestEraOrLowestErathem,
latestEraOrHighestErathem = args$latestEraOrHighestErathem,
earliestPeriodOrLowestSystem = args$earliestPeriodOrLowestSystem,
latestPeriodOrHighestSystem = args$latestPeriodOrHighestSystem,
earliestEpochOrLowestSeries = args$earliestEpochOrLowestSeries,
latestEpochOrHighestSeries = args$latestEpochOrHighestSeries,
earliestAgeOrLowestStage = args$earliestAgeOrLowestStage,
latestAgeOrHighestStage = args$latestAgeOrHighestStage,
lowestBiostratigraphicZone = args$lowestBiostratigraphicZone,
highestBiostratigraphicZone = args$highestBiostratigraphicZone,
group = args$group,
formation = args$formation,
member = args$member,
bed = args$bed,
associatedSequences = args$associatedSequences,
isSequenced = args$isSequenced,
startDayOfYear = args$startDayOfYear,
endDayOfYear = args$endDayOfYear,
limit=0,
start=0,
fields = 'all',
Expand All @@ -224,10 +279,23 @@ occ_count <- function(...,occurrenceStatus="PRESENT", curlopts = list()) {
facetLimit = args$facetLimit)

if("facet" %in% arg_names) {
not_facet_arg <- c("skip_validate","...","curlopts","facetMultiselect",
"facetMincount", "facet","return","fields","start",
"limit","verbatimTaxonId","geometry", "geom_big",
"geom_size","geom_n","search")
not_facet_arg <- c("skip_validate",
"...",
"curlopts",
"facetMultiselect",
"facetMincount",
"facet",
"return",
"fields",
"start",
"limit",
"verbatimTaxonId",
"geometry",
"geom_big",
"geom_size",
"geom_n",
"search",
"geoDistance")
acc_facet_arg <- formal_args[!formal_args %in% not_facet_arg]
if(!args$facet %in% acc_facet_arg) stop("Bad facet arg.")
count <- stats::setNames(res$facet[[1]],c(args$facet,"count"))
Expand Down
265 changes: 231 additions & 34 deletions R/occ_data.R
Original file line number Diff line number Diff line change
@@ -1,45 +1,242 @@
#' Search for GBIF occurrences - simplified for speed
#' Legacy alternative to occ_search
#'
#' @export
#' @template occsearch
#' @template oslimstart
#' @template occ
#' @template occ_data_egs
#' @seealso [downloads()], [occ_search()]
#' @section occ_data vs. occ_search:
#' This does nearly the same thing as [occ_search()], but
#' is simplified for speed, and is for the most common use case where
#' user just wants occurrence data, and not other information like taxon
#' hierarchies and media (e.g., images). A lot of time in [occ_search()]
#' is used parsing data to be more useable downstream. We do less of that
#' in this function.
#' @param taxonKey (numeric) A taxon key from the GBIF backbone. All included
#' and synonym taxa are included in the search, so a search for aves with
#' taxonKey=212 will match all birds, no matter which species. You can pass
#' many keys to \code{occ_search(taxonKey=c(1,212))}.
#' @param scientificName A scientific name from the GBIF backbone. All included
#' and synonym taxa are included in the search.
#' @param country (character) The 2-letter country code (ISO-3166-1)
#' in which the occurrence was recorded. \code{enumeration_country()}.
#' @param datasetKey (character) The occurrence dataset uuid key. That can be
#' found in the dataset page url. For example,
#' "7e380070-f762-11e1-a439-00145eb45e9a" is the key for
#' [Natural History Museum (London) Collection Specimens](https://www.gbif.org/dataset/7e380070-f762-11e1-a439-00145eb45e9a).
#' @param eventDate (character) Occurrence date in ISO 8601 format: yyyy,
#' yyyy-MM, yyyy-MM-dd, or MM-dd. Supports range queries, 'smaller,larger'
#' ('1990,1991', whereas '1991,1990' wouldn't work).
#' @param catalogNumber (character) An identifier of any form assigned by the
#' source within a physical collection or digital dataset for the record which
#' may not be unique, but should be fairly unique in combination with the
#' institution and collection code.
#' @param recordedBy (character) The person who recorded the occurrence.
#' @param recordedByID (character) Identifier (e.g. ORCID) for the person who
#' recorded the occurrence
#' @param identifiedByID (character) Identifier (e.g. ORCID) for the person who
#' provided the taxonomic identification of the occurrence.
#' @param collectionCode (character) An identifier of any form assigned by the
#' source to identify the physical collection or digital dataset uniquely within
#' the context of an institution.
#' @param institutionCode An identifier of any form assigned by the source to
#' identify the institution the record belongs to.
#' @param basisOfRecord (character) The specific nature of the data record. See
#' [here](https://gbif.github.io/parsers/apidocs/org/gbif/api/vocabulary/BasisOfRecord.html).
#'
#' \itemize{
#' \item "FOSSIL_SPECIMEN"
#' \item "HUMAN_OBSERVATION"
#' \item "MATERIAL_CITATION"
#' \item "MATERIAL_SAMPLE"
#' \item "LIVING_SPECIMEN"
#' \item "MACHINE_OBSERVATION"
#' \item "OBSERVATION"
#' \item "PRESERVED_SPECIMEN"
#' \item "OCCURRENCE"
#' }
#' @param year The 4 digit year. A year of 98 will be interpreted as AD 98.
#' Supports range queries, 'smaller,larger' (e.g., '1990,1991', whereas
#' '1991,1990' wouldn't work).
#' @param month The month of the year, starting with 1 for January. Supports
#' range queries, 'smaller,larger' (e.g., '1,2', whereas '2,1' wouldn't work).
#' @param search (character) Query terms. The value for this parameter can be a
#' simple word or a phrase. For example, [search="puma"](https://www.gbif.org/occurrence/search?q=puma)
#' @param decimalLatitude Latitude in decimals between -90 and 90 based on
#' WGS84. Supports range queries, 'smaller,larger' (e.g., '25,30', whereas
#' '30,25' wouldn't work).
#' @param decimalLongitude Longitude in decimals between -180 and 180 based on
#' WGS84. Supports range queries (e.g., '-0.4,-0.2', whereas '-0.2,-0.4'
#' wouldn't work).
#' @param publishingCountry The 2-letter country code (as per ISO-3166-1) of
#' the country of the organization that published the occurrence. See
#' \code{enumeration_country()}.
#' @param elevation Elevation in meters above sea level. Supports range
#' queries, 'smaller,larger' (e.g., '5,30', whereas '30,5' wouldn't work).
#' @param depth Depth in meters relative to elevation. For example 10 meters
#' below a lake surface with given elevation. Supports range queries,
#' 'smaller,larger' (e.g., '5,30', whereas '30,5' wouldn't work).
#' @param geometry (character) Searches for occurrences inside a polygon in
#' Well Known Text (WKT) format. A WKT shape written as either
#'
#' \itemize{
#' \item "POINT"
#' \item "LINESTRING"
#' \item "LINEARRING"
#' \item "POLYGON"
#' \item "MULTIPOLYGON"
#' }
#'
#' For example, "POLYGON((37.08 46.86,38.06 46.86,38.06 47.28,37.08 47.28,
#' 37.08 46.86))". See also the section **WKT** below.
#' @param geom_big (character) One of "bbox" or "asis" (default).
#' @param geom_size (integer) An integer indicating size of the cell. Default:
#' 40.
#' @param geom_n (integer) An integer indicating number of cells in each
#' dimension. Default: 10.
#' @param hasGeospatialIssue (logical) Includes/excludes occurrence records
#' which contain spatial issues (as determined in our record interpretation),
#' i.e. \code{hasGeospatialIssue=TRUE} returns only those records with spatial
#' issues while \code{hasGeospatialIssue=FALSE} includes only records without
#' spatial issues. The absence of this parameter returns any record with or
#' without spatial issues.
#' @param issue (character) One or more of many possible issues with each
#' occurrence record. Issues passed to this parameter filter results by
#' the issue. One of many [options](https://gbif.github.io/gbif-api/apidocs/org/gbif/api/vocabulary/OccurrenceIssue.html).
#' See [here](https://data-blog.gbif.org/post/issues-and-flags/) for definitions.
#' @param hasCoordinate (logical) Return only occurrence records with lat/long
#' data (\code{TRUE}) or all records (\code{FALSE}, default).
#' @param typeStatus Type status of the specimen. One of many
#' [options](https://www.gbif.org/occurrence/search?type_status=PARATYPE).
#' @param recordNumber Number recorded by collector of the data, different from
#' GBIF record number.
#' @param lastInterpreted Date the record was last modified in GBIF, in ISO
#' 8601 format: yyyy, yyyy-MM, yyyy-MM-dd, or MM-dd. Supports range queries,
#' 'smaller,larger' (e.g., '1990,1991', whereas '1991,1990' wouldn't work).
#' @param continent The source supplied continent.
#'
#' \itemize{
#' \item "africa"
#' \item "antarctica"
#' \item "asia"
#' \item "europe"
#' \item "north_america"
#' \item "oceania"
#' \item "south_america"
#' }
#'
#' Continent is not inferred but only populated if provided by the
#' dataset publisher. Applying this filter may exclude many relevant records.
#' @param mediaType (character) Media type of "MovingImage", "Sound", or
#' "StillImage".
#' @param repatriated (character) Searches for records whose publishing country
#' is different to the country where the record was recorded in.
#' @param kingdomKey (numeric) Kingdom classification key.
#' @param phylumKey (numeric) Phylum classification key.
#' @param classKey (numeric) Class classification key.
#' @param orderKey (numeric) Order classification key.
#' @param familyKey (numeric) Family classification key.
#' @param genusKey (numeric) Genus classification key.
#' @param speciesKey (numeric) Species classification key.
#' @param subgenusKey (numeric) Subgenus classification key.
#' @param establishmentMeans (character) provides information about whether an
#' organism or organisms have been introduced to a given place and time through
#' the direct or indirect activity of modern humans.
#'
#' There are a number of data fields GBIF returns that we drop to speed up
#' processing time within R. These fields take extra time to process
#' because they are deeply nested and so take extra time to check if
#' they are empty or not, and if not, figure out how to parse them
#' into a data.frame. The fields are:
#' \itemize{
#' \item "Introduced"
#' \item "Native"
#' \item "NativeReintroduced"
#' \item "Vagrant"
#' \item "Uncertain"
#' \item "IntroducedAssistedColonisation"
#' }
#'
#' @param degreeOfEstablishment (character) Provides information about degree to
#' which an Organism survives, reproduces, and expands its range at the given
#' place and time. One of many [options](https://www.gbif.org/occurrence/search?advanced=1&degree_of_establishment=Managed).
#' @param protocol (character) Protocol or mechanism used to provide the
#' occurrence record. One of many [options](https://www.gbif.org/occurrence/search?protocol=DWC_ARCHIVE&advanced=1).
#' @param license (character) The type license applied to the dataset or record.
#'
#' - `gadm`
#' - `media`
#' - `facts`
#' - `relations`
#' - `extensions`
#' - `identifiers`
#' - `recordedByIDs`
#' - `identifiedByIDs`
#' \itemize{
#' \item "CC0_1_0"
#' \item "CC_BY_4_0"
#' \item "CC_BY_NC_4_0"
#' }
#'
#' @param organismId (numeric) An identifier for the Organism instance (as
#' opposed to a particular digital record of the Organism). May be a globally
#' unique identifier or an identifier specific to the data set.
#' @param publishingOrg (character) The publishing organization key (a UUID).
#' @param stateProvince (character) The name of the next smaller administrative
#' region than country (state, province, canton, department, region, etc.) in
#' which the Location occurs.
#' @param waterBody (character) The name of the water body in which the
#' locations occur
#' @param locality (character) The specific description of the place.
#' @param occurrenceStatus (character) Default is "PRESENT". Specify whether
#' search should return "PRESENT" or "ABSENT" data.
#' @param gadmGid (character) The gadm id of the area occurrences are desired
#' from. https://gadm.org/.
#' @param coordinateUncertaintyInMeters A number or range between 0-1,000,000
#' which specifies the desired coordinate uncertainty. A
#' coordinateUncertaintyInMeters=1000 will be interpreted as all records with
#' exactly 1000m. Supports range queries, 'smaller,larger' (e.g.,
#' '1000,10000', whereas '10000,1000' wouldn't work).
#' @param verbatimScientificName (character) Scientific name as provided by the
#' source.
#' @param verbatimTaxonId (character) The taxon identifier provided to GBIF by
#' the data publisher.
#' @param eventId (character) identifier(s) for a sampling event.
#' @param identifiedBy (character) names of people, groups, or organizations
#' who assigned the Taxon to the subject.
#' @param networkKey (character) The occurrence network key (a uuid).
#' @param occurrenceId (character) occurrence id from source.
#' @param organismQuantity A number or range which
#' specifies the desired organism quantity. An organismQuantity=5
#' will be interpreted as all records with exactly 5. Supports range queries,
#' 'smaller,larger' (e.g., '5,20', whereas '20,5' wouldn't work).
#' @param organismQuantityType (character) The type of quantification system
#' used for the quantity of organisms. For example, "individuals" or "biomass".
#' @param relativeOrganismQuantity (numeric) The relative measurement of the
#' quantity of the organism (a number between 0-1). A
#' relativeOrganismQuantity=0.1 will be interpreted as all records with exactly
#' 0.1. Supports range queries, 'smaller,larger' (e.g., '0.1,0.5', whereas
#' '0.5,0.1' wouldn't work).
#' @param iucnRedListCategory (character) The IUCN threat status category.
#'
#' \itemize{
#' \item "NE" (Not Evaluated)
#' \item "DD" (Data Deficient)
#' \item "LC" (Least Concern)
#' \item "NT" (Near Threatened)
#' \item "VU" (Vulnerable)
#' \item "EN" (Endangered)
#' \item "CR" (Critically Endangered)
#' \item "EX" (Extinct)
#' \item "EW" (Extinct in the Wild)
#' }
#' @param lifeStage (character) the life stage of the occurrence. One of many
#' [options](https://www.gbif.org/occurrence/search?advanced=1&life_stage=Tadpole).
#' @param isInCluster (logical) identify potentially related records on GBIF.
#' @param distanceFromCentroidInMeters A number or range. A value of "2000,*"
#' means at least 2km from known centroids. A value of "0" would mean occurrences
#' exactly on known centroids. A value of "0,2000" would mean within 2km of
#' centroids. Max value is 5000.
#' @param skip_validate (logical) whether to skip wellknown::validate_wkt call
#' or not. passed down to check_wkt(). Default: TRUE
#' @param limit Number of records to return. Default: 500. Note that the per
#' request maximum is 300, but since we set it at 500 for the function, we
#' do two requests to get you the 500 records (if there are that many).
#' Note that there is a hard maximum of 100,000, which is calculated as the
#' \code{limit+start}, so \code{start=99000} and \code{limit=2000} won't work.
#' @param start Record number to start at. Use in combination with limit to
#' page through results. Note that we do the paging internally for you, but
#' you can manually set the \code{start} parameter
#' @param curlopts (list)
#'
#' @details
#' This function is a legacy alternative to `occ_search()`. It is not
#' recommended to use `occ_data()` as it is not as flexible as `occ_search()`.
#' New search terms will not be added to this function and it is only supported
#' for legacy reasons.
#'
#' To get these fields use [occ_search()] instead.
#' @note Maximum number of records you can get with this function is 100,000.
#' See https://www.gbif.org/developer/occurrence
#' @return An object of class `gbif_data`, which is a S3 class list, with
#' slots for metadata (`meta`) and the occurrence data itself
#' (`data`), and with attributes listing the user supplied arguments
#' and whether it was a "single" or "many" search; that is, if you supply
#' two values of the `datasetKey` parameter, two searches are done, and
#' it's a "many". `meta` is a list of length four with offset, limit,
#' endOfRecords and count fields. `data` is a tibble (aka data.frame)

#' @export
#'
occ_data <- function(taxonKey=NULL,
scientificName=NULL,
country=NULL,
Expand Down Expand Up @@ -90,9 +287,6 @@ occ_data <- function(taxonKey=NULL,
stateProvince = NULL,
waterBody = NULL,
locality = NULL,
limit=500,
start=0,
skip_validate = TRUE,
occurrenceStatus = 'PRESENT',
gadmGid = NULL,
coordinateUncertaintyInMeters = NULL,
Expand All @@ -109,6 +303,9 @@ occ_data <- function(taxonKey=NULL,
lifeStage = NULL,
isInCluster = NULL,
distanceFromCentroidInMeters = NULL,
skip_validate = TRUE,
limit=500,
start=0,
curlopts = list(http_version=2)) {

geometry <- geometry_handler(geometry, geom_big, geom_size, geom_n)
Expand Down
Loading
Loading