Update catalogue, pad underlength station codes, update readme, impro…

…ve tests, remove dependency-heavy station_explorer()
ConorIA · Apr 11, 2018 · c3f618d · c3f618d
1 parent 8f8b7e1
commit c3f618d
Show file tree

Hide file tree

Showing 22 changed files with 347 additions and 386 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,10 +2,9 @@
 ^\.Rproj\.user$
 ^\.travis\.yml$
 ^\.gitlab-ci\.yml$
-appveyor.yml
-.gitattributes
-.gitignore
-README.md
-README.Rmd
-LICENSE.md
-install_senamhiR.R
+^appveyor\.yml$
+^\.gitattributes$
+^\.gitignore$
+^README\..*$
+^LICENSE\.md$
+^install_senamhiR\.R$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,26 +1,24 @@
 Package: senamhiR
 Type: Package
 Title: A Collection of Functions to Obtain Peruvian Climate Data
-Version: 0.4.2
+Version: 0.5.0
 Date: 2017-10-08
 Authors@R: c(person(given = c("Conor", "I."), family = "Anderson", 
-           role = c("aut","cre"), email = "conor.anderson@mail.utoronto.ca"),
+           role = c("aut","cre"), email = "conor.anderson@utoronto.ca"),
            person(given = c("William", "A."), family = "Gough", role = "ths",
            email = "[email protected]"))
-Maintainer: Conor I. Anderson <conor@conr.ca>
+Maintainer: Conor I. Anderson <conor.anderson@utoronto.ca>
 Description: A collection of functions to obtain archived Peruvian climatological or
     hydrological data from the Peruvian National Meteorology and Hydrology Service.
 Depends:
     R (>= 3.1.0)
 Imports:
     DBI,
-    DT,
     dplyr,
     geosphere,
     leaflet,
     magrittr,
     RMySQL,
-    shiny,
     tibble,
     utils,
     zoo

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,16 +5,12 @@ export(map_stations)
 export(qc)
 export(quick_audit)
 export(senamhiR)
-export(station_explorer)
 export(station_search)
 importFrom(DBI,dbConnect)
 importFrom(DBI,dbDisconnect)
 importFrom(DBI,dbGetQuery)
 importFrom(DBI,dbListTables)
 importFrom(DBI,dbReadTable)
-importFrom(DT,dataTableOutput)
-importFrom(DT,datatable)
-importFrom(DT,renderDataTable)
 importFrom(RMySQL,MySQL)
 importFrom(dplyr,filter)
 importFrom(dplyr,select)
@@ -25,16 +21,6 @@ importFrom(leaflet,awesomeIcons)
 importFrom(leaflet,leaflet)
 importFrom(leaflet,setView)
 importFrom(magrittr,"%>%")
-importFrom(shiny,br)
-importFrom(shiny,column)
-importFrom(shiny,em)
-importFrom(shiny,fluidPage)
-importFrom(shiny,fluidRow)
-importFrom(shiny,runApp)
-importFrom(shiny,selectInput)
-importFrom(shiny,shinyApp)
-importFrom(shiny,shinyUI)
-importFrom(shiny,titlePanel)
 importFrom(stats,sd)
 importFrom(tibble,add_column)
 importFrom(tibble,as_tibble)

diff --git a/R/download_data_sql.R b/R/download_data_sql.R
@@ -20,14 +20,24 @@
 
 download_data_sql <- function(station, year) {
 
+  if (nchar(station) < 6) {
+    station <- suppressWarnings(try(sprintf("%06d", as.numeric(station)), silent = TRUE))
+    if (inherits(station, "try-error") | !station %in% catalogue$StationID) {
+      stop("Station ID appears invalid.")
+    }
+  }
+
   station_data <- catalogue[catalogue$StationID == station, ]
   type = station_data$Type
   config = station_data$Configuration
 
   conn <- dbConnect(MySQL(), user = "anonymous", host = "pcd.conr.ca", dbname = "pcd")
 
   sql_table <- paste0("ID_", station)
-  if (sum(dbListTables(conn) %in% sql_table) != 1) stop("There was an error getting that table.")
+  if (sum(dbListTables(conn) %in% sql_table) != 1) {
+    dbDisconnect(conn)
+    stop("There was an error getting that table.")
+  }
 
   if (missing(year)) {
     dat <- as_tibble(dbReadTable(conn, sql_table, row.names = NULL))

diff --git a/R/map_stations.R b/R/map_stations.R
@@ -25,6 +25,16 @@ map_stations <- function(station, zoom) {
   if (inherits(station, "data.frame")) {
     station <- station$StationID
   }
+
+  if (any(nchar(station) < 6)) {
+    station[nchar(station) < 6] <- suppressWarnings(
+      try(sprintf("%06d", as.numeric(station[nchar(station) < 6])),
+          silent = TRUE))
+  }
+
+  if (inherits(station, "try-error") || !station %in% catalogue$StationID) {
+    stop("One or more requested stations invalid.")
+  }
 
   poi <- NULL
 

diff --git a/R/qc.R b/R/qc.R
@@ -3,9 +3,9 @@
 ##' @description A helper function to perform minimal quality control on the data. 
 ##' For now, this script only performs action on the three main temperature variables.
 ##' 
-##' @param dat an R object of type data.frame passed form the export_data script
+##' @param dat a \code{tbl_df} generated by the \code{senamhiR} package
 ##'
-##' @return an R object of type data.frame.
+##' @return a \code{tbl_df}
 ##' 
 ##' @importFrom dplyr select filter
 ##' @importFrom tibble add_column

diff --git a/R/quick_audit.R b/R/quick_audit.R
@@ -27,7 +27,7 @@ quick_audit <- function(station, variables, by = "year", report = "pct", reverse
     dat <- station
   } else {
     if (inherits(station, "character")) {
-      dat <- read_data(station)
+      dat <- download_data_sql(station)
     } else {
       stop("I can't figure out what data you've given me.")
     }

diff --git a/R/senamhiR.R b/R/senamhiR.R
@@ -21,7 +21,13 @@ senamhiR <- function(station, year) {
     station <- trimws(unlist(strsplit(station, split = ",")))
   }
 
-  if (!station %in% catalogue$StationID) {
+  if (any(nchar(station) < 6)) {
+    station[nchar(station) < 6] <- suppressWarnings(
+      try(sprintf("%06d", as.numeric(station[nchar(station) < 6])),
+          silent = TRUE))
+  }
+
+  if (inherits(station, "try-error") || !station %in% catalogue$StationID) {
     stop("One or more requested stations invalid.")
   }
 

diff --git a/R/station_explorer.R b/R/station_explorer.R
diff --git a/R/station_search.R b/R/station_search.R
@@ -39,6 +39,13 @@
 station_search <- function(name = NULL, ignore.case = TRUE, glob = FALSE, region = NULL, 
   baseline = NULL, config = NULL, target = NULL, dist = 0:100, sort = TRUE, ...) {
 
+  if (!is.null(target) && length(target) == 1L && nchar(target) < 6) {
+    target <- suppressWarnings(try(sprintf("%06d", as.numeric(target)), silent = TRUE))
+    if (inherits(target, "try-error") || !target %in% catalogue$StationID) {
+      stop("Target station appears invalid.")
+    }
+  }
+
   # If `name` is not NULL, filter by name
   if (!is.null(name)) {
     if (glob) name <- glob2rx(name)

diff --git a/R/zzz.R b/R/zzz.R
@@ -1,3 +1,3 @@
 .onAttach <- function(libname = find.package("senamhiR"), pkgname = "senamhiR") {
   packageStartupMessage("The information accessed by this package was compiled and maintained by Peru's National Meteorology and Hydrology Service (Senamhi). The use of this data is of your sole responsibility.")
-}
+}
diff --git a/README.Rmd b/README.Rmd
@@ -8,11 +8,12 @@ output:
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 library(senamhiR)
+library(dplyr)
 ```
 
 [![build status](https://gitlab.com/ConorIA/senamhiR/badges/master/build.svg)](https://gitlab.com/ConorIA/senamhiR/commits/master) [![Build status](https://ci.appveyor.com/api/projects/status/60kbu1b7wkf7akqn?svg=true)](https://ci.appveyor.com/project/ConorIA/senamhir-bxb45) [![codecov](https://codecov.io/gl/ConorIA/senamhiR/branch/master/graph/badge.svg)](https://codecov.io/gl/ConorIA/senamhiR)
 
-The package provides an automated solution for the acquisition of archived Peruvian climate and hydrology data directly within R. The data was compiled from the Senamhi website, and contains all of the data that was available as of March 2017. This data was originally converted from HTML, and is stored in a MySQL database in tibble format.
+The package provides an automated solution for the acquisition of archived Peruvian climate and hydrology data directly within R. The data was compiled from the Senamhi website, and contains all of the data that was available as of April 10, 2018. This data was originally converted from HTML, and is stored in a MySQL database in tibble format.
 
 It is important to note that the info on the Senamhi website has not undergone quality control; however, this package includes a helper function to perform the most common quality control operations for the temperature variables. More functions will be added in the future.
 
@@ -77,23 +78,15 @@ If I wanted to download data for Requena (station no. 000280) from 1981 to 2010,
 ```{r}
 requ <- senamhiR("000280", 1981:2010)
 ```
-_Note: Since the StationID numbers contain leading zeros, they must be entered as a character (in quotation marks)._
+_Note: StationID numbers contain leading zeros, so any station ID shorter than six characters is left-padded with zeros, e.g. 280 becomes 000280._
 
 ```{r}
 requ
 ```
 
 Make sure to use the assignment operator (`<-`) to save the data into an R object, otherwise the data will just print out to the console, and won't get saved anywhere in the memory. 
 
-## Additional functions
-
-`senamhiR` includes some additional functions to help visualize stations more easily. 
-
-### `station_explorer()`
-
-Often, irrespective of the number of filters one uses, it is simply easier to just mouse through a table and find the data that one needs. To make this "mousing" just a little easier, I have included a Shiny data table to help with navigating the list of stations. Call the table up by running `station_explorer()` with no arguments. 
-
-This table is also fully compatible with the advanced search function. To use a filtered list of stations with the Shiny table, just pass a search result as an argument to the function. This result can be a call to `station_search()`, or an object containing a saved search result.
+## For easier station visualization
 
 ### `map_stations()`
 
@@ -113,7 +106,7 @@ There are two functions included to perform some basic quality control.
 The `quick_audit()` function will return a tibble listing the percentage or number of missing values for a station. For instance, the following command will return the percentage of missing values in our 30-year Requena dataset:
 
 ```{r}
-quick_audit(requ, c("Tmean", "Tmax", "Tmin"))
+quick_audit(requ, c("Tmax", "Tmin"))
 ```
 
 Use `report = "n"` to show the _number_ of missing values. Use `by = "month"` to show missing data by month instead of year. For instance, the number of days for which Mean Temperature was missing at Tocache in 1980:
@@ -128,8 +121,10 @@ quick_audit(toca, "Tmean", by = "month", report = "n")
 There is an incomplete and experimental function to perform automated quality control on climate data acquired through this package. For instance: 
 
 ```{r}
-toca <- senamhiR("000463", year = 1980)
-quick_audit(toca, "Tmean", by = "month", report = "n")
+requ_dirty <- senamhiR("000280") #1960 to 2018
+requ_qc <- qc(requ_dirty)
+requ_qc %>% filter(Observations != "") %>% select(Fecha, `Tmax (C)`, `Tmin (C)`, `Tmean (C)`, Observations)
+
 ```
 
 For now, the data has been tested for decimal place-errors with the following logic: 
@@ -142,28 +137,28 @@ If the number appears to have missed a decimal place (e.g. 324 -> 32.4; 251 -> 2
 
 If the number seems to be the result of some other typographical error (e.g. 221.2), we discard the data point. 
 
-##### Case 2: _T~max~_ < _T~min~_
+##### Case 2: _T_<sub>max</sub> < _T_<sub>min</sub>
 
-In case 2, we perform the same tests for both _T~max~_ and _T~min~_. If the number is within 1.5 standard deviations of all values 30 days before and after the day in question, we leave the number alone. (Note: this is often the case for _T~min~_ but seldom the case for _T~max~_). If the number does not fall within 1.5 standard deviations, we perform an additional level of testing to check if the number is the result of a premature decimal point (e.g. 3.4 -> 34.0; 3 -> 30.0). In this case, we try to multiply the number by 10. If this new result is within 1.5 standard deviations of all values 30 days before and after the day in question, we keep the result, otherwise, we discard it.
+In case 2, we perform the same tests for both _T_<sub>max</sub> and _T_<sub>min</sub>. If the number is within 1.5 standard deviations of all values 30 days before and after the day in question, we leave the number alone. (Note: this is often the case for _T_<sub>min</sub> but seldom the case for _T_<sub>max</sub>). If the number does not fall within 1.5 standard deviations, we perform an additional level of testing to check if the number is the result of a premature decimal point (e.g. 3.4 -> 34.0; 3 -> 30.0). In this case, we try to multiply the number by 10. If this new result is within 1.5 standard deviations of all values 30 days before and after the day in question, we keep the result, otherwise, we discard it.
 
 _I have less confidence in this solution than I do for Case 1._
 
 #### Cases that are currently missed:
 
- - Cases where _T~min~_ is small because of a typo.
- - Cases where _T~max~_ is small because of a typo, but not smaller than _T~min~_.
+ - Cases where _T_<sub>min</sub> is small because of a typo.
+ - Cases where _T_<sub>max</sub> is small because of a typo, but not smaller than _T_<sub>min</sub>.
 
 #### Cases where this function is plain wrong: 
 
  - When there are a number of similar errors within the 60-day period, bad data is sometimes considered okay. This is especially apparent at, for instance, Station 47287402.
 
 #### Variables controlled for: 
 
- - _T~max~_
- - _T~min~_
- - _T~mean~_
+ - _T_<sub>max</sub>
+ - _T_<sub>min</sub>
+ - _T_<sub>mean</sub>
 
-__No other variables are currently tested; hydrological data is not tested. This data should not be considered "high quality", use of the data is your responsibility.__ Note that all values that are modified form their original values will be recorded in a new "Observations" column in the resultant tibble.
+__No other variables are currently tested; hydrological data is not tested. This data should not be considered "high quality", use of the data is your responsibility.__ Note that all values that are modified from their original values will be recorded in a new "Observations" column in the resultant tibble.
 
 ## Disclaimer
 
@@ -177,7 +172,9 @@ If that seems like a lot of work, just think about how much work it would have b
 
 ## Senamhi terms of use
 
-Senamhi's terms of use were originally posted [here](http://www.senamhi.gob.pe/?p=0613), but that link is currently redirecting to the Senamhi home page. However, the text of the terms was identical to the [terms](http://www.peruclima.pe/?p=condiciones) of Senamhi's PeruClima website  ([Google translation](https://translate.google.com/translate?hl=en&sl=es&tl=en&u=http%3A%2F%2Fwww.peruclima.pe%2F%3Fp%3Dcondiciones)). The terms allow for the free and public access to information on their website. Likewise, the data may be used in for-profit and non-profit applications. However, Senamhi stipulates that any use of the data must be accompanied by a disclaimer that Senamhi is the proprietor of the information. The following text is recommended (official text in Spanish):
+Senamhi's terms of use are _technically_ supposed to be [here](http://senamhi.gob.pe/?p=terminos_condiciones), but that link is currently redirecting to the Senamhi home page. I authored this package when the terms were still available online. At the time of development, the terms allowed for the free and public access to information on their website, in both for-profit and non-profit applications. However, Senamhi stipulated that any use of the data must be accompanied by a disclaimer that Senamhi is the proprietor of the information. The following text was recommended (official text in Spanish):
 
 - **Official Spanish:** _Información recopilada y trabajada por el Servicio Nacional de Meteorología e Hidrología del Perú. El uso que se le da a esta información es de mi (nuestra) entera responsabilidad._
 - **English translation:** This information was compiled and maintained by Peru's National Meteorology and Hydrology Service (Senamhi). The use of this data is of my (our) sole responsibility.
+
+A message similar to the English message above is printed to the R console whenever the package is loaded.