Skip to content

Commit

Permalink
Bug fixes and fixing some tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
ConorIA committed Sep 24, 2018
1 parent 98aad78 commit 159f57d
Show file tree
Hide file tree
Showing 10 changed files with 406 additions and 322 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: senamhiR
Type: Package
Title: A Collection of Functions to Obtain Peruvian Climate Data
Version: 0.6.4
Version: 0.6.5
Date: 2018-06-27
Authors@R: c(person(given = c("Conor", "I."), family = "Anderson",
role = c("aut","cre"), email = "[email protected]"),
Expand Down
7 changes: 6 additions & 1 deletion R/qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

qc <- function(dat) {

attrs_to_append <- append(attributes(dat)[4:length(attributes(dat))], list(`QC Date` = Sys.Date()))

if (inherits(dat, "character") & !inherits(dat, "data.frame")) {
if (length(dat) > 1L) {
stop("Sorry, for now this script can only process one station at a time.")
Expand Down Expand Up @@ -51,7 +53,7 @@ qc <- function(dat) {
}

if (length(minshifts) > 0) {
for (i in 1:length(maxshifts)) {
for (i in 1:length(minshifts)) {
bad_table <- select(dat, Fecha, var = `Tmin (C)`)
fixes <- .fix_bad_data(bad_table, minshifts[i], "Tmin", "dps")
dat$`Tmin (C)`[minshifts[i]] <- unlist(fixes[1])
Expand Down Expand Up @@ -86,5 +88,8 @@ qc <- function(dat) {
observations[is.na(observations)] <- ''
dat <- add_column(dat, Observations = observations)

attributes(dat) <- append(attributes(dat), attrs_to_append)
rownames(dat) <- NULL

dat
}
100 changes: 56 additions & 44 deletions R/quick_audit.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
##'
##' @param station character; a station id number to process or a \code{tbl_df} containing the data to process
##' @param variables numeric or character; by default, all variables will be included. Pass a numeric vector to specify columns to include, or pass a character vector to try to match column names
##' @param by character; whether values should be reported annually (\code{by = "year"}), or monthly (\code{by = "month"})
##' @param by character; whether values should be reported annually (\code{by = "year"}), monthly (\code{by = "month"}, or as an overall (default).
##' @param report character; whether values should be reported as percentage missing (\code{report = "pct"}) or as number of missing values (\code{report = "n"})
##' @param reverse Boolean; if \code{TRUE}, will show percentage present instead of percentage missing (only applies if \code{report = "pct"})
##'
Expand All @@ -21,24 +21,25 @@
##' \dontrun{quick_audit("000401", 2:10, by = "month", report = "n")}
##'

quick_audit <- function(station, variables, by = "year", report = "pct", reverse = FALSE) {

quick_audit <- function(station, variables, by = NULL, report = "pct", reverse = FALSE) {
if (inherits(station, "tbl_df")) {
dat <- station
} else {
if (inherits(station, "character")) {
dat <- download_data_sql(station)
dat <- download_data(station)
} else {
stop("I can't figure out what data you've given me.")
}
}

if (by != "month" && by != "year") {
warning("By was neither \"month\" nor \"year\". Defaulting to year.")

if (!is.null(by) && (by != "month" && by != "year")) {
warning("By was neither \"month\" nor \"year\". Defaulting to overall total.")
by <- NULL
}

if (missing(variables)) {
variables <- 2:ncol(dat)
variables <- min(which(!(names(dat) %in% c("StationID", "Fecha")))):ncol(dat)
} else {
if (inherits(variables, "character")) {
for (var in seq_along(variables)) {
Expand All @@ -47,61 +48,72 @@ quick_audit <- function(station, variables, by = "year", report = "pct", reverse
}
variables <- as.numeric(variables)
}

if (reverse) {
ctl = 1
metric = "present"
} else {
ctl = 0
metric = "NA"
}

years <- min(format(dat$Fecha, format = "%Y")):max(format(dat$Fecha, format = "%Y"))

if (by == "month") {
months <- min(format(dat$Fecha, format = "%m")):max(format(dat$Fecha, format = "%m"))
yearmons <- apply(expand.grid(sprintf("%02d", months), years), 1, function(x) paste(x[2], x[1], sep = "-"))
out <- tibble(`Year-month` = yearmons)
timestep <- yearmons

if (is.null(by)) {
timestep <- 1
out <- tibble(Report = "Total")
} else {
out <- tibble(Year = years)
timestep <- years
if (by == "month") {
months <- min(format(dat$Fecha, format = "%m")):max(format(dat$Fecha, format = "%m"))
yearmons <- apply(expand.grid(sprintf("%02d", months), years), 1, function(x) paste(x[2], x[1], sep = "-"))
out <- tibble(`Year-month` = yearmons)
timestep <- yearmons
} else {
out <- tibble(Year = years)
timestep <- years
}
}

for (var in variables) {
integrity <- missingvec <- consecNAvec <- NULL
for (t in timestep) {
colname <- colnames(dat)[var]

for (var in variables) {
integrity <- missingvec <- consecNAvec <- NULL
for (t in timestep) {
colname <- colnames(dat)[var]
if (is.null(by)) {
timely <- dat
} else {
if (by == "month") {
timely <- dat[format(dat$Fecha, format = "%Y-%m") == t,]
} else {
timely <- dat[format(dat$Fecha, format = "%Y") == t,]
}
obs <- nrow(timely)
missing <- sum(is.na(timely[[var]]))
if (report == "pct") {
integrity <- c(integrity, as.numeric(abs(ctl-(missing/obs))*100))
} else {
missingvec <- c(missingvec, as.integer(missing))
consecNA <- rle(is.na(timely[[var]]))
consecNA <- max(consecNA$lengths[consecNA$values], 0)
consecNAvec <- c(consecNAvec, as.integer(consecNA))
}
}
obs <- nrow(timely)
missing <- sum(is.na(timely[[var]]))
if (report == "pct") {
out <- add_column(out, integ = integrity)
names(out)[which(names(out) == "integ")] <- paste(colname, "pct", metric)
integrity <- c(integrity, as.numeric(abs(ctl-(missing/obs))*100))
} else {
out <- add_column(out, consec = consecNAvec, rand = missingvec)
names(out)[which(names(out) == "consec")] <- paste(colname, "consec NA")
names(out)[which(names(out) == "rand")] <- paste(colname, "tot NA")
missingvec <- c(missingvec, as.integer(missing))
consecNA <- rle(is.na(timely[[var]]))
consecNA <- max(consecNA$lengths[consecNA$values], 0)
consecNAvec <- c(consecNAvec, as.integer(consecNA))
}

}
if (by == "month") {
out$`Year-month` <- as.yearmon(out$`Year-month`)
out <- add_column(out, Year = format(out$`Year-month`, format = "%Y"), Month = format(out$`Year-month`, format = "%m"), .before = 1)
if (report == "pct") {
out <- add_column(out, integ = integrity)
names(out)[which(names(out) == "integ")] <- paste(colname, "pct", metric)
} else {
out <- add_column(out, consec = consecNAvec, rand = missingvec)
names(out)[which(names(out) == "consec")] <- paste(colname, "consec NA")
names(out)[which(names(out) == "rand")] <- paste(colname, "tot NA")
}


}
if (is.null(by)) {
out <- out[,-1]
} else if (by == "month") {
out$`Year-month` <- as.yearmon(out$`Year-month`)
out <- add_column(out, Year = format(out$`Year-month`, format = "%Y"), Month = format(out$`Year-month`, format = "%m"), .before = 1)
}

return(out)
}
11 changes: 4 additions & 7 deletions README.Rmd
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
---
title: "senamhiR: A collection of functions to obtain Peruvian climate data in R"
output:
html_document:
keep_md: yes
github_document
---

```{r setup, include=FALSE}
Expand All @@ -11,8 +10,6 @@ library(senamhiR)
library(dplyr)
```

[![build status](https://gitlab.com/ConorIA/senamhiR/badges/master/build.svg)](https://gitlab.com/ConorIA/senamhiR/commits/master) [![Build status](https://ci.appveyor.com/api/projects/status/60kbu1b7wkf7akqn?svg=true)](https://ci.appveyor.com/project/ConorIA/senamhir-bxb45) [![codecov](https://codecov.io/gl/ConorIA/senamhiR/branch/master/graph/badge.svg)](https://codecov.io/gl/ConorIA/senamhiR)

The package provides an automated solution for the acquisition of archived Peruvian climate and hydrology data directly within R. The data was compiled from the Senamhi website, and contains all of the data that was available as of April 10, 2018. This data was originally converted from HTML, and is stored in a MySQL database in tibble format.

It is important to note that the info on the Senamhi website has not undergone quality control, however, this package includes a helper function to perform the most common quality control operations for the temperature variables. More functions will be added in the future.
Expand Down Expand Up @@ -103,13 +100,13 @@ There are two functions included to perform some basic quality control.

### `quick_audit()`

The `quick_audit()` function will return a tibble listing the percentage or number of missing values for a station. For instance, the following command will return the percentage of missing values in our 30-year Requena dataset:
The `quick_audit()` function will return a tibble listing the percentage or number of missing values for a station. For instance, the following command will return the percentage of missing values in our 30-year Requena data set:

```{r}
quick_audit(requ, c("Tmax", "Tmin"))
```

Use `report = "n"` to show the _number_ of missing values. Use `by = "month"` to show missing data by month instead of year. For instance, the number of days for which Mean Temperature was missing at Tocache in 1980:
Use `report = "n"` to show the _number_ of missing values. Use `by = "month"` or `by = "year"` to show missing data by month or year. For instance, the number of days for which Mean Temperature was missing at Tocache in 1980:

```{r}
toca <- senamhiR("000463", year = 1980)
Expand Down Expand Up @@ -172,7 +169,7 @@ If that seems like a lot of work, just think about how much work it would have b

## Senamhi terms of use

Senamhi's terms of use are [here](http://senamhi.gob.pe/?p=terminos_condiciones), but as of writing that link was redirecting to the Senamhi home page. An archived version is availale [here](https://web.archive.org/web/20170822092538/http://senamhi.gob.pe/?p=terminos_condiciones). The terms allow for the free and public access to information on the Senamhi website, in both for-profit and non-profit applications. However, Senamhi stipulates that any use of the data must be accompanied by a disclaimer that Senamhi is the proprietor of the information. The following text is recommended (official text in Spanish):
Senamhi's terms of use are [here](http://senamhi.gob.pe/?p=terminos_condiciones), but as of writing that link was redirecting to the Senamhi home page. An archived version is available [here](https://web.archive.org/web/20170822092538/http://senamhi.gob.pe/?p=terminos_condiciones). The terms allow for the free and public access to information on the Senamhi website, in both for-profit and non-profit applications. However, Senamhi stipulates that any use of the data must be accompanied by a disclaimer that Senamhi is the proprietor of the information. The following text is recommended (official text in Spanish):

- **Official Spanish:** _Información recopilada y trabajada por el Servicio Nacional de Meteorología e Hidrología del Perú. El uso que se le da a esta información es de mi (nuestra) entera responsabilidad._
- **English translation:** This information was compiled and maintained by Peru's National Meteorology and Hydrology Service (Senamhi). The use of this data is of my (our) sole responsibility.
Expand Down
Loading

0 comments on commit 159f57d

Please sign in to comment.