-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathGeocode.R
127 lines (80 loc) · 4.37 KB
/
Geocode.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# This script formats the downloaded csv from Airtable and formats it for geocoding. Steps are commented out below.
#load ggmap
library(ggmap)
library(tidyverse)
# Read in CSV
#origAddress <- read.csv("MGG-App/NGSData/1965.csv", header = TRUE, sep =",", stringsAsFactors = FALSE)
########READ IN DATA AND PREP FOR GEOCODING########
origAddress <- read.csv("2-OrigDataforGeocoding/originaldata-2001.csv", header = TRUE)
origAddress <- origAddress %>%
mutate_if(is.character, trimws)
# paste together the street address, city and state in order to ensure we use full addresses for geocoding. Will minimize mistakes caused by common streetnames.
origAddress$full.address <- paste(origAddress$streetaddress, ", ", origAddress$city, ", ", origAddress$state)
##########GEOCODE DATA############################
# Register the google api code for the georeferencing service.
register_google(key = Sys.getenv("MGG_GOOGLE_KEY"))
#register_google(key = Sys.getenv("GOOGLEGEOCODE_API_KEY"))
# Loop through the addresses to get the latitude and longitude of each address and add it to the origAddress data frame in new columns lat and lon
for(i in 1:nrow(origAddress)) {
# Print("Working...")
result <- tryCatch(geocode(origAddress$full.address[i], output = "latlona", source = "google"), warning = function(w) data.frame(lon = NA, lat = NA, address = NA))
origAddress$lon[i] <- as.numeric(result[1])
origAddress$lat[i] <- as.numeric(result[2])
origAddress$geoAddress[i] <- as.character(result[3])
}
# find NA values in the lat and lon fields
#na_indices <- which(is.na(origAddress$lat) | is.na(origAddress$lon))
# Subset the data table to get rows with NA values in the lat field
#rows_with_na <- origAddress[na_indices, ]
##########MANIPULATE UNCLEAR DATA#################
#Make unclear address match the geocoded dataset
uncleardata <- read.csv(file = "1-UnclearData/uncleardata-2001.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
#trim white space
uncleardata <- uncleardata %>%
mutate_if(is.character, trimws)
#create full address column
uncleardata$full.address <- paste(uncleardata$streetaddress, ", ", uncleardata$city, ", ", uncleardata$state, sep="")
#make sure both dfs have same columns
origAddress['status'] = 'Geocoded'
uncleardata['geoAddress'] = 'unclear_coded_by_hand'
#check col names
colnames(origAddress)
colnames(uncleardata)
colnames(origAddress)[1] = "ID"
colnames(uncleardata)[2] = "title"
colnames(origAddress)[10] = "uncleartype"
colnames(uncleardata)[10] = "unclear_address"
colnames(uncleardata)[17] = "Last.Modified"
uncleardata <- uncleardata[,-16] #remove "dateadded" column
#origAddress <- origAddress[,-11] #remove "notes column"
alldata <- rbind(origAddress, uncleardata)
alldata$lat <- as.numeric(alldata$lat)
alldata$lon <- as.numeric(alldata$lon)
########MERGE THE TWO DATASETS########
# Write a CSV file containing origAddress to the working directory
write.csv(alldata, "3-GeocodedDatasets/data-2001.csv", row.names=FALSE)
write.csv(alldata, "4-FullVerifiedDatasets/data-2001.csv", row.names=FALSE)
#COMMIT HERE LUC
#NO NEED TO CONTINUE, VERIFY IN KEPPLER
maindataset <- read.csv("data.csv", header = TRUE)
newdata <- read.csv("4-FullVerifiedDatasets/data-1989.csv", header = TRUE)
colnames(newdata)
newdata <- newdata %>% select(-unclearaddress, -uncleartype, -lastmodified, -full.address, -geoAddress)
mergeddata <- rbind(mergeddata, newdata)
unique(mergeddata$status)
mergeddata <- mergeddata %>% mutate(Status_Revised = fct_collapse(status,
"Google Verified Location" = "Geocoded",
"Verified Location" = c("Found", "For Review"),
"Location could not be verified. General city or location coordinates used." = c("General Coordinates Used", "Genderal Coordinates Used")))
unique(mergeddata$Status_Revised)
unique(mergeddata$state)
mergeddata <- mergeddata %>% mutate(state = fct_collapse(state, "DC" = c("D.C.", "District of Columbia")))
mergeddata <- mergeddata %>% select(-"status")
mergeddata <- dplyr::rename(mergeddata, status = Status_Revised)
write.csv(mergeddata, "data.csv")
saveRDS(mergeddata, "data.rds")
#create a list of amenities
amenities <- origAddress %>% select(amenityfeatures)
#unclear 2917 9%
#verified 7218 21%
#G verified 24029 70%