-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
512 lines (512 loc) · 26.4 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
count(dummy$year)
!is.na(dummy[-1,])
!is.na(dummy[,-1])
dummy<- DDcomplete %>%
group_by(dummy$year) %>%
summarize(countnotna = sum(!is.na(x)))
dummy %>%
group_by(year) %>%
summarize_each(funs(sum(!is.na(x))))
dummy <- all_data
dummy %>%
group_by(year) %>%
summarize(count = (sum(!is.na(x))))
dummy <- all_data
dummy %>%
group_by(year)
typeof("hello")
typeof(NA)
dummy <- all_data
dummy <- sapply(dummy, function(x) if(typeof(x) == "character") x = 1)
dummy
dummy <- all_data
dummy <- lapply(dummy, function(x) if(typeof(x) == "character") x = 1)
dummy
dummy <- all_data
dummy <- if(typeof(dummy) == "character") x = 1
dummy
dummy <- ifelse(typeof(dummy) == "character", 1,)
dummy <- ifelse(typeof(dummy) == "character", 1, dummy)
dummy <- all_data
dummy <- ifelse(typeof(dummy) == "character", 1, dummy)
dummy
dummy <- all_data
setDT(dummy)[, lapply(sum(!is.na)), by = year]
setDT(dummy)[, lapply(sum(!is.na())), by = year]
setDT(dummy)[, lapply(colSums(!is.na())), by = year]
setDT(dummy)[, lapply(colSums(!is.na), by = year]
dummy <- all_data
dummy %>%
group_by(year) %>%
summarise_each(funs(sum(is.na(.))))
dummy
dummy <- all_data
nacount <- dummy %>%
group_by(year) %>%
summarise_each(funs(sum(is.na(.))))
nacount
completeness.by.year <- 1-nacount[,-c(1)]/yearcounts
completeness.by.year
completeness.by.year$locale
completeness.by.year$degree_urbanization
all_data[year == "1996_97"]
all_data[year == "1996_97", "degree_urbanization"]
sum(!is.na(all_data[year == "1996_97", "degree_urbanization"]))
completeness.by.year
bool.complete.by.year <- completeness.by.year > .5
bad_cols <- names(which(colSums(bool.complete.by.year)==0))
bad_cols
good_cols <- names(all_data)[!(names(all_data) %in% bad_cols)]
cleaned_data <- all_data[, good_cols]
good_cols
all_data
nrow(all_data)
ncol(all_data)
str(good_cols)
cleaned_data <- all_data[good_cols]
str(all_data)
cleaned_data <- as.data.frame(all_data)[,good_cols]
ncol(cleaned_data)
completeness.by.year$degree_urbanization
setwd('/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/')
saveRDS(cleaned_data, file="Clean_CollegeScorecard_Rurality.rds")
cleaned_data <- readRDS("Clean_CollegeScorecard_Rurality.rds")
# Tally the missing values in every column, separately for each value of `year`.
nacount <- summarise_each(group_by(all_data, year), funs(sum(is.na(.))))

# Turn the tallies into the share of non-missing values per column and year.
# The leading `year` column is dropped before dividing; `yearcounts` comes from
# earlier in the session (presumably the number of rows per year - confirm).
completeness.by.year <- 1 - nacount[, -1] / yearcounts
bool.complete.by.year <- completeness.by.year > 0.5

# A column is discarded when it does not exceed 50% completeness in any year.
bad_cols <- names(which(colSums(bool.complete.by.year) == 0))
good_cols <- names(all_data)[is.na(match(names(all_data), bad_cols))]
cleaned_data <- as.data.frame(all_data)[, good_cols]
head(cleaned_data)

# Persist the reduced data set, then read it back and print it to confirm the
# round trip.
setwd("/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/")
saveRDS(cleaned_data, file = "Clean_CollegeScorecard_Rurality.rds")
cleaned_data <- readRDS("Clean_CollegeScorecard_Rurality.rds")
cleaned_data
# Number of rows recorded for each value of `year`, as a plain integer vector.
yearcounts <- as.vector(table(cleaned_data$year))

# Missing-value tally per column and year, taken from the full data set.
nacount <- summarise_each(group_by(all_data, year), funs(sum(is.na(.))))

# Share of non-missing values per column and year, with the year label kept as
# the first column so each row can be identified after saving.
completeness.by.year <- cbind(
  year = nacount$year,
  1 - nacount[, -1] / yearcounts,
  stringsAsFactors = FALSE
)

setwd("/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/")
saveRDS(completeness.by.year, file = "Completeness_Clean_CollegeScorecard_Rurality.rds")
completeness.by.year
completeness.by.year$degrees_awarded.predominant_recoded
# Load the cleaned data set and take a first look at how the predominant
# degree type relates to the rurality code.
collegeData <- readRDS("Clean_CollegeScorecard_Rurality.rds")
collegeData$degrees_awarded.predominant_recoded

# Row filters must compare the *column* with the value. The earlier attempt
# used "year" == "2014_15", which compares two string literals, is always
# FALSE, and therefore silently selected zero rows.
collegeData[collegeData$year == "2014_15", "degrees_awarded.predominant_recoded"]

# data(collegeData) was removed: collegeData is an object read from disk, not
# a data set shipped with a package, so data() could only emit a warning.

# attach() is retained solely because later interactive commands in this
# session refer to bare column names such as `year` and `rurality`.
attach(collegeData)
year

# Pass the data frame explicitly so the model does not depend on attach().
lm.predomdegree.rurality <- lm(
  degrees_awarded.predominant_recoded ~ rurality,
  data = collegeData
)
plot(lm.predomdegree.rurality)
lm.predomdegree.rurality

# with() keeps the axis labels as plain column names.
with(collegeData, plot(rurality, degrees_awarded.predominant_recoded))
# The plotting package is named "ggplot2"; no package called "ggplot" is
# available, so library(ggplot) and install.packages("ggplot") can only fail.
# Install ggplot2 only when it is missing, then attach it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# Bar charts of the mean of selected columns per rurality code, restricted to
# the rows whose `year` is "2014_15".
#
# Fixes over the earlier attempts:
#   * the plotting function is ggplot(); ggplot2 is the package name only;
#   * a row filter on a data frame needs the trailing comma (df[rows, ]);
#     without it the logical vector is treated as a column selector.
data_2014_15 <- collegeData[collegeData$year == "2014_15", ]

ggplot(
  data_2014_15,
  aes(x = factor(rurality), y = degrees_awarded.predominant_recoded)
) +
  stat_summary(fun.y = "mean", geom = "bar")

# Check the storage type of the plotted column.
str(collegeData$degrees_awarded.predominant_recoded)

ggplot(data_2014_15, aes(x = factor(rurality), y = degrees_awarded.highest)) +
  stat_summary(fun.y = "mean", geom = "bar")

ggplot(data_2014_15, aes(x = factor(rurality), y = program_percentage.history)) +
  stat_summary(fun.y = "mean", geom = "bar")
# Build one bar chart per column in positions 11-57 of collegeData, each
# showing the column mean per rurality code for the rows whose `year` is
# "2014_15", and write all charts to plots.pdf (one page per chart).
#
# Fixes over the earlier attempts:
#   * `for (i in length(x))` runs exactly once (with i == length(x)); every
#     column is visited here via lapply();
#   * `plot_list[i] = p` stores only the first component of the ggplot object;
#     each plot is now kept whole as one list element;
#   * aes_string() expects strings for every aesthetic, so the factor() call
#     is quoted instead of being evaluated against attached columns;
#   * the row filter keeps its trailing comma (df[rows, ]);
#   * pdf() was opened several times but closed once; the device is now
#     opened once and always closed, even if printing a plot fails.
var_list <- names(collegeData)[11:57]
plot_data <- collegeData[collegeData$year == "2014_15", ]

plot_list <- lapply(var_list, function(var_name) {
  # Backticks keep column names with unusual characters parseable.
  ggplot(
    plot_data,
    aes_string(x = "factor(rurality)", y = sprintf("`%s`", var_name))
  ) +
    stat_summary(fun.y = "mean", geom = "bar")
})
names(plot_list) <- var_list

pdf("plots.pdf")
tryCatch(
  for (plot_obj in plot_list) {
    print(plot_obj)
  },
  finally = dev.off()
)

# Sanity check: one plot per selected column.
length(plot_list)
plot_list = list()
for (i in length(var_list)) {
p = ggplot(collegeData[collegeData$year == "2014_15",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
plot_list[i] = p
}
var_list = names(collegeData)[15:52]
plot_list = list()
for (i in length(var_list)) {
p = ggplot(collegeData[collegeData$year == "2014_15",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
plot_list[i] = p
}
rowSums(completeness.by.year[,c(15:52)] != 0)
var_list = names(collegeData)[15:52]
plot_list = list()
for (i in length(var_list)) {
p = ggplot(collegeData[collegeData$year == "2014_16",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
plot_list[i] = p
}
for (i in length(var_list)) {
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
plot_list[i] = p
}
for (i in c(15:52)) {
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
plot_list[i] = p
}
plot_list
for (i in c(15:52)) {
print(plot_list[[i]])
}
p[[1]]
p[1]
plot_list[1]
plot_list[[15]]
plot_list[15]
for (i in 1) {
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
test[i] = p
}
test = list()
for (i in 1) {
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[i])) +
stat_summary(fun.y="mean", geom = "bar")
test[i] = p
}
test = list()
for (i in 1) {
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegedata[,var_list[i]])) +
stat_summary(fun.y="mean", geom = "bar")
test[i] = p
}
test
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegedata[,var_list[i]])) +
stat_summary(fun.y="mean", geom = "bar")
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegedata[,var_list[1]])) +
stat_summary(fun.y="mean", geom = "bar")
p
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegeData[,var_list[1]])) +
stat_summary(fun.y="mean", geom = "bar")
p
var_list[1]
as.name(var_list[1])
collegeData$as.name(var_list[1])
var_list = as.name(names(collegeData)[15:52])
var_list[1]
as.name(names(collegeData))
as.name(names(collegeData[,15:25]))
names(collegeData[1:5])
lapply(names(collegeData[1:5]), function(x) as.name(x))
sapply(names(collegeData[1:5]), function(x) as.name(x))
var_list = lapply(names(collegeData[15:52]), function(x) as.name(x))
var_list
var_list[38]
var_list[[38]]
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=(var_list[[1]]))) +
stat_summary(fun.y="mean", geom = "bar")
p
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[[1]])) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[[1]])) +
stat_summary(fun.y="mean", geom = "bar")
p
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) +
stat_summary(fun.y="mean", geom = "bar")
p
var_list = names(collegeData[15:52])
var_list
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[38])) +
stat_summary(fun.y="mean", geom = "bar")
p
var_list[38]
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[1])) +
stat_summary(fun.y="mean", geom = "bar")
p
(p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegeData$var_list[1])) + stat_summary(fun.y="mean", geom = "bar"))
(p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegeData[var_list[1]]) + stat_summary(fun.y="mean", geom = "bar"))
test[i] = p
(p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=collegeData[var_list[1]])) + stat_summary(fun.y="mean", geom = "bar"))
var_list[1]
as.name[var_list[1]]
as.name(var_list[1])
(p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=as.name(var_list[1]))) + stat_summary(fun.y="mean", geom = "bar"))
var_list = names(collegeData[15:52])
as.name(var_list[1])
(p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=names(collegeData)[15])) + stat_summary(fun.y="mean", geom = "bar"))
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p
# Reference example on the built-in iris data: one scatter plot for every pair
# of the first three columns, coloured by Species, written to plots.pdf.
var_list <- combn(names(iris)[1:3], 2, simplify = FALSE)

# Pre-size the list and iterate over the pairs that actually exist rather than
# a hard-coded 1:3.
plot_list <- vector("list", length(var_list))
for (i in seq_along(var_list)) {
  p <- ggplot(iris, aes_string(x = var_list[[i]][1], y = var_list[[i]][2])) +
    geom_point(size = 3, aes(colour = Species))
  plot_list[[i]] <- p
}

pdf("plots.pdf")
for (i in seq_along(plot_list)) {
  print(plot_list[[i]])
}
# Close the PDF device. Without this, the file is never finalised and every
# later plot is silently drawn into plots.pdf instead of appearing on screen.
invisible(dev.off())

plot_list[[1]]
var_list = combn(names(collegeData)[15:18], 2, simplify=FALSE)
for (i in 1:3) {
p = ggplot(collegeData[c(1:50),], aes_string(x=var_list[[i]][1], y=var_list[[i]][2])) +
geom_point(size=3, aes(colour=Species))
plot_list[[i]] = p
}
plot_list[[1]]
for (i in 1:3) {
p = ggplot(collegeData[c(1:50),], aes_string(x=var_list[[i]][1], y=var_list[[i]][2])) +
geom_point(size=3)
plot_list[[i]] = p
}
plot_list[[1]]
var_list = combn(names(collegeData)[15:18], simplify=FALSE)
var_list = combn(names(collegeData)[15:18], 1, simplify=FALSE)
var_list = combn(names(collegeData)[15:18], 1, simplify=FALSE)
plot_list = list()
for (i in 1:3) {
p = ggplot(collegeData[c(1:50),], aes_string(x=rurality, y=var_list[[i]][1])) +
geom_point(size=3)
plot_list[[i]] = p
}
plot_list[[1]]
plot_list = list()
for (i in 1:3) {
p = ggplot(collegeData[c(1:50),], aes_string(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar") +
geom_point(size=3)
plot_list[[i]] = p
}
plot_list[[1]]
plot_list = list()
for (i in 1:3) {
p = ggplot(collegeData[c(1:4),], aes_string(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar") +
geom_point(size=3)
plot_list[[i]] = p
}
plot_list[[1]]
var_list[[1]][1]
for (i in 1:3) {
p = ggplot(collegeData[c(1:4),], aes_string(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
plot_list[[i]] = p
}
plot_list = list()
for (i in 1:3) {
p = ggplot(collegeData[c(1:4),], aes_string(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
plot_list[[i]] = p
}
plot_list[[1]]
plot_list = list()
for (i in 1:3) {
p = ggplot(collegeData[c(1:4),], aes(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
plot_list[[i]] = p
}
plot_list[[1]]
for (i in 1:3) {
ggplot(collegeData[c(1:4),], aes(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
}
ggplot(collegeData[c(1:4),], aes(x=rurality, y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=var_list[[i]][1])) + stat_summary(fun.y="mean", geom = "bar")
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
p
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
q = ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
q
dev.off()
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.ethnic_cultural_gender)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=program_percentage.mechanic_repair_technology)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality)[1:3], y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality %in% c(1:3)), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(subset(collegeData, rurality %in% c(1:3))), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16", college$rurality %in% c(1:3)], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16", collegeData$rurality %in% c(1:3)], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16", collegeData$rurality == 1:3], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16", collegeData$rurality == 1], aes(x=factor(rurality), y=program_percentage.agriculture)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=faculty_salary)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(locale), y=faculty_salary)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_25_older)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
completeness.by.year$share_firstgeneration
collegeData[collegeData$year == "2015_16", "share_firstgeneration"]
mean(collegeData[collegeData$year == "2015_16", "share_firstgeneration"])
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(subset(collegeData[collegeData$year == "2015_16",], share_firstgeneration != "PrivacySuppressed"), aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
subset(collegeData[collegeData$year == "2015_16",], share_firstgeneration != "PrivacySuppressed")
subset(collegeData[collegeData$year == "2015_16",], share_firstgeneration != "PrivacySuppressed")[share_firstgeneration]
subset(collegeData[collegeData$year == "2015_16",], share_firstgeneration != "PrivacySuppressed")[, share_firstgeneration]
subset(collegeData[collegeData$year == "2015_16"], share_firstgeneration != "PrivacySuppressed")[, share_firstgeneration]
subset(collegeData[collegeData$year == "2015_16", "share_firstgeneration"], share_firstgeneration != "PrivacySuppressed")
# Rebuild the cleaned data set from CollegeScorecard_Rurality.rds, treating
# "PrivacySuppressed" cells as missing, and save both the cleaned data and the
# per-year completeness table.
#
# Changes over the original sequence:
#   * the working directory is set once, *before* the first read (the file
#     was previously read before setwd() and then read again);
#   * the per-year NA tally and year counts are computed once instead of
#     being recomputed identically for the second output.
setwd("/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/")
all_data <- readRDS("CollegeScorecard_Rurality.rds")

# Suppressed values carry no usable information; count them as missing.
all_data[all_data == "PrivacySuppressed"] <- NA
library(doBy)

# Rows per year and missing values per column and year.
# NOTE(review): table() drops NA years while group_by() keeps them as a group;
# the division below assumes `year` has no missing values - confirm.
yearcounts <- c(unname(table(all_data$year)))
nacount <- all_data %>%
  group_by(year) %>%
  summarise_each(funs(sum(is.na(.))))

# Share of non-missing values per column and year (first column is `year`).
completeness.by.year <- 1 - nacount[, -c(1)] / yearcounts
bool.complete.by.year <- completeness.by.year > .5

# Drop columns that do not exceed 50% completeness in any year.
bad_cols <- names(which(colSums(bool.complete.by.year) == 0))
good_cols <- names(all_data)[!(names(all_data) %in% bad_cols)]
cleaned_data <- as.data.frame(all_data)[, good_cols]
saveRDS(cleaned_data, file = "Clean_CollegeScorecard_Rurality.rds")

# Attach the year label to the completeness table before saving it.
completeness.by.year <- cbind(
  year = nacount$year,
  completeness.by.year,
  stringsAsFactors = FALSE
)
saveRDS(completeness.by.year, file = "Completeness_Clean_CollegeScorecard_Rurality.rds")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
collegeData <- readRDS("Clean_CollegeScorecard_Rurality.rds")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
collegeData$share_firstgeneration
collegeData$share_firstgeneration[year == "2015_16"]
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_25_older)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration_parents.middleschool)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_25_older)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration_parents.highschool)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=degrees_awarded.highest)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=tuition.out_of_state)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=tuition.in_state)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=attendance.academic_year)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=demographics.age_entry)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=demographics.over_23_at_entry)) + stat_summary(fun.y="mean", geom = "bar")
completeness.by.year
completeness.by.year[demographics.over_23_at_entry]
completeness.by.year$demographics.over_23_at_entry
c(completeness.by.year$year, completeness.by.year$demographics.over_23_at_entry)
completeness.by.year$year
ggplot(collegeData[collegeData$year == "2005_06",], aes(x=factor(rurality), y=demographics.over_23_at_entry)) + stat_summary(fun.y="mean", geom = "bar")
collegeData$demographics.age_entry
collegeData[year == "2005_06", demographics.age_entry]
collegeData[year == "2005_06",]$demographics.over_23_at_entry
as.numeric(collegeData[year == "2005_06",]$demographics.over_23_at_entry)
str(collegeData[year == "2005_06",]$demographics.over_23_at_entry)
str(cleaned_data)
# Convert the columns that were stored as text (they used to contain
# "PrivacySuppressed") to numeric, then save and reload the results.
#
# Changes over the original sequence:
#   * the first transform() call discarded its result and only printed the
#     whole data frame; the conversion is now performed once, in place;
#   * factors are converted through their labels, because as.numeric() on a
#     factor returns the internal level codes rather than the values;
#   * the working directory is set once instead of before every save.
numeric_cols <- c(
  "share_firstgeneration",
  "share_firstgeneration_parents.highschool",
  "share_firstgeneration_parents.middleschool",
  "share_firstgeneration_parents.somecollege",
  "demographics.age_entry",
  "demographics.over_23_at_entry",
  "demographics.first_generation"
)
cleaned_data[numeric_cols] <- lapply(cleaned_data[numeric_cols], function(values) {
  if (is.factor(values)) {
    values <- as.character(values)
  }
  as.numeric(values)
})
str(cleaned_data)

setwd("/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/")
saveRDS(cleaned_data, file = "Clean_CollegeScorecard_Rurality.rds")

# Per-year completeness table with the year label as the first column.
# NOTE(review): the NA tally is taken from all_data, not from the converted
# cleaned_data, so it covers every original column - confirm this is intended.
yearcounts <- c(unname(table(cleaned_data$year)))
nacount <- all_data %>%
  group_by(year) %>%
  summarise_each(funs(sum(is.na(.))))
completeness.by.year <- cbind(
  year = nacount$year,
  1 - nacount[, -c(1)] / yearcounts,
  stringsAsFactors = FALSE
)
saveRDS(completeness.by.year, file = "Completeness_Clean_CollegeScorecard_Rurality.rds")

# Reload both artefacts from disk so later commands work on the saved versions.
collegeData <- readRDS("Clean_CollegeScorecard_Rurality.rds")
completeness.by.year <- readRDS("Completeness_Clean_CollegeScorecard_Rurality.rds")
attach(collegeData)
collegeData
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration_parents.middleschool)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=attendance.academic_year)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2005_06",], aes(x=factor(rurality), y=demographics.over_23_at_entry)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=share_firstgeneration_parents.middleschool)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=demographics.first_generation)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=demographics.age_entry)) + stat_summary(fun.y="mean", geom = "bar")
ggplot(collegeData[collegeData$year == "2015_16",], aes(x=factor(rurality), y=title_iv.still_enrolled_by.8yrs)) + stat_summary(fun.y="mean", geom = "bar")
str(cleaned_data)
bad_cols
all_data$title_iv.depend.died_by.2yrs
subset(all_data, !is.na(title_iv.depend.died_by.2yrs))
subset(all_data, !is.na(title_iv.depend.died_by.2yrs))[,title_iv.depend.died_by.2yrs]
yearcounts <- c(unname(table(all_data$year)))
nacount <- all_data %>%
group_by(year) %>%
summarise_each(funs(sum(is.na(.))))
completeness.by.year <- 1-nacount[,-c(1)]/yearcounts
completeness.by.year$title_iv.depend.died_by.2yrs
bool.complete.by.year <- completeness.by.year > .5
bad_cols <- names(which(colSums(bool.complete.by.year)==0))
bad_cols
# Build the rurality-enriched data set from the combined yearly files:
#   1. truncate zip codes to their first five characters,
#   2. look up latitude/longitude per zip code in the `zipcode` data set,
#   3. look up the `ru2003` rurality code per zip code,
#   4. save the result, then drop columns that are mostly missing.
all_data <- readRDS("CollegeScorecard_combinedyears.rds")

# substr() is vectorised, so it is applied to the whole column at once;
# wrapping it in sapply() was slower and produced a named vector.
all_data$zip <- substr(all_data$zip, 1, 5)

# Coordinates: match each institution's zip code against the zipcode table.
data(zipcode)
nm <- c("location.lat", "location.lon")
ll <- c("latitude", "longitude")
all_data[nm] <- zipcode[match(all_data$zip, zipcode$zip), ll]

# Rurality: sheet 5 of the zip-code rurality workbook, matched on the
# cleaned zip code. Institutions without a match get NA.
zipcoderuralitydata <- read.xls(
  "http://www.psc.isr.umich.edu/dis/data/kb/downloads/t1101_ziprural.xls",
  sheet = 5,
  stringsAsFactors = FALSE
)
zipcoderuralitydata$zip <- clean.zipcodes(zipcoderuralitydata$zip)
tsn <- "rurality"
zcrdn <- "ru2003"
all_data[tsn] <- zipcoderuralitydata[match(all_data$zip, zipcoderuralitydata$zip), zcrdn]

setwd("/home/kevinisagirl/Desktop/workspace/datamunging/Project/EDA_College-Scorecard/")
saveRDS(all_data, file = "CollegeScorecard_Rurality.rds")

# Suppressed values carry no usable information; count them as missing.
all_data[all_data == "PrivacySuppressed"] <- NA
library(doBy)

# Rows per year and missing values per column and year.
yearcounts <- c(unname(table(all_data$year)))
nacount <- all_data %>%
  group_by(year) %>%
  summarise_each(funs(sum(is.na(.))))

# Share of non-missing values per column and year (first column is `year`).
completeness.by.year <- 1 - nacount[, -c(1)] / yearcounts
bool.complete.by.year <- completeness.by.year > .5

# Drop columns that do not exceed 50% completeness in any year.
bad_cols <- names(which(colSums(bool.complete.by.year) == 0))
good_cols <- names(all_data)[!(names(all_data) %in% bad_cols)]
cleaned_data <- as.data.frame(all_data)[, good_cols]
bad_cols