-
Notifications
You must be signed in to change notification settings - Fork 8
/
majors_538_md.py
245 lines (199 loc) · 8.33 KB
/
majors_538_md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""
FiveThirtyEight College Major Data
Data pulled from American Community Survey 2010-2012 as used in this article:
'http://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/'
Written by Matt Davis, NYU, February 2015
Created in Python 3.4
"""
# PRELIMINARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
# IMPORTING DATA
# All students
url_all = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/all-ages.csv'
df_all = pd.read_csv(url_all)
# Recent graduates, a subset of all students consisting of <28 year olds with an undergraduate degree
url_recent = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv'
df_recent = pd.read_csv(url_recent)
# Graduate students, a subset of all students consisting of >25 year olds enrolled in graduate school
url_grad = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/grad-students.csv'
df_grad = pd.read_csv(url_grad)
# Recent STEM students, a subset of recent graduates
url_stem = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/women-stem.csv'
df_stem = pd.read_csv(url_stem)
#FUNCTIONS
"""
Parameters
----------
df : the dataframe being considered. Choose from:
df_recent: sample <28 years old with a bachelor's degree
df_grad: sample >25 years old enrolled in graduate school
df_all: all subjects contained in the two above
df_stem: sample <28 years old majoring in a STEM subject
category : the measure being used
Choose one out of the following, wrapped in single quotation marks
For df_recent
Rank : Rank by median earnings relative to other members in their dataframe
Total: Total number of people with major
Sample_size Sample size (unweighted) of full-time, year-round ONLY (used for earnings)
Men Male graduates
Women Female graduates
ShareWomen Women as share of total
Employed Number employed (ESR == 1 or 2)
Full_time Employed 35 hours or more
Part_time Employed less than 35 hours
Full_time_year_round Employed at least 50 weeks (WKW == 1) and at least 35 hours (WKHP >= 35)
Unemployed Number unemployed (ESR == 3)
Unemployment_rate Unemployed / (Unemployed + Employed)
Median Median earnings of full-time, year-round workers
P25th 25th percentile of earnigns
P75th 75th percentile of earnings
College_jobs Number with job requiring a college degree
Non_college_jobs Number with job not requiring a college degree
Low_wage_jobs Number in low-wage service jobs
For df_grad
Grad_total
Grad_employed
sample_size
Grad_employed
Grad_full_time_year_round
Grad_unemployed
Grad_unemployment_rate
Grad_median
Grad_P25
Grad_P75
Nongrad_total
Nongrad_employed
Nongrad_full_time_year_round
Nongrad_unemployed
Nongrad_unemployment_rate
Nongrad_median
Nongrad_P25
Nongrad_P75
Grad_share
Grad_premium
For df_stem
Total
Men
Women
ShareWomen
Median
For df_all
Total
Employed
Employed_full_time_year_round
Unemployed
Unemployment_rate
Median
P25th
P75th
majcat: Choose one of the following major categories:
Agriculture & Natural Resources
Biology & Life Science
Engineering
Humanities & Liberal Arts
Communications & Journalism
Computers & Mathematics
Industrial Arts & Consumer Services
Education
Law & Public Policy
Health
Interdisciplinary
Physical Sciences
Psychology & Social Work
Arts
Business
"""
def sort(df, category, order, n):
"""
Ranks the top-n or bottom-n majors by a given category
Parameters
----------
order: ascending or descending
0 means descending by chosen category
1 means ascending by chosen category
2 means descending by median salary
n: truncates the number of majors in the list to n
n = 0 gives an untruncated list (i.e. all 173 majors)
"""
catlist = list(zip(df.Major, df[category]))
if order == 0:
catlist = sorted(catlist, key=itemgetter(1), reverse=True)
if order == 1:
catlist = sorted(catlist, key=itemgetter(1))
if order == 2:
catlist = catlist
if n > 0:
catlist = catlist[:n]
return pd.DataFrame(catlist)
def cutoff(df, category, cut, direction, order, n):
"""
Retrieve a list of majors that obey a given cutoff criterion customizable by category and order
Parameters
----------
cut : the numerical cutoff point within the category
direction: filtering to include only those majors below, above, or exactly the cutoff point
'above' or 'below' or 'exact'
order: ascending or descending
0 means descending by chosen category
1 means ascending by chosen category
2 means descending by median salary
n: truncates the list to the top (or bottom) n majors
n = 0 gives an untruncated list (i.e. all 173 majors)
"""
if direction == 'below':
filt = list(zip(df[df[category] <= cut].Major, df[df[category] <= cut][category]))
if direction == 'above':
filt = list(zip(df[df[category] >= cut].Major, df[df[category] >= cut][category]))
if direction == 'exact':
filt = list(zip(df[df[category] == cut].Major, df[df[category] == cut][category]))
if order == 0:
filt = sorted(filt, key=itemgetter(1), reverse=True)
if order == 1:
filt = sorted(filt, key=itemgetter(1))
if n > 0:
filt = filt[:n]
return pd.DataFrame(filt)
def stackedbars(df, majcat, n):
"""
Produces a series of n bar graphs depicting interquartile salary between majors within a given major category
Due to length of majors, major codes were used as a proxy on the x axis. These can be easily cross-referenced with the appropriate data frame.
Parameters
----------
majcat: Enter a string representing the major category of interest
n: truncates the majors of interest to the top n majors within the major category
n = 0 gives an untruncated list
"""
N = len(df[df.Major_category == majcat])
width = 0.35
newdata = df[df.Major_category == majcat]
if n >= 1:
newdata = newdata[:n]
N = n
Q1s = np.array(newdata.P25th.values.T) # vector of first quartile values
Meds = np.array(newdata.Median.values.T) # vector of median values
Q3s = np.array(newdata.P75th.values.T) # vector of third quartile values
p1 = plt.bar(np.arange(N), Q1s, width, color = 'r')
p2 = plt.bar(np.arange(N), Meds-Q1s, width, color = 'y', bottom = Q1s)
p3 = plt.bar(np.arange(N), Q3s-Meds, width, color = 'g', bottom = Meds)
if n == 0:
Q1s = np.array(newdata.P25th.values.T) # vector of first quartile values
Meds = np.array(newdata.Median.values.T) # vector of median values
Q3s = np.array(newdata.P75th.values.T) # vector of third quartile values
p1 = plt.bar(np.arange(N), Q1s, width, color = 'r')
p2 = plt.bar(np.arange(N), Meds-Q1s, width, color = 'y', bottom = Q1s)
p3 = plt.bar(np.arange(N), Q3s-Meds, width, color = 'g', bottom = Meds)
plt.ylabel('Salary')
plt.xlabel('Major code')
plt.title('Interquartile salary range of graduates in ' + str(majcat))
plt.xticks(np.arange(N)+width/2., newdata.Major_code.values.T)
plt.legend((p1[0], p2[0], p3[0]),('First quartile', 'Median', 'Third quartile'), loc=3)
#Examples
# The graduate-school majors with the five lowest unemployment
sort(df_grad, 'Grad_unemployment_rate', 1, 5)
# The undergradate majors with fewer than 500 women enrolled sorted in ascending order by number of women enrolled
cutoff(df_recent, 'Women', 500, 'below', 1, 0)
# Interquartile bargraphs of the five majors within business with the highest median salaries for recent graduates
stackedbars(df_recent, 'Business', 5)