-
Notifications
You must be signed in to change notification settings - Fork 4
/
data_scrape.py
226 lines (186 loc) · 10.2 KB
/
data_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
def get_data():
    '''
    Loads every school's data matrix and splits out the columns used by the plots.

    Reads the ordered list of school names from 'schools.mat', loads the
    'local_info' matrix of each school from 'data/<name><position>' (position
    counts from 1 in list order) and reads the university/state table from
    'State-University.xlsx'. The second school's matrix is also written to
    'test.txt' as tab separated text.

    Each returned numpy array holds one entry per school, in the order of
    'schools.mat'; every entry is that school's column as a 1D numpy array.
    :return: (school_student_flag, school_second_major, gender, year, state)
    :type: tuple of four numpy object arrays and a pandas DataFrame
    '''
    schools = scipy.io.loadmat('schools.mat')  # mat file with the schools in order
    school_data = []
    for num, entry in enumerate(schools['schools'], 1):
        # Read the name element directly instead of slicing the printed array
        # repr: the repr is "[u'Name']" on Python 2 but "['Name']" on Python 3,
        # so a fixed [3:-2] slice cuts off the first letter on Python 3.
        fname = str(entry[0][0])
        school_data.append(scipy.io.loadmat('data/' + fname + str(num))['local_info'])
    # NOTE(review): looks like a leftover debugging dump of the second school - confirm before removing
    np.savetxt('test.txt', school_data[1], delimiter='\t')
    school_student_flag = _column_per_school(school_data, 0)  # faculty/student flag is the first column
    school_second_major = _column_per_school(school_data, 3)  # second major info is the fourth column
    year = _column_per_school(school_data, -2)  # year is the second to last column
    gender = _column_per_school(school_data, 1)  # gender info is the second column
    state = pd.read_excel('State-University.xlsx')  # state location of each university
    return school_student_flag, school_second_major, gender, year, state


def _column_per_school(tables, col):
    '''
    Collects column `col` of every matrix in `tables` into a 1D object array.

    The matrices may have different numbers of rows. Recent NumPy versions
    refuse to build an array from such a ragged list unless it is an object
    array, so the container is allocated explicitly and filled one by one.
    :param tables: list of 2D numpy arrays
    :param col: index of the column to extract
    :return: numpy object array whose elements are 1D numpy arrays
    '''
    columns = np.empty(len(tables), dtype=object)
    for position, table in enumerate(tables):
        columns[position] = table[:, col]
    return columns
def find_faculty_student_rate(flags):
    '''
    Computes the faculty/student ratio of each school and plots it against rank.

    A 2 represents a person is faculty and a 1 represents a person is a student.
    The rank of a school is its position (starting at 1) in `flags`. The plot is
    saved as 'visualizations/faculty_student_ratio_vs_rank.png' and then shown.
    A school without any faculty gets a ratio of 0; a school without any
    students gets NaN, because the ratio is undefined.
    :param flags: numpy array. Each row holds the flags of a different school
    :return: None
    '''
    assert isinstance(flags, np.ndarray)
    rates = []
    for school_flags in flags:
        tally = Counter(school_flags)  # a Counter gives 0 for a flag that never occurs
        students = tally[1]
        rates.append(float(tally[2]) / students if students else float('nan'))
    # Derive the ranks from the data so the x and y lengths always match.
    ranks = list(range(1, len(rates) + 1))
    fig1, ax1 = plt.subplots()
    plt.title('Faculty/Student Ratio vs. Rank')
    plt.scatter(ranks, rates, color='darkred', marker='o', s=16)
    plt.plot(ranks, rates, color='indianred', linestyle='--', linewidth=1)
    plt.ylabel('Faculty/Student Ratio')
    plt.xlabel('Rank of School')
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    plt.tight_layout()
    plt.savefig('visualizations/faculty_student_ratio_vs_rank.png')
    plt.show()
def universities_in_state(state):
    '''
    Draws a pie chart with one slice per state, sized by the number of rows of
    the table that belong to that state, then saves it as
    'visualizations/top100_state.png' and shows it.
    :param state: a pandas dataframe with a 'State' column
    :return: None
    '''
    assert isinstance(state, pd.DataFrame)
    per_state = state['State'].value_counts()  # Series: state name -> number of rows
    slice_names = np.array(per_state.index)  # the index holds the state names
    slice_sizes = np.array(per_state)  # the values hold the counts
    pie_options = dict(labels=slice_names, shadow=True, startangle=90,
                       rotatelabels=True, pctdistance=0.3)
    fig2, ax2 = plt.subplots()
    ax2.pie(slice_sizes, **pie_options)
    plt.tight_layout()
    plt.title('Number of universities in top 100 by State')
    plt.savefig('visualizations/top100_state.png', dpi=100)  # save the figure as png
    plt.show()
def second_major_gender(second_major, gender):
    '''
    Plots the rate of men and women taking a minor/second major per rank group.

    Schools are grouped by every 10 ranks, in the order they are given. Within a
    group the rate is the share of people of that gender whose second major
    value is not 0 (a 0 means no second major/minor). Gender 1 is plotted as
    'Men' and gender 2 as 'Women'. A group without anybody of a gender gets NaN.
    The chart is saved as 'visualizations/2ndmajor by gender and rank.png' and
    then shown.
    :param second_major: np array with one np array of second major info per school
    :param gender: np array with one np array of gender info per school
    :return: None
    '''
    assert isinstance(second_major, np.ndarray)
    assert isinstance(gender, np.ndarray)
    assert all(isinstance(x, np.ndarray) for x in second_major)
    assert all(isinstance(x, np.ndarray) for x in gender)
    group_size = 10
    n_schools = len(second_major)
    m = _second_major_rates(second_major, gender, 1, group_size)
    f = _second_major_rates(second_major, gender, 2, group_size)
    # Build the tick labels from the data; for 100 schools this gives '1-10' ... '91-100'.
    labels = ['%d-%d' % (start + 1, min(start + group_size, n_schools))
              for start in range(0, n_schools, group_size)]
    ind = np.arange(len(labels))  # the x locations for the groups
    width = 0.35  # the width of the bars
    fig3, ax3 = plt.subplots()
    ax3.bar(ind, m, width, color='navy', label='Men')  # 2 rectangles for each x tick
    ax3.bar(ind + width, f, width, color='gold', label='Women')
    ax3.set_ylabel('Rate of taking minor/second major')
    ax3.set_xlabel('Rank of School')
    ax3.set_title('Minor/2nd Major Rate by Gender and Rank')
    ax3.set_xticks(ind + width / 2)
    ax3.set_xticklabels(labels)
    ax3.legend()
    plt.savefig('visualizations/2ndmajor by gender and rank.png')  # save figure as png
    plt.show()


def _second_major_rates(second_major, gender, gender_code, group_size):
    '''
    Returns, per group of `group_size` consecutive schools, the share of people
    with the given gender code whose second major value is not 0.
    :param second_major: np array with one np array of second major info per school
    :param gender: np array with one np array of gender info per school
    :param gender_code: value in the gender arrays selecting the people to count
    :param group_size: number of consecutive schools per group
    :return: 1D numpy array of rates, NaN for a group with nobody of that gender
    '''
    rates = []
    for start in range(0, len(second_major), group_size):
        stop = start + group_size
        people = 0
        without = 0
        for majors, genders in zip(second_major[start:stop], gender[start:stop]):
            selected = majors[genders == gender_code]  # boolean mask instead of a per-person loop
            people += selected.size
            without += int((selected == 0).sum())
        # complement of the "no second major" share
        rates.append(1 - without / float(people) if people else float('nan'))
    return np.array(rates, dtype=float)
def plot_year(year):
    '''
    Creates a bar chart with the number of undergraduates per graduate, organized by every 10 ranks.

    People with year 2003 or 2004 are counted as graduates and people with year
    2005, 2006, 2007 or 2008 as undergraduates. Schools are grouped by every 10
    ranks in the order given. A group without graduates gets NaN, because the
    ratio is undefined. The chart is saved as
    'visualizations/# of undergrad and grad.png' and then shown.
    :param year: np array with one array of graduating years per school
    :return: None
    '''
    assert isinstance(year, np.ndarray)
    grad_years = (2003, 2004)  # since the data was taken in 2005, grads graduated in 2004 and 2003
    # NOTE(review): the previous comment said undergrads graduate 2005-2009, but
    # the code has always counted 2005-2008 - confirm which range is intended.
    undergrad_years = (2005, 2006, 2007, 2008)
    grads = []
    undergrads = []
    for school_years in year:
        tally = Counter(school_years)  # a Counter gives 0 for a year that never occurs
        grads.append(sum(tally[y] for y in grad_years))
        undergrads.append(sum(tally[y] for y in undergrad_years))
    group_size = 10
    n_schools = len(grads)
    ratio = []
    labels = []
    for start in range(0, n_schools, group_size):  # lump up the schools by every 10 ranks
        stop = min(start + group_size, n_schools)
        group_grads = sum(grads[start:stop])
        group_undergrads = sum(undergrads[start:stop])
        ratio.append(group_undergrads / float(group_grads) if group_grads else float('nan'))
        labels.append('%d-%d' % (start + 1, stop))  # '1-10' ... '91-100' for 100 schools
    ind = np.arange(len(ratio))  # the x locations for the groups
    width = 0.7  # the width of the bars
    fig4, ax4 = plt.subplots()  # bar chart of undergraduate/graduate for every 10 ranks
    ax4.bar(ind, ratio, width, color='darkorange')
    ax4.set_ylabel('# of undergraduates per graduate student')
    ax4.set_xlabel('Rank of School')
    ax4.set_title('Ratio of undergraduates/graduate per rank')
    ax4.set_xticks(ind)
    ax4.set_xticklabels(labels)
    plt.savefig('visualizations/# of undergrad and grad.png')
    plt.show()
def faculty_gender(student_faculty, gender):
    '''
    Creates a bar chart of the number of male and female faculty vs rank of the school.

    A person counts as faculty when the student_faculty flag is 2; gender 1 is
    plotted as 'Male' and gender 2 as 'Female'. Schools are grouped by every 10
    ranks in the order given. The chart is saved as
    'visualizations/faculty_gender.png' and then shown.
    :param student_faculty: numpy array with one array of student_faculty info per school
    :param gender: numpy array with one array of gender info per school
    :return: None
    '''
    assert isinstance(student_faculty, np.ndarray)
    assert isinstance(gender, np.ndarray)
    male_faculty = _count_faculty(student_faculty, gender, 1)  # number of male faculty for each school
    female_faculty = _count_faculty(student_faculty, gender, 2)  # number of female faculty for each school
    group_size = 10
    n_schools = len(male_faculty)
    group_starts = range(0, n_schools, group_size)
    # lump up by 10; the number of groups follows the number of schools
    male_every10 = [sum(male_faculty[s:s + group_size]) for s in group_starts]
    female_every10 = [sum(female_faculty[s:s + group_size]) for s in group_starts]
    # for 100 schools this gives '1-10' ... '91-100'
    labels = ['%d-%d' % (s + 1, min(s + group_size, n_schools)) for s in group_starts]
    ind = np.arange(len(labels))  # the x locations for the groups
    width = 0.35  # the width of the bars
    fig5, ax5 = plt.subplots()  # create the bar chart
    ax5.bar(ind, male_every10, width, color='c', label='Male')
    ax5.bar(ind + width, female_every10, width, color='rebeccapurple', label='Female')
    ax5.set_ylabel('# of faculty')
    ax5.set_xlabel('Rank of school')
    ax5.set_title('Gender of faculty per rank')
    ax5.set_xticks(ind + width / 2)
    ax5.set_xticklabels(labels)
    ax5.legend()
    plt.savefig('visualizations/faculty_gender.png')
    plt.show()


def _count_faculty(student_faculty, gender, gender_code):
    '''
    Counts, for every school, the people flagged as faculty (flag 2) whose
    gender equals `gender_code`.
    :param student_faculty: numpy array with one array of student_faculty info per school
    :param gender: numpy array with one array of gender info per school
    :param gender_code: gender value to count
    :return: list with one int per school
    '''
    counts = []
    for flags, genders in zip(student_faculty, gender):
        # combine both conditions as boolean masks instead of looping per person
        matches = (np.asarray(genders) == gender_code) & (np.asarray(flags) == 2)
        counts.append(int(matches.sum()))
    return counts
def main():
    '''
    Entry point: loads the data once and produces every plot in turn.
    :return: None
    '''
    # make sure no figure from an earlier run is still open
    plt.close('all')
    # one array per column of interest, plus the university/state table
    flags, majors, genders, years, states = get_data()
    # faculty/student ratio vs rank of school
    find_faculty_student_rate(flags)
    # pie chart of the number of universities per state
    universities_in_state(states)
    # second major/minor rate compared by gender
    second_major_gender(majors, genders)
    # undergraduate/graduate ratio vs rank of school
    plot_year(years)
    # number of male and female faculty vs rank of school
    faculty_gender(flags, genders)
# Run the analysis only when this file is executed as a script, so that
# importing the module (e.g. to reuse get_data) does not load the data files
# or open plot windows as a side effect.
if __name__ == '__main__':
    main()