import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned salary table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
salary_df = pd.read_pickle('nmsu_salaries_cleaned.pkl')

# Change data type from object to float, and then to integer (the int cast
# truncates any cents). This has to happen BEFORE sorting: sorting the raw
# object column compares the values as text, which orders them
# lexicographically (e.g. '100000' ahead of '99999') instead of numerically.
salary_df['Salary'] = salary_df['Salary'].astype(float).astype(int)

# Sort data by increasing salary. reset_index() keeps the previous row labels
# in a new 'index' column.
salary_df = salary_df.sort_values('Salary')
salary_df = salary_df.reset_index()
salary_df.head()

# Drop columns I'm not interested in
salary_df = salary_df.drop(columns=['RMP_overall_rating', 'RMP_num_ratings'])

# Change categorical values for simplicity
salary_df['EstimatedRace'] = salary_df['EstimatedRace'].str.replace('nh_white', 'white')
salary_df['EstimatedRace'] = salary_df['EstimatedRace'].str.replace('nh_black', 'black')


def percent_of_positions(df, column_name, value):
    """Return the percentage of rows in ``df`` where ``column_name`` equals ``value``.

    Args:
        df: DataFrame to inspect.
        column_name: Name of the column to compare.
        value: Value a row must hold in that column to be counted.

    Returns:
        The share of matching rows as a percentage, rounded to two decimals.
        An empty ``df`` yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total = len(df)
    if total == 0:
        # No rows at all: report 0% rather than dividing by zero.
        return 0.0
    matches = int((df[column_name] == value).sum())
    return round(matches / total * 100, 2)

def create_data(df, column_name):
    """Build a dictionary of percentage shares for a demographic column.

    Args:
        df: DataFrame to summarise.
        column_name: ``'EstimatedGender'`` or ``'EstimatedRace'``.

    Returns:
        For ``'EstimatedGender'``: the percentages for ``'female'`` and
        ``'male'`` plus ``'unknown'`` (every row that is neither).
        For ``'EstimatedRace'``: the percentages for ``'white'``,
        ``'hispanic'``, ``'asian'`` and ``'black'`` plus ``'nonwhite'``
        (hispanic + asian + black combined).
        All values are rounded to two decimals; an empty ``df`` gives 0.0
        everywhere. Any other ``column_name`` returns ``None``.
    """
    if column_name == 'EstimatedGender':
        categories = ('female', 'male')
    elif column_name == 'EstimatedRace':
        categories = ('white', 'hispanic', 'asian', 'black')
    else:
        # Unsupported column: keep the historical "no result" behaviour.
        return None

    total = len(df)
    counts = {value: int((df[column_name] == value).sum()) for value in categories}

    def share(count):
        # Percentage of all rows, rounded once; guards against an empty frame.
        return round(count / total * 100, 2) if total else 0.0

    data = {value: share(count) for value, count in counts.items()}

    # The aggregate entries are computed from the raw counts, not from the
    # already-rounded parts, so rounding error does not accumulate
    # (e.g. 2 of 12 rows is 16.67, not 100 - 16.67 - 66.67 = 16.66).
    if column_name == 'EstimatedGender':
        data['unknown'] = share(total - counts['female'] - counts['male'])
    else:
        data['nonwhite'] = share(counts['hispanic'] + counts['asian'] + counts['black'])
    return data

# Demographic shares across every position in the data set
perc_gender, perc_race = (
    create_data(salary_df, 'EstimatedGender'),
    create_data(salary_df, 'EstimatedRace'),
)

print('Gender percentages: ', perc_gender)
print('Race percentages: ', perc_race)

Gender percentages:  {'female': 49.54, 'male': 42.49, 'unknown': 7.97}
Race percentages:  {'white': 64.24, 'hispanic': 29.97, 'asian': 3.25, 'black': 2.54, 'nonwhite': 35.76}


# Professor subset: rows whose Title is one of the listed professor ranks
titles = ['Asst Prof', 'Assc Prof', 'Professor', 'Professor, Distinguished']
is_professor = salary_df['Title'].isin(titles)
prof_df = salary_df[is_professor]

prof_perc_gender, prof_perc_race = (
    create_data(prof_df, 'EstimatedGender'),
    create_data(prof_df, 'EstimatedRace'),
)

print('Gender percentages: ', prof_perc_gender)
print('Race percentages: ', prof_perc_race)

Gender percentages:  {'female': 36.76, 'male': 49.02, 'unknown': 14.22}
Race percentages:  {'white': 72.16, 'hispanic': 14.83, 'asian': 10.44, 'black': 2.57, 'nonwhite': 27.84}


# Department head subset: rows whose Title is exactly "Acad Dept Head"
is_dept_head = salary_df['Title'].eq("Acad Dept Head")
dh_df = salary_df[is_dept_head]

dh_perc_gender = create_data(dh_df, 'EstimatedGender')
dh_perc_race = create_data(dh_df, 'EstimatedRace')

print('Gender percentages: ', dh_perc_gender)
print('Race percentages: ', dh_perc_race)

Gender percentages:  {'female': 21.43, 'male': 76.19, 'unknown': 2.38}
Race percentages:  {'white': 88.1, 'hispanic': 7.14, 'asian': 4.76, 'black': 0.0, 'nonwhite': 11.9}


# Dean subset: every row whose Title contains the text 'Dean'
has_dean_title = salary_df['Title'].str.contains('Dean')
dean_df = salary_df[has_dean_title]

dean_perc_gender = create_data(dean_df, 'EstimatedGender')
dean_perc_race = create_data(dean_df, 'EstimatedRace')

print('Gender percentages: ', dean_perc_gender)
print('Race percentages: ', dean_perc_race)

Gender percentages:  {'female': 40.62, 'male': 53.12, 'unknown': 6.26}
Race percentages:  {'white': 84.38, 'hispanic': 12.5, 'asian': 0.0, 'black': 3.12, 'nonwhite': 15.62}


# Figure 1: one stacked-bar panel per position group, side by side
fig, ax = plt.subplots(1, 4, figsize=(10, 8))
plt.subplots_adjust(wspace=0, hspace=0.13)
width = 1.3
x = [2, 4]

# (gender, race) percentage dictionaries for each panel, in plotting order
position_groups = [
    (perc_gender, perc_race),
    (prof_perc_gender, prof_perc_race),
    (dh_perc_gender, dh_perc_race),
    (dean_perc_gender, dean_perc_race),
]
# Bottom segment of each bar (female / nonwhite) and the segment stacked on
# top of it (male / white); each entry is [gender bar, race bar].
mino = [[gender['female'], race['nonwhite']] for gender, race in position_groups]
maj = [[gender['male'], race['white']] for gender, race in position_groups]

titles = ['All Positions', 'Professor', 'Department Head', 'Dean']

# Axes parameters
y_ticks = [0, 20, 40, 60, 80, 100]
for i, panel in enumerate(ax):
    ax1 = panel.bar(x, mino[i], width=width, edgecolor='black', color='paleturquoise')
    ax2 = panel.bar(x, maj[i], width, bottom=mino[i], edgecolor='black', color='cadetblue')
    panel.tick_params(bottom=False)
    panel.set_xticks([1, 2, 4, 5])
    panel.set_xticklabels(['', 'Gender', 'Race', ''])
    panel.set_xlim(1, 5)
    panel.set_ylim(-3, 103)
    panel.set_title(titles[i], size=10)
    panel.set_yticks(y_ticks)
    if i == 0:
        # Only the left-most panel carries tick labels and the axis label
        panel.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
        panel.set_ylabel('Percentage of Positions', size=11)
    else:
        panel.set_yticklabels([])

# Make the figure legends: the two legends reuse the bar handles from the last
# panel (ax2 = top segment, ax1 = bottom segment) and differ only in text and
# anchor position.
key = {'labels': [['Male', 'Female'], ['White', 'Nonwhite']],
       'bbox': [(1.05, 0.55), (1.05, 0.4)],
       'title': ['Gender Key', '  Race Key  ']}
for i, (legend_labels, legend_anchor, legend_title) in enumerate(
        zip(key['labels'], key['bbox'], key['title'])):
    fig.legend(
        [ax2, ax1],
        legend_labels,
        loc='upper right',
        bbox_to_anchor=legend_anchor,
        edgecolor='black',
        title=legend_title,
        title_fontproperties=dict(weight='semibold', size=11),
        handlelength=2,
        handleheight=2.5,
        handletextpad=.5,
        fontsize=9,
    )

# The subtitle for the figure (placed in data coordinates of the current axes)
plt.text(
    -13, -15,
    'Figure 1. The percentage of males vs females and whites vs nonwhites in various NMSU positions.',
    weight='bold',
)

plt.show()


# Split employees into four salary bands; each band includes its lower bound
# and excludes its upper bound.
salary = salary_df['Salary']
low_df = salary_df[salary.lt(50000)]
midlow_df = salary_df[salary.ge(50000) & salary.lt(100000)]
midhigh_df = salary_df[salary.ge(100000) & salary.lt(200000)]
high_df = salary_df[salary.ge(200000)]

# Demographic shares in the top band
perc_gender_high, perc_race_high = (
    create_data(high_df, 'EstimatedGender'),
    create_data(high_df, 'EstimatedRace'),
)
print('Gender percentages, >=200k: ', perc_gender_high)
print('Race percentages, >=200k: ', perc_race_high)

Gender percentages, >=200k:  {'female': 16.67, 'male': 66.67, 'unknown': 16.66}
Race percentages, >=200k:  {'white': 91.67, 'hispanic': 0.0, 'asian': 8.33, 'black': 0.0, 'nonwhite': 8.33}


# Demographic shares in the 100-200k band
perc_gender_midhigh, perc_race_midhigh = (
    create_data(midhigh_df, 'EstimatedGender'),
    create_data(midhigh_df, 'EstimatedRace'),
)
print('Gender percentages, 100-200k: ', perc_gender_midhigh)
print('Race percentages, 100-200k: ', perc_race_midhigh)

Gender percentages, 100-200k:  {'female': 31.54, 'male': 59.62, 'unknown': 8.84}
Race percentages, 100-200k:  {'white': 80.0, 'hispanic': 11.15, 'asian': 6.92, 'black': 1.92, 'nonwhite': 19.99}


# Demographic shares in the 50-100k band
perc_gender_midlow, perc_race_midlow = (
    create_data(midlow_df, 'EstimatedGender'),
    create_data(midlow_df, 'EstimatedRace'),
)
print('Gender percentages, 50-100k: ', perc_gender_midlow)
print('Race percentages, 50-100k: ', perc_race_midlow)

Gender percentages, 50-100k:  {'female': 42.02, 'male': 48.29, 'unknown': 9.69}
Race percentages, 50-100k:  {'white': 74.29, 'hispanic': 17.49, 'asian': 5.57, 'black': 2.65, 'nonwhite': 25.71}


# Demographic shares in the below-50k band
perc_gender_low, perc_race_low = (
    create_data(low_df, 'EstimatedGender'),
    create_data(low_df, 'EstimatedRace'),
)
print('Gender percentages, <50k: ', perc_gender_low)
print('Race percentages, <50k: ', perc_race_low)

Gender percentages, <50k:  {'female': 56.65, 'male': 36.63, 'unknown': 6.72}
Race percentages, <50k:  {'white': 55.8, 'hispanic': 40.35, 'asian': 1.3, 'black': 2.55, 'nonwhite': 44.2}


# Figure 2: one gender pie per pay band, ordered from lowest to highest pay.
# Each pie gets a [female, male] pair; the 'unknown' share is not plotted.
gender_by_band = [perc_gender_low, perc_gender_midlow, perc_gender_midhigh, perc_gender_high]
sizes = [[band['female'], band['male']] for band in gender_by_band]

titles = ['Below 50K', '50-100K', '100-200K', '200K and above']

fig, ax = plt.subplots(1, 4, figsize=(8, 2))

for i, (panel, wedge_sizes) in enumerate(zip(ax, sizes)):
    panel.pie(
        wedge_sizes,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        colors=['gainsboro', 'cadetblue'],
    )
    panel.axis('equal')
    panel.set_title(titles[i], size=10, weight='bold')

# A single legend, attached to the first pie and pushed out to the right
ax[0].legend(labels=['female', 'male'], bbox_to_anchor=(6, 0.6))

# Caption, positioned in the data coordinates of the current axes
plt.text(
    -9, -1.5,
    'Figure 2. The percentage of males and females in four different pay groups.',
    weight='bold',
)

plt.show()


# Figure 3: one race pie per pay band, ordered from lowest to highest pay
label = ['white', 'hispanic', 'asian', 'black']
dicts = [perc_race_low, perc_race_midlow, perc_race_midhigh, perc_race_high]
# Sizes of each piece of the pie, in the same order as `label`.
# (The loop variable is deliberately not called `dict`: shadowing the builtin
# at module level breaks later `dict(...)` calls in the same session.)
sizes = [[band[race] for race in label] for band in dicts]

titles = ['Below 50K', '50-100K', '100-200K', '200K and above']

fig, ax = plt.subplots(1, 4, figsize=(8, 2))

for i in range(4):
    ax[i].pie(sizes[i], shadow=True, startangle=90,
              colors=['paleturquoise', 'lightgray', 'teal', 'black'])
    ax[i].axis('equal')
    ax[i].set_title(titles[i], size=10, weight='bold')

ax[0].legend(labels=label, bbox_to_anchor=(6, 0.8))

# Percent text annotations: each pie is labelled with the white share and the
# largest of the remaining shares, rounded to whole percents. The text is
# derived from the data so it cannot drift from the plotted values; the
# positions are hand-tuned per pie.
locs = [[[-0.8, -0.1], [0.3, 0]], [[-0.6, -0.3], [0.3, 0.1]],
        [[-0.3, -0.4], [0.3, 0.3]], [[-0.3, -0.4], [0.03, 0.7]]]
percents = [[f'{white:.0f}%', f'{max(others):.0f}%'] for white, *others in sizes]
for i in range(4):
    ax[i].text(locs[i][0][0], locs[i][0][1], percents[i][0])
    ax[i].text(locs[i][1][0], locs[i][1][1], percents[i][1])

plt.text(-9, -1.5, 'Figure 3. The percentage of whites, hispanics, asians, and blacks in four different pay groups.', weight='bold')

plt.show()

	index	Salary	Department	Title	EstimatedRace	RMP_overall_rating	RMP_num_ratings	EstimatedGender
0	2285	100000	Athletics	Deputy Dir,Athletics	nh_white	NaN	NaN	male
1	463	100000	SW Technology Development Institute	SWTDI Director	nh_white	NaN	NaN	male
2	2919	100353.02	NMDA Veterinary Diagnostic Svc	Vet Pathologist	nh_white	NaN	NaN	male
3	235	100565.11	Art	Acad Dept Head	nh_white	3.7	3.0	female
4	3349	100632.97	Entomology Plant Path and Weed Sci	Acad Dept Head	nh_white	NaN	NaN	male

# NMSU Employee Insights

# Conclusion