# Data Collection and Cleaning
import requests                                             #https://requests.readthedocs.io/en/master/
import os                                                   #https://docs.python.org/3/library/os.html
import os.path                                              #https://docs.python.org/3/library/os.path.html 
import warnings                                             #https://docs.python.org/3/library/warnings.html    
warnings.filterwarnings('ignore')                           #https://docs.python.org/3/library/warnings.html#warnings.filterwarnings
import pandas as pd                                         #https://pandas.pydata.org/
import numpy as np                                          #https://numpy.org/               
import ast                                                  #https://docs.python.org/3/library/ast.html
from ast import literal_eval                                #https://docs.python.org/3/library/ast.html         
from IPython.display import clear_output, display, HTML     #https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html
from bs4 import BeautifulSoup                               #https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import re                                                   #https://docs.python.org/3/library/re.html

# Data Visualization
import matplotlib.pyplot as plt                             #https://matplotlib.org/
import matplotlib.ticker as ticker                          #https://matplotlib.org/api/ticker_api.html
import seaborn as sns                                       #https://seaborn.pydata.org/

# Data Analysis
from statsmodels.formula.api import *                       #https://www.statsmodels.org/stable/index.html
from sklearn.metrics import accuracy_score                  #https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
from sklearn.neighbors import KNeighborsClassifier          #https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn import linear_model                            #https://scikit-learn.org/stable/
import nltk                                                 #https://www.nltk.org/
from nltk.sentiment import SentimentIntensityAnalyzer       #https://www.nltk.org/api/nltk.sentiment.html


# Record whether the PlanetTerp review and grade CSVs are already on disk.
# Both values are plain booleans computed once, when this line runs.
reviews_exist = os.path.exists("src/1_collect_data/planet_terp_data/PT_review_data.csv")
grades_exist = os.path.exists("src/1_collect_data/planet_terp_data/PT_grade_data.csv")


def scrape_reviews():
    """Scrape PlanetTerp for review data and save it as a CSV.

    Pages through ``/api/v1/professors`` (100 entries per request) until an
    empty page is returned, then flattens the result so that every review
    gets its own row.  Only reviews that belong to an entry of type
    ``"professor"`` (not a TA) and that have a course attached are kept.

    The table is de-duplicated, sorted by name and course, and written to
    ``src/1_collect_data/planet_terp_data/PT_review_data.csv`` -- the path
    the rest of this file reads the review data from.

    Returns:
        pandas.DataFrame: the table that was written to disk.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    pages = []
    offset = 0
    # maximum limit is 100, keep requesting until all of the data has been collected
    while True:
        r = requests.get(
            "https://planetterp.com/api/v1/professors",
            params={"offset": offset, "reviews": "true", "limit": 100},
        )
        # Fail loudly instead of looping forever on a repeated error payload.
        r.raise_for_status()
        page = r.json()  # decode once per request
        if not page:
            break
        pages.append(page)
        offset += 100

    columns = ["name", "slug", "type", "course", "rating", "review", "date"]
    rows = []
    # separate out data, each review gets its own row
    for page in pages:
        for professor in page:
            # only keep professors, not TAs
            if professor.get("type") != "professor":
                continue
            for review in professor.get("reviews") or []:
                # only keep reviews that have a course attached
                if review.get("course") is None:
                    continue
                rows.append({
                    "name": professor.get("name"),
                    "slug": professor.get("slug"),
                    "type": professor.get("type"),
                    "course": review.get("course"),
                    "rating": review.get("rating"),
                    "review": review.get("review"),
                    # keep only the YYYY-MM-DD part of the timestamp
                    "date": (review.get("created") or "")[:10],
                })

    # Building the frame in one go avoids the cell-by-cell enlargement cost.
    df = pd.DataFrame(rows, columns=columns)

    # remove duplicate reviews
    df = df.drop_duplicates().sort_values(by=["name", "course"])
    df.to_csv("src/1_collect_data/planet_terp_data/PT_review_data.csv", encoding='utf-8', index=False)
    return df


def scrape_grade(professors=None):
    """Scrape PlanetTerp for grade data and save it as a CSV.

    Requests ``/api/v1/grades`` once per professor and stores every returned
    entry (one per course/semester/section) as its own row.  The table is
    sorted by professor and course and written to
    ``src/1_collect_data/planet_terp_data/PT_grade_data.csv`` -- the path
    the rest of this file reads the grade data from.

    Args:
        professors: optional iterable of professor names.  When omitted the
            unique names are read from the saved review CSV, so the function
            no longer depends on whatever a module-level ``df`` happens to
            hold at call time.

    Returns:
        pandas.DataFrame: the table that was written to disk.
    """
    if professors is None:
        saved_reviews = pd.read_csv("src/1_collect_data/planet_terp_data/PT_review_data.csv")
        professors = saved_reviews["name"].drop_duplicates()

    columns = [
        "professor", "course", "semester", "section",
        "A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-",
        "D+", "D", "D-", "F", "W", "Other",
    ]
    rows = []
    # get grade data for each professor
    for prof in professors:
        # Same query parameters as before; the offset is always 0.
        # NOTE(review): confirm whether this endpoint paginates at all.
        r = requests.get(
            "https://planetterp.com/api/v1/grades",
            params={"offset": 0, "reviews": "true", "limit": 100, "professor": prof},
        )
        payload = r.json()
        # Anything that is not a list of entries (e.g. an error object)
        # carries no grade rows, so skip it instead of crashing on it.
        if not isinstance(payload, list):
            continue
        # each semester of each course gets its own row in the dataframe
        for entry in payload:
            rows.append({column: entry.get(column) for column in columns})

    grade_df = pd.DataFrame(rows, columns=columns)
    grade_df = grade_df.sort_values(by=["professor", "course"])
    grade_df.to_csv("src/1_collect_data/planet_terp_data/PT_grade_data.csv", encoding="utf-8", index=False)
    return grade_df


def get_salary_data():
    """Download all of the data from the UMD Diamondback salary API.

    For every year the API lists, the salary entries are fetched page by
    page, the ``Division`` and ``Title`` columns are dropped, ``Department``
    is cut to its first four characters and a ``year`` column is added.
    Each year is saved to ``src/1_collect_data/salary_data/{year}data.csv``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    # get years that api is valid for
    r_years = requests.get("https://api.dbknews.com/salary/years")
    r_years.raise_for_status()
    years = r_years.json()["data"]

    # for each year, get salary data
    for year in years:
        r = requests.get(f"https://api.dbknews.com/salary/year/{year}")
        r.raise_for_status()

        # get number of entries for each year
        count = r.json()["count"]
        page_frames = []
        page = 0

        # each page only has 10 entries, so keep requesting until all entries are collected
        while page * 10 < count:
            page += 1

            # get salary data for 1 page
            r = requests.get(f"https://api.dbknews.com/salary/year/{year}?page={page}")
            r.raise_for_status()
            page_df = pd.DataFrame.from_dict(r.json()["data"])
            if page_df.empty:
                # An empty page has no columns to drop; stop rather than crash.
                break

            # remove division and title columns, modify department col, and add year
            page_df = page_df.drop(['Division', "Title"], axis=1)
            page_df["Department"] = page_df["Department"].str.slice(stop=4)
            page_df["year"] = f"{year}"
            page_frames.append(page_df)

        # Concatenate once per year instead of re-copying the frame per page.
        year_df = pd.concat(page_frames, axis=0) if page_frames else pd.DataFrame()

        # save entire year's data into separate csv
        print(f"year {year} finished")
        year_df.to_csv(f'src/1_collect_data/salary_data/{year}data.csv', index=False)


def combine():
    """Combine the yearly Diamondback salary CSVs into one CSV.

    Reads ``src/1_collect_data/salary_data/{year}data.csv`` for every year
    the salary API lists, groups the rows by ``Employee`` (aggregating each
    group with ``aggregate``), adds an upper-cased ``name`` column and writes
    ``src/1_collect_data/salary_data/DB_combined_data.csv``.

    NOTE(review): a later ``def combine(group)`` in this file rebinds the
    same name, so this version is only callable before that definition runs.
    """
    years = requests.get("https://api.dbknews.com/salary/years").json()["data"]

    def read_year(year):
        # one csv per year; newlines inside employee names become spaces
        frame = pd.read_csv(f'src/1_collect_data/salary_data/{year}data.csv')
        frame['Employee'] = frame['Employee'].str.replace('\n', ' ')
        return frame

    def first_last_upper(employee):
        # first word after the ", " + last word before it, upper-cased
        parts = employee.split(', ')
        given = parts[1].split(" ")[0]
        family = parts[0].split(" ")[-1]
        return (given + ' ' + family).upper()

    # get data for all of the years into one dataframe
    df = pd.concat([pd.DataFrame()] + [read_year(year) for year in years], axis=0)

    # group by professor and aggregate the rest of the data into one row
    df_grouped = df.groupby(['Employee']).apply(aggregate).reset_index()
    df_grouped['name'] = df_grouped['Employee'].apply(first_last_upper)
    df_grouped.to_csv('src/1_collect_data/salary_data/DB_combined_data.csv', index=False)

def aggregate(group):
    """Collapse one employee's salary rows into three parallel lists.

    ``group`` needs the columns ``year``, ``Salary`` and ``Department``.
    Salaries are parsed by removing commas and the leading character, then
    converting to ``float``.  Consecutive rows that share a year are merged
    into the first of them: the salary is added unless it equals the amount
    accumulated so far, and the first row's department is kept.

    Returns a ``pd.Series`` with ``years_taught``, ``salaries`` and
    ``departments``.
    """
    amounts = [float(raw.replace(",", "")[1:]) for raw in group['Salary'].tolist()]

    kept_years = []
    kept_salaries = []
    kept_departments = []
    rows = zip(group['year'].tolist(), amounts, group['Department'].tolist())
    for year, amount, department in rows:
        if kept_years and kept_years[-1] == year:
            # Another source of income for the same year: add it on, unless
            # it is identical to what has been accumulated so far.
            if kept_salaries[-1] != amount:
                kept_salaries[-1] = kept_salaries[-1] + amount
            continue
        kept_years.append(year)
        kept_salaries.append(amount)
        kept_departments.append(department)

    return pd.Series({
        'years_taught': kept_years,
        'salaries': kept_salaries,
        'departments': kept_departments,
    })


# open all of the downloaded data
# (the three CSVs must already exist on disk; nothing here creates them)
reviews_df = pd.read_csv("src/1_collect_data/planet_terp_data/PT_review_data.csv")
grades_df = pd.read_csv("src/1_collect_data/planet_terp_data/PT_grade_data.csv")
salaries_df = pd.read_csv("src/1_collect_data/salary_data/DB_combined_data.csv")


# The bare .head() expressions below only display a preview in an interactive
# (notebook) session; when run as a script their result is discarded.
reviews_df.head()


grades_df.head()


salaries_df.head()


"""Getting the month from date formatted as YYYY-MM-DD"""

# Derive "year" and "season" for every review from its YYYY-MM-DD date,
# column-wise instead of one row at a time.
#   month > 8      -> fall
#   month == 01    -> winter
#   month 06 to 08 -> summer
#   anything else  -> spring
review_dates = reviews_df["date"].astype(str)
review_months = review_dates.str[5:7].astype(int)
reviews_df["year"] = review_dates.str[:4]  # kept as a string, as before
reviews_df["season"] = np.select(
    [review_months > 8, review_months == 1, review_months.between(6, 8)],
    ["fall", "winter", "summer"],
    default="spring",
)

reviews_df.head()


# sort by the following so that reviews of the same semester sit together
reviews_df = reviews_df.sort_values(by = ["name", "course", "year", "season"], ignore_index = True)

# One output row per (name, course, year, season): the mean rating and the
# number of reviews.  groupby replaces the hand-rolled "compare with the
# previous row" loop, which failed on an empty frame (it read row 0
# unconditionally) and grew the result one cell at a time.
values_to_check = ["name", "course", "year", "season"]
ratings_by_semester = reviews_df.groupby(values_to_check, sort=True)["rating"]
average_reviews_df = ratings_by_semester.agg(
    average_rating="mean",
    num_reviews="size",
).reset_index()
# The loop stored the count in a float column; keep that dtype.
average_reviews_df["num_reviews"] = average_reviews_df["num_reviews"].astype(float)

average_reviews_df.head()


# Derive "year" and "season" for every grade row from its semester code,
# column-wise instead of one row at a time.  The first four characters are
# the year; a remainder of "08" means fall and anything else is treated as
# spring.
# NOTE(review): assumes the code reads as YYYYMM once converted to str --
# confirm against the CSV.
semester_codes = grades_df["semester"].astype(str)
grades_df["year"] = semester_codes.str[:4]
grades_df["season"] = np.where(semester_codes.str[4:] == "08", "fall", "spring")


# sort by the following so that grades of the same semester sit together
grades_df = grades_df.sort_values(by = ["professor", "course", "year", "season"], ignore_index = True)

# GPA values for each letter grade
gpa = {"A+":4.0, "A":4.0, "A-":3.7,"B+":3.3, "B":3.0, "B-":2.7, "C+":2.3, "C":2.0, "C-":1.7, "D+":1.3, "D":1.0, "D-":0.7, "F":0.0}

# One output row per (professor, course, year, season): the student-weighted
# average GPA and the number of graded students.  groupby replaces the
# hand-rolled "compare with the previous row" loop, which failed on an empty
# frame (it read row 0 unconditionally) and grew the result one cell at a time.
values_to_check = ["professor", "course", "year", "season"]
grade_columns = list(gpa)

section_totals = grades_df[values_to_check].copy()
# grade points of a section = sum over letters of (GPA value * student count)
section_totals["grade_points"] = grades_df[grade_columns].mul(pd.Series(gpa), axis="columns").sum(axis=1)
section_totals["num_students"] = grades_df[grade_columns].sum(axis=1)

semester_totals = section_totals.groupby(values_to_check, as_index=False, sort=True)[["grade_points", "num_students"]].sum()

# it is possible for the number of students taking a class to be 0, this would occur when all of the students
# in a class have grades corresponding to W or Other, neither of which have a GPA value
semester_totals = semester_totals[semester_totals["num_students"] != 0].reset_index(drop=True)

average_grades_df = semester_totals[values_to_check].copy()
average_grades_df["average_gpa"] = semester_totals["grade_points"] / semester_totals["num_students"]
average_grades_df["num_students"] = semester_totals["num_students"].astype(float)


average_grades_df.head()


# Give the grade table the same key column name as the review table.
average_grades_df = average_grades_df.rename(columns={"professor": "name"})

# Outer join: keep semesters that appear in only one of the two tables.
semester_keys = ["name", "course", "year", "season"]
both_averages_df = average_reviews_df.merge(
    average_grades_df,
    how="outer",
    on=semester_keys,
)

both_averages_df.head()


def combine(group):
    """Pack one professor's per-semester rows into a Series of lists.

    ``group`` needs the columns ``course``, ``season``, ``year``,
    ``average_rating``, ``num_reviews``, ``average_gpa`` and ``num_students``.
    Semesters are rendered as ``"<season> <year>"``, GPAs are rounded to two
    decimals, and student counts are converted to ``int`` after
    ``np.nan_to_num`` has been applied to them.

    NOTE(review): an earlier zero-argument ``combine()`` exists in this file;
    this definition rebinds the name.
    """
    semester_labels = []
    for season, year in zip(group['season'], group['year']):
        semester_labels.append(season + " " + year)

    rounded_gpas = list(map(lambda value: round(value, 2), group['average_gpa']))
    student_counts = list(map(int, np.nan_to_num(group['num_students'])))

    packed = {
        'course': group['course'].tolist(),
        'semester': semester_labels,
        'average_rating': group['average_rating'].tolist(),
        'num_reviews': group['num_reviews'].tolist(),
        'average_gpa': rounded_gpas,
        'num_students': student_counts,
    }
    return pd.Series(packed)
# One row per professor, with every per-semester value packed into lists.
df_grouped = both_averages_df.groupby(['name']).apply(combine).reset_index()
# Normalise the name to "FIRST LAST": first and last whitespace-separated word, upper-cased.
df_grouped["name"] = df_grouped["name"].apply(lambda x: x.split()[0].upper() + " " + x.split()[-1].upper())

# Only write the combined file when it is not there yet.  The guard used to
# test for PT_grade_data.csv -- a different file from the one written here --
# so the combined CSV was skipped whenever the grade data existed.
if not os.path.exists("src/1_collect_data/planet_terp_data/PT_combined_data.csv"):
    df_grouped.to_csv('src/1_collect_data/planet_terp_data/PT_combined_data.csv', index=False)

df_grouped.head()


# Reload the combined salary data; note that this rebinds the generic name `df`.
df = pd.read_csv("src/1_collect_data/salary_data/DB_combined_data.csv")

# Will receive one row per (merged) person.
combine_df = pd.DataFrame()

# Working state for the merge loop further down:
# curr  -> [name, years_taught, salaries, departments] being accumulated
# names -> the "Employee" strings folded into curr so far
# count -> next row position to write in combine_df
curr = ["", "", "", ""]
names = []
count = 0

def intersection(l1, l2):
    """Return the items of ``l1`` that also occur in ``l2``.

    Both arguments are *strings* holding a printed list (for example
    ``"[2013, 2014]"``): the first and last character are stripped and the
    rest is split on ``", "``.  Items are compared as text, the order of
    ``l1`` is kept and repeats are not removed.
    """
    left_items = l1[1:-1].split(", ")
    right_items = l2[1:-1].split(", ")
    return [item for item in left_items if item in right_items]


# Fold consecutive salary rows that appear to be the same person into one
# entry.  A row joins the entry being accumulated when all three hold:
#   1. the "name" is identical,
#   2. the two share no year in "years_taught",
#   3. the two share at least one entry in "departments".
# The list-like columns are strings, so merging is done by splicing the text
# ("[a, b]" + "[c]" -> "[a, b, c]").
merged_rows = []
current = None  # entry being accumulated; None until the first row is seen
for _, row in df.iterrows():
    is_same_person = (
        current is not None
        and current["name"] == row["name"]
        and intersection(current["years_taught"], row["years_taught"]) == []
        and intersection(current["departments"], row["departments"]) != []
    )
    if is_same_person:
        # combining professors that are detected to be the same
        for column in ("years_taught", "salaries", "departments"):
            current[column] = current[column][:-1] + ", " + row[column][1:]
        current["real_name(s)"].append(row["Employee"])
        continue

    # once we detect someone isn't a match, store the accumulated information
    if current is not None:
        merged_rows.append(current)
    current = {
        "name": row["name"],
        "years_taught": row["years_taught"],
        "salaries": row["salaries"],
        "departments": row["departments"],
        "real_name(s)": [row["Employee"]],
    }

# Store whatever is still being accumulated when the data runs out.  Before,
# this only happened when the final row was a match, so a final non-matching
# person was silently dropped.
if current is not None:
    merged_rows.append(current)

for entry in merged_rows:
    entry["real_name(s)"] = " | ".join(entry["real_name(s)"])

# Built in one go; the placeholder first row (and the drop that removed it)
# is no longer needed.  The index therefore starts at 0 rather than 1.
combine_df = pd.DataFrame(
    merged_rows,
    columns=["name", "years_taught", "salaries", "departments", "real_name(s)"],
)

combine_df.head()


# Spot check: show the first merged row whose name is "DAVID TODD".
todd_labels = combine_df.index[combine_df['name'] == "DAVID TODD"].tolist()
david_todd = combine_df.loc[[todd_labels[0]]]
david_todd


# Write the cleaned salary data only if the file is not there yet.
if not os.path.exists("src/2_clean_data/clean_salary_data.csv"):
    combine_df.to_csv('src/2_clean_data/clean_salary_data.csv', index=False)


gpa_review_df = pd.read_csv('src/1_collect_data/planet_terp_data/PT_combined_data.csv')
salary_df = pd.read_csv("src/2_clean_data/clean_salary_data.csv")

# Outer join on name; indicator=True adds a "_merge" column telling which
# side(s) each row came from.
outer = gpa_review_df.merge(salary_df, how="outer", on="name", indicator=True)

merge_side = outer['_merge']
left_only = outer[merge_side == 'left_only']
print(f"LEFT ONLY: {len(left_only)}")
both = outer[merge_side == 'both']
print(f"BOTH: {len(both)}")

# Output:
# LEFT ONLY: 655
# BOTH: 2490


# Count rows of `both` whose name repeats the name of the row directly above.
curr_prof = ""
dups = []
for name in both["name"]:
    if name != curr_prof:
        curr_prof = name
        continue
    dups.append(name)

print(len(dups))

# Output: 54


def remove_dups(i1, i2, both):
    """Fold the salary fields of the rows in ``i2`` into row ``i1``.

    ``years_taught``, ``departments`` and ``salaries`` hold printed lists as
    text and are spliced together (``"[a]"`` + ``"[b]"`` -> ``"[a, b]"``);
    ``real_name(s)`` values are joined with ``" | "``.  ``both`` is modified
    in place and also returned.  No row is deleted here.
    """
    spliced_columns = ("years_taught", "departments", "salaries")
    for other in i2:
        for column in spliced_columns:
            head = both.at[i1, column][:-1]
            tail = both.at[other, column][1:]
            both.at[i1, column] = head + ", " + tail
        both.at[i1, "real_name(s)"] = both.at[i1, "real_name(s)"] + " | " + both.at[other, "real_name(s)"]
    return both


# The course column was saved as the text of a list; turn it back into a
# list and give every course its own row.
gpa_review_df['course'] = gpa_review_df['course'].apply(literal_eval)
df = gpa_review_df.explode('course')

# The first four characters of a course are its department/major code.
uniqueMajors = df['course'].str[:4].unique()
print(uniqueMajors)

# Output:
# ['AMST' 'PSYC' 'ASTR' 'CMSC' 'ENGL' 'ECON' 'HESI' 'EDHI' 'MATH' 'COMM'
#  'INST' 'ENAE' 'ENME' 'ENMA' 'ENRE' 'ARTH' 'PHYS' 'STAT' 'PHSC' 'ANTH'
#  'BIOE' 'GEMS' 'BMGT' 'BULM' 'CMLT' 'GERM' 'JWST' 'ISRL' 'BUSO' 'WMST'
#  'LGBT' 'WGSS' 'DANC' 'ARHU' 'TDPS' 'UNIV' 'ENEE' 'PERS' 'ARAB' 'ENCE'
#  'HIST' 'PHIL' 'AOSC' 'HACS' 'CCJS' 'GEOL' 'FIRE' 'SOCY' 'CPSD' 'CPSP'
#  'PLCY' 'AASP' 'HNUH' 'BUFN' 'BUSI' 'ENTS' 'MSML' 'HONR' 'ITAL' 'ARCH'
#  'URSP' 'HDCC' 'PUAF' 'MUSC' 'JOUR' 'LING' 'BSCI' 'BIOL' 'EDCP' 'EDHD'
#  'EDUC' 'TLTC' 'EDMS' 'CHEM' 'CPMS' 'ENES' 'HESP' 'HEBR' 'GEOG' 'GVPT'
#  'LARC' 'IDEA' 'FMSC' 'NEUR' 'HLTH' 'THET' 'MIEH' 'BUMK' 'ENST' 'ANSC'
#  'INAG' 'CHBE' 'ENCH' 'SPAN' 'PORT' 'USLT' 'SLLC' 'ENTM' 'BUSM' 'FREN'
#  'KNES' 'MLSC' 'CHIN' 'EALL' 'AAST' 'BUMO' 'EDCI' 'TLPL' 'ENPM' 'AREC'
#  'BISI' 'CBMG' 'ENSP' 'PHPE' 'ENNU' 'HLSA' 'BSOS' 'SPHL' 'BUDT' 'NAVY'
#  'CHSE' 'HHUM' 'CPSF' 'LBSC' 'BCHM' 'EPIB' 'ARTT' 'BUAC' 'HLSC' 'CLFS'
#  'MUED' 'SMLP' 'HLMN' 'MLAW' 'SURV' 'EDSP' 'AGNR' 'PLSC' 'EDPS' 'LACS'
#  'ENFP' 'INFM' 'UMEI' 'BEES' 'AMSC' 'EMBA' 'CLAS' 'LATN' 'IMMR' 'BIOM'
#  'CHPH' 'CPBE' 'NACS' 'IMDM' 'CINE' 'FILM' 'CPSN' 'NFSC' 'ENSE' 'CPSS'
#  'AGST' 'DATA' 'FGSM' 'RUSS' 'ARMY' 'CPGH' 'RELS' 'BSST' 'CPJT' 'JAPN'
#  'MUET' 'GREK' 'LASC' 'BSGC' 'HISP' 'CPSA' 'RDEV' 'BSCV' 'HGLO' 'HEIP'
#  'BIPH' 'PEER' 'ENEB' 'BMSO' 'CPSG' 'HBUS' 'CONS' 'CPPL' 'ENBC' 'CPET'
#  'SLAA' 'VMSC' 'MEES' 'KORA']


#Get full names for each major

def _split_code_and_name(entry_text):
    """Split a catalog entry written as 'CODE - Full Name' into (code, name).

    Zero-width spaces are removed first.  Only the first ' - ' separates the
    code from the name, so a name that itself contains ' - ' is kept whole.
    Returns None for an entry without a separator instead of raising.
    """
    code, separator, full_name = entry_text.replace('\u200b', '').partition(' - ')
    if not separator:
        return None
    return code, full_name.strip()


code_fullname_dict = {}

undergrad_majors = requests.get("https://academiccatalog.umd.edu/undergraduate/approved-courses/")
undergrad_majors_soup = BeautifulSoup(undergrad_majors.text, 'html.parser')

for major in undergrad_majors_soup.find(id="/undergraduate/approved-courses/").find_all("li"):
    majorInfo = _split_code_and_name(major.text)
    if majorInfo is not None:
        code_fullname_dict[majorInfo[0]] = majorInfo[1]

grad_majors = requests.get("https://academiccatalog.umd.edu/graduate/courses/")
grad_majors_soup = BeautifulSoup(grad_majors.text, 'html.parser')

for major in grad_majors_soup.find(id="/graduate/courses/").find_all("li"):
    majorInfo = _split_code_and_name(major.text)
    # the undergraduate name wins when a code appears in both catalogs
    if majorInfo is not None and majorInfo[0] not in code_fullname_dict:
        code_fullname_dict[majorInfo[0]] = majorInfo[1]


#Manually add full names for codes that don't have full names yet

# First list every code that still has no full name ...
for code in uniqueMajors:
    if code in code_fullname_dict:
        continue
    print(code)

# ... then fill in the missing ones by hand.
code_fullname_dict.update({
    'BUSO': 'Online MBA Program',
    'BMSO': 'Online MS in Business Analytics',
    'HBUS': "Interdisciplinary Business Honors",
})

# Output:
# BUSO
# BMSO
# HBUS


#Get majors under each undergraduate school

majors_schools = {}

schools = requests.get("https://academiccatalog.umd.edu/undergraduate/colleges-schools/")
schools_soup =  BeautifulSoup(schools.text, 'html.parser')

links_to_schools = {}

#Get links to each school
# (distinct loop names, so the response object `schools` is not overwritten)
for school_list in schools_soup.find_all(id="/undergraduate/colleges-schools/"):
    for list_item in school_list.find_all("li"):
        anchor = list_item.find("a")
        if anchor is None:
            # a list item without a link cannot point to a school page
            continue
        href = anchor.get("href")
        # only hrefs containing exactly four "/" are kept, as before
        if href is not None and href.count("/") == 4:
            links_to_schools[anchor.text] = href

# Get majors under each school
for school, school_link in links_to_schools.items():
    majors = requests.get("https://academiccatalog.umd.edu" + school_link + "#degreeprogramstext")
    majors_soup = BeautifulSoup(majors.text, 'html.parser')

    degree_sections = majors_soup.find_all(id='degreeprogramstextcontainer')
    if len(degree_sections) == 0: #Journalism school and office of undergraduate studies
        for section in majors_soup.find_all(id='programstextcontainer'):
            # NOTE(review): this walks the direct children of the first <ul>,
            # which may include whitespace text nodes -- confirm whether
            # those keys are wanted.
            for program in section.find_all("ul")[0]:
                majors_schools[program.text] = school
    else:
        for section in degree_sections:
            for program in section.find_all("li"):
                majors_schools[program.text] = school


#get colleges for some grad school majors

final_code_to_school = {}

gradMajorsSchools = requests.get("https://gradschool.umd.edu/admissions/programs-a-to-z")
gradMajorsSchools_soup = BeautifulSoup(gradMajorsSchools.text, 'html.parser')

# Second table on the page; rows that contain a <strong> tag are skipped.
program_rows = gradMajorsSchools_soup.find_all("table")[1].find_all("tr")
for row in program_rows:
    if row.find_all("strong"):
        continue
    cells = row.find_all("td")
    # second cell -> key, third cell -> value
    final_code_to_school[cells[1].text] = cells[2].text


#Rename long college names to short version

def rename_colleges(dictionary):
    """Replace each value of ``dictionary`` with a short school code, in place.

    Every value is lower-cased and tested against the keyword rules below in
    order; the first rule with a keyword contained in the value wins.  Values
    that match no rule are left unchanged.  Returns None.
    """
    rules = (
        (('business',), 'BMGT'),  # Robert H. Smith School of Business
        (('engineering',), 'ENGR'),  # A. James Clark School of Engineering
        (('information studies',), 'INFO'),  # College of Information Studies
        (('computer',), 'CMNS'),  # College of Computer, Mathematical, and Natural Sciences
        (('arts',), 'ARHU'),  # College of Arts and Humanities
        (('behavioral',), 'BSOS'),  # College of Behavioral and Social Sciences
        (('education',), 'EDUC'),  # College of Education
        (('architecture',), 'ARCH'),  # School of Architecture, Planning, and Preservation
        (('health',), 'SPHL'),  # School of Public Health
        (('agric',), 'AGNR'),  # College of Agriculture and Natural Resources
        (('journalism',), 'JOUR'),  # College of Journalism
        (('undergraduate studies', 'inter'), 'OUGS'),  # office of undergraduate studies
        (('policy',), 'SPP'),  # school of public policy
        (('shady grove',), 'UGS'),  # universities at shady grove
    )
    for code in dictionary.keys():
        lowered = dictionary[code].lower()
        for keywords, short_code in rules:
            if any(keyword in lowered for keyword in keywords):
                dictionary[code] = short_code
                break


# Shorten the school names in both lookup tables; rename_colleges rewrites
# the dictionary values in place.
rename_colleges(final_code_to_school)
rename_colleges(majors_schools)


def find_matching_string(string, string_list):
    """Return the elements of ``string_list`` that contain ``string``.

    The containment test ignores case; the original order is kept.
    """
    return [
        candidate
        for candidate in string_list
        if string.lower() in candidate.lower()
    ]


# Keyword fallbacks, tried in order; the first rule with a keyword contained
# in the lower-cased full name decides the school.
keyword_rules = (
    (('biolog', 'systematics', 'biom', 'machine'), 'CMNS'),
    (('engineer',), 'ENGR'),
    (('ed', 'teach'), 'EDUC'),
    (('college park scholars', 'gemstone', 'honors', 'first-year', 'aces', 'civicus', 'design cultures', 'university', 'global', 'nav', 'maryland'), 'OUGS'),
    (('business', 'mba', 'accounting', 'entrepreneur', 'manage', 'decision'), 'BMGT'),
    (('theatre', 'film', 'lang', 'latin', 'germanic', 'immigration', 'art', 'gay'), 'ARHU'),
    (('health', 'epid'), 'SPHL'),
    (('behavior', 'law'), 'BSOS'),
    (('information',), 'INFO'),
    (('urban',), 'ARCH'),  # architecture
)

for i in uniqueMajors:
    if i in final_code_to_school:
        continue
    full_name = code_fullname_dict[i]
    # first choice: a program whose name contains the major's full name
    matches = find_matching_string(full_name, majors_schools.keys())
    if len(matches) > 0:
        final_code_to_school[i] = majors_schools[matches[0]]
        continue
    # otherwise match keywords of the major name to a school
    lowered_name = full_name.lower()
    for keywords, school_code in keyword_rules:
        if any(keyword in lowered_name for keyword in keywords):
            final_code_to_school[i] = school_code
            break


# list every code that still has no school
for i in uniqueMajors:
    if i not in final_code_to_school:
        print(i)


# Turn the code -> school mapping into a two-column table.
code_school_frame = pd.DataFrame.from_dict(final_code_to_school, orient='index')
code_school_frame = code_school_frame.reset_index(names='Major')
df_code_dept = code_school_frame.rename(columns={0: 'Department'})


df_code_dept.head()


# alias; find_similar below reads the table under this name
majors_df = df_code_dept

def find_similar(first, last, courses, outer):
    """Give potential matches for a professor.

    Args:
        first, last: the professor's first and last name.
        courses: the printed text of a list of course names (for example
            "['CMSC131', 'CMSC132']"); the first four characters of each
            course are looked up in the module-level ``majors_df``.
        outer: frame with the columns ``_merge``, ``first_name``,
            ``last_name``, ``real_name(s)``, ``years_taught``,
            ``departments`` and ``name``.

    Returns:
        A frame (fresh 0..n-1 index) with ``real_name(s)``, ``years_taught``,
        ``departments`` and ``name`` for rows that are not ``left_only``,
        whose first or last name is one of the searched names, whose
        departments contain one of the professor's schools, and that do not
        belong to one of the "bad departments".  Sorted by ``real_name(s)``.
    """
    c = []
    # find the department that the major is listed under
    course_names = courses.replace("'", "")[1:-1].split(", ")
    for course in course_names:
        hits = majors_df.index[majors_df['Major'] == course[:4]].to_list()
        if hits == []:
            continue
        temp = majors_df.at[hits[0], "Department"]
        if len(temp) == 4:
            c.append(temp)
        elif temp == 'Public Policy':
            c.append('PLCY')
    print(c)

    # separates out hyphenated last names
    names = [first, last]
    if "-" in last:
        hyphen_parts = last.split("-")
        names.append(hyphen_parts[0])
        names.append(hyphen_parts[1])

    # looks for professors that match our criteria
    has_salary_row = outer["_merge"] != "left_only"
    name_matches = outer["first_name"].isin(names) | outer["last_name"].isin(names)
    df = outer.loc[has_salary_row & name_matches][["real_name(s)", "years_taught", "departments", "name"]]
    df = df[df['departments'].str.contains('|'.join(c))]
    df = df.sort_values('real_name(s)', ascending=True)

    bad_dep = ["VPSA", "VPAF", "SVPA", "PRES", "VPAA", "LIBR", "VPA-", "DIT-",
               "VPR-", "IT-N", "VPUR", "IT-I"]
    # drops potential matches belonging to "bad departments", essentially those who don't teach
    to_drop = [
        label
        for label, row in df.iterrows()
        if any(dep in row["departments"] for dep in bad_dep)
    ]
    for label in to_drop:
        df.drop(label, inplace = True)
    return df.reset_index().drop("index", axis = 1)


# Starting over: uncomment the next two lines (and comment out the two
# read_csv lines below) to begin from the full list of unmatched professors.
#left_copy = left_only.copy() # uncomment this when you want to start over
#left_rename = pd.DataFrame(columns = left_copy.columns)  # uncomment this when you want to start over

# Continuing: these two lines are active and reload the saved progress.
left_copy = pd.read_csv(f'src/2_clean_data/left_remaining.csv') # keep this active to continue where you left off
left_rename = pd.read_csv(f'src/2_clean_data/renamed_lefts.csv') # keep this active to continue where you left off


responses = ["n", "nah","no", "nono", "no way", "never", ""]


def _chosen_row(response, candidates):
    """Return the row label typed by the user, or None if it is not valid.

    A valid answer is an integer that is one of the labels in
    ``candidates.index``.
    """
    try:
        choice = int(response)
    except ValueError:
        return None
    return choice if choice in candidates.index else None


# go 10 at a time, saving progress each time
for i, row in left_copy.head(10).iterrows():
    first = row["name"].split()[0]
    last =  row["name"].split()[1]
    # the courses taught and semesters they taught in are displayed to help decide
    # whether potential matches are actually the same person
    print(row["course"])
    print(row["semester"])
    similar = find_similar(first,last,row["course"], outer)
    if not similar.empty:
        display(HTML(similar.to_html()))
        response = input(f"{first} {last}: ")
        # A typo used to crash the loop at int(response); ask again until the
        # answer is either a "no" or the number of a displayed row.
        while response not in responses and _chosen_row(response, similar) is None:
            response = input(f"{first} {last} (row number, or blank for no match): ")
        # if the user confirms a match, the match is recorded and renamed so that it can then be joined later
        if response not in responses:
            left_copy.loc[i, 'name'] = similar.loc[_chosen_row(response, similar), 'name']
            # DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
            left_rename = pd.concat([left_rename, left_copy.loc[[i]]], ignore_index=True)
    # matched, rejected, or without any candidate: the professor is handled either way
    left_copy.drop(i, inplace = True)

    clear_output()

# NOTE(review): this only writes when neither file exists yet, so progress
# loaded from these same files is never written back -- confirm that is intended.
if not os.path.exists("src/2_clean_data/renamed_lefts.csv") and not os.path.exists("src/2_clean_data/left_remaining.csv"):
    left_rename.to_csv('src/2_clean_data/renamed_lefts.csv', index = False)
    left_copy.to_csv('src/2_clean_data/left_remaining.csv', index=False)


# Columns that were saved as the text of a list are parsed back into lists.
salary_list_converters = {
    'years_taught': ast.literal_eval,
    'salaries': ast.literal_eval,
    'departments': ast.literal_eval,
}
combined_list_converters = {
    'years_taught': ast.literal_eval,
    'semester': ast.literal_eval,
    'course': ast.literal_eval,
    'salaries': ast.literal_eval,
    'departments': ast.literal_eval,
}

combined_data = pd.read_csv('src/2_clean_data/combined_data_remaining.csv', converters=combined_list_converters)
#combined_data = pd.read_csv(f'combined_data.csv')
salary_data = pd.read_csv('src/2_clean_data/clean_salary_data.csv', converters=salary_list_converters)
new_combine = pd.read_csv('src/2_clean_data/new_combined_data.csv', converters=salary_list_converters)


# check if there is missing salary data and look for a replacement
for i, row in combined_data.head(100).iterrows():
    semesters = row['semester']
    missing = []
    for sem in semesters:
        year = int(sem[-5:])
        if "fall" in sem: 
            year += 1
        # missing year detected
        if year >= 2013 and year <= 2022 and (not year in row["years_taught"]) :
            if not year in missing:
                missing.append(year)
    if len(missing) >  0:
        first = row["name"].split(" ")[0]
        last = row["name"].split(" ")[1]
        count +=1

        # find people who match the criteria
        similar = find_similar(first,last,row["course"], salary_data, year)
        if len(similar) >= 1:
            years = row["years_taught"].copy()
            years.sort()
            missing.sort()
            print(f"years: {years}")
            print(f"missing years: {missing}")
            display(HTML(similar.to_html()))
            response = input(f"{first} {last}: ")
            if (not response in responses):
                # if a mathc is found, append the data to the existing data
                new_years = similar.loc[int(response), 'years_taught']
                combined_data.loc[i, 'years_taught'].extend(new_years)

                new_salaries = similar.loc[int(response), 'salaries']
                combined_data.loc[i, 'salaries'].extend(new_salaries)

                new_departments = similar.loc[int(response), 'departments']
                combined_data.loc[i, 'departments'].extend(new_departments)

                new_combine = new_combine.append(combined_data.loc[i], ignore_index=True)
            clear_output()
    combined_data.drop(i, inplace = True)

if os.path.exists("src/2_clean_data/combined_data_remaining.csv") == False and os.path.exists("src/2_clean_data/new_combined_data.csv") == False:
    combined_data.to_csv('combined_data_remaining.csv', index=False)
    new_combine.to_csv('new_combined_data.csv', index = False)


# Build the final dataset: rows of combined_data.csv whose name also appears in
# new_combined_data.csv are replaced by the new_combined_data.csv rows.
df = pd.read_csv('src/2_clean_data/combined_data.csv')
changed_names = pd.read_csv('src/2_clean_data/new_combined_data.csv')

# Two hard-coded names plus every name present in changed_names are removed in a single pass.
# Missing names are excluded so that NaN never matches NaN.
names_to_remove = ["P KSHETRY", "R APTER"] + changed_names['name'].dropna().tolist()
df = df[~df['name'].isin(names_to_remove)]

result = pd.concat([df, changed_names], axis=0).reset_index(drop=True)

# only written when the output file does not exist yet
if not os.path.exists("src/2_clean_data/final_combine_data.csv"):
    result.to_csv('src/2_clean_data/final_combine_data.csv', index = False)


#Columns with lists are represented as strings, so this is converting them into list/np.array types

df = pd.read_csv("src/2_clean_data/final_combine_data.csv")

# Whole columns are assigned at once. The previous `df.iloc[i][col] = ...` form is chained assignment,
# which only modifies df when iloc happens to return a view rather than a copy.
for list_column in ('salaries', 'semester', 'years_taught', 'course', 'num_students'):
    df[list_column] = df[list_column].apply(ast.literal_eval)
# these three columns are parsed as comma-separated numbers after stripping the surrounding brackets
for array_column in ('average_rating', 'num_reviews', 'average_gpa'):
    df[array_column] = df[array_column].apply(lambda text: np.fromstring(text.strip("[]"), sep=','))

df = df.drop([1615, 1616])


# Getting the minimum year

# smallest entry of each row's years_taught list
df['min_year'] = df['years_taught'].apply(lambda years: min(years))
# Getting lifespan of professor from first year taught to current semester that is being taught
# The semester year is converted to int before taking the minimum: left as a string it cannot be
# compared numerically with min_year in the row-wise min below.
df['year'] = df['semester'].apply(lambda sems: min(int(sem.split(" ")[1]) for sem in sems))
df['min_year'] = df[['min_year', 'year']].min(axis=1)
df = df.drop(['year'], axis=1)
# one row per entry of the parallel list columns
df = df.explode(['course', 'semester', 'average_rating', 'num_reviews', 'average_gpa', 'num_students'])
df['curr_year'] = df['semester'].apply(lambda sem: int(sem.split(" ")[1]))
df['lifetime'] = df['curr_year'] - df['min_year']
# negative lifetimes are treated as missing
df.loc[df['lifetime'] < 0, 'lifetime'] = np.nan
# `n` is passed by keyword; the positional form is not accepted by pandas 2.0
df[['sem', 'year']] = df['semester'].str.split(' ', n=1, expand=True)

# Getting current salary
# get the current year, get the index in years taught, get the value of salaries at that index
def getIndex(year, array):
    """Return the position of the entry of *array* closest to *year*.

    When several entries are equally close, the first one wins.
    """
    distances = np.abs(np.array(array) - year)
    return np.argmin(distances)
def getSalary(index, salary):
    """Return the element of *salary* stored at *index*."""
    selected = salary[index]
    return selected
# Position of the salary year closest to the row's current year, kept in a temporary 'index' column
nearest_year_position = df.apply(lambda rec: getIndex(rec['curr_year'], rec['years_taught']), axis=1)
df['index'] = nearest_year_position
# Salary recorded at that position
df['current_salary'] = df.apply(lambda rec: getSalary(rec['index'], rec['salaries']), axis=1)
# the helper column is no longer needed
df = df.drop(columns='index')


# Reload the final dataset and turn the stringified list columns back into lists / numpy arrays
df = pd.read_csv('src/2_clean_data/final_combine_data.csv')


# Whole columns are assigned at once. The previous `df.iloc[i][col] = ...` form is chained assignment,
# which only modifies df when iloc happens to return a view rather than a copy.
for list_column in ('salaries', 'semester', 'years_taught', 'course', 'num_students'):
    df[list_column] = df[list_column].apply(ast.literal_eval)
# these three columns are parsed as comma-separated numbers after stripping the surrounding brackets
for array_column in ('average_rating', 'num_reviews', 'average_gpa'):
    df[array_column] = df[array_column].apply(lambda text: np.fromstring(text.strip("[]"), sep=','))


df.head()


df = df.drop([1615, 1616]) #drop rows that don't have aligned data from PlanetTerp

# Getting the minimum year
# smallest entry of each row's years_taught list
df['min_year'] = df['years_taught'].apply(lambda years: min(years))

# Getting lifespan of professor from first year taught to current semester that is being taught
# The semester year is converted to int before taking the minimum: left as a string it cannot be
# compared numerically with min_year in the row-wise min below.
df['year'] = df['semester'].apply(lambda sems: min(int(sem.split(" ")[1]) for sem in sems))
df['min_year'] = df[['min_year', 'year']].min(axis=1)
df = df.drop(['year'], axis=1)

df = df.explode(['course', 'semester', 'average_rating', 'num_reviews', 'average_gpa', 'num_students']) #columns consisting of data from PlanetTerp
df['curr_year'] = df['semester'].apply(lambda sem: int(sem.split(" ")[1]))
df['lifetime'] = df['curr_year'] - df['min_year']
# negative lifetimes are treated as missing
df.loc[df['lifetime'] < 0, 'lifetime'] = np.nan


def getIndex(year, array):
    """Return the position of the entry of *array* closest to *year*.

    When several entries are equally close, the first one wins.
    """
    gaps = np.abs(np.array(array) - year)
    return np.argmin(gaps)
def getSalary(index, salary):
    """Return the element of *salary* stored at *index*."""
    value = salary[index]
    return value
# Position of the salary year closest to the row's current year, kept in a temporary 'index' column
closest_year_position = df.apply(lambda rec: getIndex(rec['curr_year'], rec['years_taught']), axis=1)
df['index'] = closest_year_position
# Salary recorded at that position
df['current_salary'] = df.apply(lambda rec: getSalary(rec['index'], rec['salaries']), axis=1)
# the helper column is no longer needed
df = df.drop(columns='index')


df_codes = pd.read_csv('src/2_clean_data/code_departments.csv')


#Add the department that a course is a part of based on the major-department conversion table

# The first four characters of the course are looked up in the 'Major' column. Each distinct prefix is
# resolved once and cached, so the conversion table is not re-scanned for every row.
course_prefixes = df['course'].str[0:4]
department_by_prefix = {}
for prefix in course_prefixes.unique():
    matching_departments = df_codes[df_codes['Major'].str.contains(prefix)]['Department'].values
    # first matching department wins; raises IndexError when nothing matches
    department_by_prefix[prefix] = matching_departments[0]
# assigned positionally (.values) because the exploded frame carries repeated index labels
df['department'] = course_prefixes.map(department_by_prefix).values


#Reset index

df.reset_index(drop=True, inplace=True)


# Two side-by-side views of num_reviews: a histogram with a logarithmic y axis and a boxplot

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_axis, box_axis = axes

g = sns.histplot(x='num_reviews', data=df, ax=hist_axis)
g.set_yscale('log')
g.set_title("Number of Reviews written for a course")

# Boxplot of the same column; `b` holds the title object returned by set_title
box_plot = sns.boxplot(x='num_reviews', data=df, ax=box_axis)
b = box_plot.set_title("Distribution of number of reviews written for a course")

fig.tight_layout()


# Frequency histogram of the average GPA column, followed by its mean and median
plt.hist(df['average_gpa'])
plt.gca().set(title='Frequency Histogram of course GPAs', ylabel='Frequency', xlabel='Average GPA');
print("Average GPA: ", df['average_gpa'].mean())
# label corrected: this value is the median, it was previously also printed as "Average GPA"
print("Median GPA: ", df['average_gpa'].median())

Average GPA:  3.433031475294573
Average GPA:  3.51


#Average rating plots

#Most students give classes ratings close to 5/5
#The mean of the average rating is about 4, and it goes down for courses with more reviews written

# Top row: all rows of df. Bottom row: only rows with at least `numReviews` reviews.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
numReviews = 5
enough_reviews = df[df['num_reviews'] >= numReviews]

panels = (
    (axes[0], df, "Average ratings given for courses"),
    (axes[1], enough_reviews, "Average ratings given for courses with >= " + str(numReviews) + " reviews"),
)
for axis_row, frame, hist_title in panels:
    # histogram on the left, boxplot on the right
    sns.histplot(data=frame, x='average_rating', ax=axis_row[0]).set_title(hist_title)
    sns.boxplot(data=frame, x='average_rating', ax=axis_row[1])

fig.tight_layout()


# Pairwise scatter plots of the numeric columns, using the rows whose num_reviews is >= 0
df_test = df.loc[df['num_reviews'] >= 0]

pairplot_columns = ["average_rating", "average_gpa", "lifetime", "current_salary", "num_students"]
g = sns.pairplot(df_test[pairplot_columns])


#Relationship between lifetime and salary

# Scatter plot with a regression line drawn on the same axes
fig, axes = plt.subplots(figsize=(10, 4))

numReviews = 0

rows_with_reviews = df[df['num_reviews'] >= 0]
scatter_axes = sns.scatterplot(data=rows_with_reviews, x='lifetime', y='current_salary', ax=axes)
scatter_axes.set_title("Teaching lifetime vs salary of professor")
sns.regplot(data=rows_with_reviews, x='lifetime', y='current_salary', ax=axes)
fig.tight_layout()


# One box per distinct lifetime value, showing the average_rating of the rows with that lifetime
fig, ax = plt.subplots()

has_student_count = df['num_students'].notna()
distinct_lifetimes = sorted(df.loc[df['lifetime'].notna(), 'lifetime'].unique())
# lifetime value -> average_rating of the rows with that lifetime and a known num_students
rating = {
    span: df.loc[(df['lifetime'] == span) & has_student_count, 'average_rating']
    for span in distinct_lifetimes
}

df_ratings = pd.DataFrame.from_dict(rating)
s = sns.boxplot(data=df_ratings)
s.set(title="Distribution of average ratings of courses vs. years of teaching experience at UMD", xlabel='Years of experience', ylabel='Average rating')
#sns.lineplot(x=np.arange(0,10), y=median_gpas, c='red', linewidth=3)

plt.show()


#GPA of courses taught by professors with different years of teaching experience
# One box per distinct lifetime value; the per-lifetime medians are collected in median_gpas
fig, ax = plt.subplots()
gpa = {}
median_gpas = []

has_gpa = df['average_gpa'].notna()
for years_taught in sorted(df.loc[df['lifetime'].notna(), 'lifetime'].unique()):
    gpas_for_lifetime = df.loc[(df['lifetime'] == years_taught) & has_gpa, 'average_gpa']
    gpa[years_taught] = gpas_for_lifetime
    median_gpas.append(gpas_for_lifetime.median())


df_gpa = pd.DataFrame.from_dict(gpa)
s = sns.boxplot(data=df_gpa)
s.set(title="Distribution of GPA of courses vs. years of teaching experience at UMD", xlabel='Years of Experience', ylabel='Average GPA')

plt.show()


#Average rating given in each department

# mean of average_rating per department, drawn as one bar per department
rating_by_department = df.groupby('department')['average_rating'].mean().reset_index()
s = sns.barplot(x='department', y='average_rating', data=rating_by_department)
s.set(title="Average rating given for a course per department")
plt.tight_layout()


#Average GPA given in each department

# mean of average_gpa per department, drawn as one bar per department
gpa_by_department = df.groupby('department')['average_gpa'].mean().reset_index()
s = sns.barplot(x='department', y='average_gpa', data=gpa_by_department)
s.set(title="Average GPA given for a course per department")
plt.tight_layout()


# Column subsets used by the regressions below; rows with a missing value in any selected column are removed

# one predictor, one target
lifeTimeGrades = df[['lifetime', 'average_gpa']].dropna()
lifeTimeRating = df[['lifetime', 'average_rating']].dropna()
salaryGrades = df[['current_salary', 'average_gpa']].dropna()
salaryRating = df[['current_salary', 'average_rating']].dropna()
# two predictors (lifetime and salary), one target
lsGrades = df[['lifetime', 'current_salary', 'average_gpa']].dropna()
lsRating = df[['lifetime', 'current_salary', 'average_rating']].dropna()


import sklearn.model_selection
import sklearn.linear_model
# Linear Regression Analysis

# Regression of lifetime vs grades
# Random 70/30 train/test split; the single feature is reshaped into one column for scikit-learn
lifetime_train, lifetime_test, grade_train, grade_test = sklearn.model_selection.train_test_split(
    lifeTimeGrades['lifetime'], lifeTimeGrades['average_gpa'], test_size=0.3
)
linear = sklearn.linear_model.LinearRegression()
linear.fit(lifetime_train.to_numpy().reshape(-1, 1), grade_train)
# score on the held-out rows
print(linear.score(lifetime_test.to_numpy().reshape(-1, 1), grade_test))
gradePredict = linear.predict(lifetime_test.to_numpy().reshape(-1, 1))
# held-out points with the fitted line on top
plt.scatter(lifetime_test.to_numpy().reshape(-1, 1), grade_test)
plt.plot(lifetime_test.to_numpy().reshape(-1, 1), gradePredict, color='red')
plt.gca().set(title='Lifetime vs Grades', xlabel='Lifetime', ylabel='Grades')
plt.show()

# Regression of lifetime vs starRating
# Random 70/30 train/test split; the single feature is reshaped into one column for scikit-learn
lifetime_train, lifetime_test, rating_train, rating_test = sklearn.model_selection.train_test_split(
    lifeTimeRating['lifetime'], lifeTimeRating['average_rating'], test_size=0.3
)
linear = sklearn.linear_model.LinearRegression()
linear.fit(lifetime_train.to_numpy().reshape(-1, 1), rating_train)
# score on the held-out rows
print(linear.score(lifetime_test.to_numpy().reshape(-1, 1), rating_test))
ratingPredict = linear.predict(lifetime_test.to_numpy().reshape(-1, 1))
# held-out points with the fitted line on top
plt.scatter(lifetime_test.to_numpy().reshape(-1, 1), rating_test)
plt.plot(lifetime_test.to_numpy().reshape(-1, 1), ratingPredict, color='red')
plt.gca().set(title='Lifetime vs Star Rating', xlabel='Lifetime', ylabel='Star rating')
plt.show()

# Regression of salary vs grades
# Random 70/30 train/test split; the single feature is reshaped into one column for scikit-learn
salary_train, salary_test, grade_train, grade_test = sklearn.model_selection.train_test_split(
    salaryGrades['current_salary'], salaryGrades['average_gpa'], test_size=0.3
)
linear = sklearn.linear_model.LinearRegression()
linear.fit(salary_train.to_numpy().reshape(-1, 1), grade_train)
# score on the held-out rows
print(linear.score(salary_test.to_numpy().reshape(-1, 1), grade_test))
gradePredict = linear.predict(salary_test.to_numpy().reshape(-1, 1))
# held-out points with the fitted line on top
plt.scatter(salary_test.to_numpy().reshape(-1, 1), grade_test)
plt.plot(salary_test.to_numpy().reshape(-1, 1), gradePredict, color='red')
plt.gca().set(title='Salary vs Grades', xlabel='Salary', ylabel='Grades')
plt.show()
# Regression of salary vs starRating
# Random 70/30 train/test split; the single feature is reshaped into one column for scikit-learn
salary_train, salary_test, rating_train, rating_test = sklearn.model_selection.train_test_split(
    salaryRating['current_salary'], salaryRating['average_rating'], test_size=0.3
)
linear = sklearn.linear_model.LinearRegression()
linear.fit(salary_train.to_numpy().reshape(-1, 1), rating_train)
# score on the held-out rows
print(linear.score(salary_test.to_numpy().reshape(-1, 1), rating_test))
ratingPredict = linear.predict(salary_test.to_numpy().reshape(-1, 1))
# held-out points with the fitted line on top
plt.scatter(salary_test.to_numpy().reshape(-1, 1), rating_test)
plt.plot(salary_test.to_numpy().reshape(-1, 1), ratingPredict, color='red')
plt.gca().set(title='Salary vs Star Rating', xlabel='Salary', ylabel='Star rating')
plt.show()

# 3d Regression of salary and lifetime as x variables and grades as y variable
X = lsGrades.drop(columns='average_gpa')
y = lsGrades['average_gpa']
ls_train, ls_test, grade_train, grade_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)
# fitted on a plain array, scored and predicted on the held-out DataFrame
linear = sklearn.linear_model.LinearRegression()
linear.fit(ls_train.to_numpy(), grade_train)
print(linear.score(ls_test, grade_test))
gradePredict = linear.predict(ls_test)

fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

# held-out observations
ax.scatter3D(ls_test['lifetime'], ls_test['current_salary'], grade_test)
ax.set(title='Lifetime vs Salary vs Grades', xlabel='Lifetime', ylabel='Salary', zlabel='Grades')
# salary ticks are shown in thousands of dollars
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda value, _pos: f'${value/1000:.0f}k'))

#Create a meshgrid for the linear regression surface plot
x = np.linspace(ls_test['lifetime'].min(), ls_test['lifetime'].max(), 100)
y = np.linspace(ls_test['current_salary'].min(), ls_test['current_salary'].max(), 100)
X, Y = np.meshgrid(x, y)

#Predict GPA values for every grid point and restore the grid shape
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = linear.predict(grid_points).reshape(X.shape)

ax.plot_surface(X, Y, Z, color='r', alpha=0.5, label='Regression Plane')
plt.show()

# 3d Regression of salary and lifetime as x variables and star rating as y variable
X = lsRating.drop(columns='average_rating')
y = lsRating['average_rating']
ls_train, ls_test, rating_train, rating_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)
# fitted on a plain array, scored and predicted on the held-out DataFrame
linear = sklearn.linear_model.LinearRegression()
linear.fit(ls_train.to_numpy(), rating_train)
print(linear.score(ls_test, rating_test))
ratingPredict = linear.predict(ls_test)

fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

# held-out observations
ax.scatter3D(ls_test['lifetime'], ls_test['current_salary'], rating_test)
ax.set(title='Lifetime vs Salary vs Star Rating', xlabel='Lifetime', ylabel='Salary', zlabel='Rating')
# salary ticks are shown in thousands of dollars
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda value, _pos: f'${value/1000:.0f}k'))

#Create a meshgrid for the linear regression surface plot
x = np.linspace(ls_test['lifetime'].min(), ls_test['lifetime'].max(), 100)
y = np.linspace(ls_test['current_salary'].min(), ls_test['current_salary'].max(), 100)
X, Y = np.meshgrid(x, y)

#Predict rating values for every grid point and restore the grid shape
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = linear.predict(grid_points).reshape(X.shape)

ax.plot_surface(X, Y, Z, color='r', alpha=0.5, label='Regression Plane')
plt.show()

0.007793159218596135

-0.0002646354774540338

0.0006477196794330009

0.0034571856739065154

0.004349964113033233

0.006626511678791802


#Logistic regression of lifetime vs grades
# The target is cast to int so that it can be used as a class label
lifetime_train, lifetime_test, grade_train, grade_test = sklearn.model_selection.train_test_split(
    lifeTimeGrades['lifetime'], lifeTimeGrades['average_gpa'], test_size=0.3
)
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(lifetime_train.to_numpy().reshape(-1, 1), grade_train.astype('int'))
# score on the held-out rows
print(logistic.score(lifetime_test.to_numpy().reshape(-1, 1), grade_test.astype('int')))
gradePredict = logistic.predict(lifetime_test.to_numpy().reshape(-1, 1))
# held-out points (uncast target) with the predicted classes on top
plt.scatter(lifetime_test.to_numpy().reshape(-1, 1), grade_test)
plt.plot(lifetime_test.to_numpy().reshape(-1, 1), gradePredict, color='red')
plt.gca().set(title='Lifetime vs Grades', xlabel='Lifetime', ylabel='Grades')
plt.show()

# Logistic regression of lifetime vs starRating
# The target is cast to int so that it can be used as a class label
lifetime_train, lifetime_test, rating_train, rating_test = sklearn.model_selection.train_test_split(
    lifeTimeRating['lifetime'], lifeTimeRating['average_rating'], test_size=0.3
)
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(lifetime_train.to_numpy().reshape(-1, 1), rating_train.astype('int'))
# score on the held-out rows
print(logistic.score(lifetime_test.to_numpy().reshape(-1, 1), rating_test.astype('int')))
ratingPredict = logistic.predict(lifetime_test.to_numpy().reshape(-1, 1))
# held-out points (uncast target) with the predicted classes on top
plt.scatter(lifetime_test.to_numpy().reshape(-1, 1), rating_test)
plt.plot(lifetime_test.to_numpy().reshape(-1, 1), ratingPredict, color='red')
plt.gca().set(title='Lifetime vs Star Rating', xlabel='Lifetime', ylabel='Star rating')
plt.show()

# Logistic regression of salary vs grades
# The target is cast to int so that it can be used as a class label
salary_train, salary_test, grade_train, grade_test = sklearn.model_selection.train_test_split(
    salaryGrades['current_salary'], salaryGrades['average_gpa'], test_size=0.3
)
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(salary_train.to_numpy().reshape(-1, 1), grade_train.astype('int'))
# score on the held-out rows
print(logistic.score(salary_test.to_numpy().reshape(-1, 1), grade_test.astype('int')))
gradePredict = logistic.predict(salary_test.to_numpy().reshape(-1, 1))
# held-out points (uncast target) with the predicted classes on top
plt.scatter(salary_test.to_numpy().reshape(-1, 1), grade_test)
plt.plot(salary_test.to_numpy().reshape(-1, 1), gradePredict, color='red')
plt.gca().set(title='Salary vs Grades', xlabel='Salary', ylabel='Grades')
plt.show()

# Logistic regression of salary vs starRating
# The target is cast to int so that it can be used as a class label
salary_train, salary_test, rating_train, rating_test = sklearn.model_selection.train_test_split(
    salaryRating['current_salary'], salaryRating['average_rating'], test_size=0.3
)
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(salary_train.to_numpy().reshape(-1, 1), rating_train.astype('int'))
# score on the held-out rows
print(logistic.score(salary_test.to_numpy().reshape(-1, 1), rating_test.astype('int')))
ratingPredict = logistic.predict(salary_test.to_numpy().reshape(-1, 1))
# held-out points (uncast target) with the predicted classes on top
plt.scatter(salary_test.to_numpy().reshape(-1, 1), rating_test)
plt.plot(salary_test.to_numpy().reshape(-1, 1), ratingPredict, color='red')
plt.gca().set(title='Salary vs Star Rating', xlabel='Salary', ylabel='Star rating')
plt.show()

# 3d logistic regression of salary and lifetime as x variables and grades as y variable
X = lsGrades.drop(columns='average_gpa')
y = lsGrades['average_gpa']
ls_train, ls_test, grade_train, grade_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)
# the target is cast to int so that it can be used as a class label
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(ls_train.to_numpy(), grade_train.astype('int'))
print(logistic.score(ls_test, grade_test.astype('int')))
gradePredict = logistic.predict(ls_test)

fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

# held-out observations (uncast target)
ax.scatter3D(ls_test['lifetime'], ls_test['current_salary'], grade_test)
ax.set(title='Lifetime vs Salary vs Grades', xlabel='Lifetime', ylabel='Salary', zlabel='Grades')
# salary ticks are shown in thousands of dollars
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda value, _pos: f'${value/1000:.0f}k'))

#Create a meshgrid for the logistic regression surface plot
x = np.linspace(ls_test['lifetime'].min(), ls_test['lifetime'].max(), 100)
y = np.linspace(ls_test['current_salary'].min(), ls_test['current_salary'].max(), 100)
X, Y = np.meshgrid(x, y)

#Predict the GPA class for every grid point and restore the grid shape
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = logistic.predict(grid_points).reshape(X.shape)

ax.plot_surface(X, Y, Z, color='r', alpha=0.5, label='Regression Plane')
plt.show()

# 3d logistic regression of salary and lifetime as x variables and star rating as y variable
X = lsRating.drop(columns='average_rating')
y = lsRating['average_rating']
ls_train, ls_test, rating_train, rating_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3)
# the target is cast to int so that it can be used as a class label
logistic = sklearn.linear_model.LogisticRegression()
logistic.fit(ls_train.to_numpy(), rating_train.astype('int'))
print(logistic.score(ls_test, rating_test.astype('int')))
ratingPredict = logistic.predict(ls_test)

fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

# held-out observations (uncast target)
ax.scatter3D(ls_test['lifetime'], ls_test['current_salary'], rating_test)
ax.set(title='Lifetime vs Salary vs Star Rating', xlabel='Lifetime', ylabel='Salary', zlabel='Rating')
# salary ticks are shown in thousands of dollars
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda value, _pos: f'${value/1000:.0f}k'))

#Create a meshgrid for the logistic regression surface plot
x = np.linspace(ls_test['lifetime'].min(), ls_test['lifetime'].max(), 100)
y = np.linspace(ls_test['current_salary'].min(), ls_test['current_salary'].max(), 100)
X, Y = np.meshgrid(x, y)

#Predict the rating class for every grid point and restore the grid shape
grid_points = np.column_stack((X.ravel(), Y.ravel()))
Z = logistic.predict(grid_points).reshape(X.shape)

ax.plot_surface(X, Y, Z, color='r', alpha=0.5, label='Regression Plane')
plt.show()

0.7577045696068013

0.46554552912223135

0.7551468972085867

0.43117697349672274

0.7592577454426551

0.46267432321575064


def addCategorizedGrade(data):
    """Add a ``categorized_grade`` letter column to *data*, in place.

    Each row's ``average_gpa`` is compared against a descending list of
    exclusive lower bounds; the first bound it exceeds decides the letter.
    Values that exceed no bound (including missing values) become ``"F"``.

    The column is computed for all rows at once and assigned by position, so
    rows that share an index label each keep their own grade (the earlier
    row-by-row ``.loc[label]`` writes overwrote every row with that label).

    Args:
        data: DataFrame with an ``average_gpa`` column; modified in place.

    Returns:
        None.
    """
    # (exclusive lower bound, letter), highest bound first
    cutoffs = (
        (3.7, "A"), (3.3, "A-"), (3.0, "B+"), (2.7, "B"), (2.3, "B-"),
        (2.0, "C+"), (1.7, "C"), (1.3, "C-"), (1.0, "D+"), (0.7, "D"),
    )
    gpa_values = data['average_gpa']
    exceeds_bound = [(gpa_values > bound).to_numpy(dtype=bool) for bound, _ in cutoffs]
    letters = [letter for _, letter in cutoffs]
    # np.select picks the letter of the first satisfied condition for each row
    data['categorized_grade'] = np.select(exceeds_bound, letters, default="F")


# Creating categorized values for SVM

# copy of df restricted to rows with both a GPA and a lifetime
df_svm = df.copy()
df_svm = df_svm[df_svm['average_gpa'].notna() & df_svm['lifetime'].notna()]

# Row filters shared by the frames below
_has_lifetime = df['lifetime'].notna()
_has_salary = df['current_salary'].notna()
_has_gpa = df['average_gpa'].notna()
_has_rating = df['average_rating'].notna()

# Categorize grades
teacherLifetimeGrades = df[_has_lifetime & _has_gpa]
teacherLifetimeRating = df[_has_lifetime & _has_rating]
salaryGrades = df[_has_salary & _has_gpa]
salaryRating = df[_has_salary & _has_rating]
lsGrades = df[_has_lifetime & _has_salary & _has_gpa]
lsRating = df[_has_lifetime & _has_salary & _has_rating]

# the three GPA frames receive a 'categorized_grade' column
for _grade_frame in (teacherLifetimeGrades, salaryGrades, lsGrades):
    addCategorizedGrade(_grade_frame)

# keep only the predictor and target columns
teacherLifetimeGrades = teacherLifetimeGrades[['lifetime', 'categorized_grade']]
teacherLifetimeRating = teacherLifetimeRating[['lifetime', 'average_rating']]
salaryGrades = salaryGrades[['current_salary', 'categorized_grade']]
salaryRating = salaryRating[['current_salary', 'average_rating']]
lsGrades = lsGrades[['lifetime', 'current_salary', 'categorized_grade']]
lsRating = lsRating[['lifetime', 'current_salary', 'average_rating']]

#categorize rating data by casting it to int
teacherLifetimeRating = teacherLifetimeRating.assign(average_rating=teacherLifetimeRating['average_rating'].astype(int))
salaryRating = salaryRating.assign(average_rating=salaryRating['average_rating'].astype(int))
lsRating = lsRating.assign(average_rating=lsRating['average_rating'].astype(int))


def performanceMetrics(groundTruth, predictions):
    """Return the value of accuracy_score for *predictions* against *groundTruth*."""
    return accuracy_score(groundTruth, predictions)


# SVM Analysis

# Imported explicitly: the visible part of the file never imports sklearn.svm, so `sklearn.svm.SVC`
# only resolved if another sklearn import happened to load the submodule.
import sklearn.svm

# SVM of lifetime and salary vs grades
# Random 70/30 train/test split; accuracy is measured on the held-out rows
ls_train, ls_test, grade_train, grade_test = sklearn.model_selection.train_test_split(lsGrades[['lifetime', 'current_salary']], lsGrades['categorized_grade'], test_size=0.3)
svm = sklearn.svm.SVC().fit(np.array(ls_train), grade_train)
gradePredict = svm.predict(np.array(ls_test))

print("Accuracy of SVM of lifetime and salary and grades:", performanceMetrics(grade_test, gradePredict))

# SVM of lifetime and salary vs rating
ls_train, ls_test, rating_train, rating_test = sklearn.model_selection.train_test_split(lsRating[['lifetime', 'current_salary']], lsRating['average_rating'], test_size=0.3)
svm = sklearn.svm.SVC().fit(np.array(ls_train), rating_train)
ratingPredict = svm.predict(np.array(ls_test))

# label corrected: this model predicts the rating, it was previously printed as "grades"
print("Accuracy of SVM of lifetime and salary and rating:", performanceMetrics(rating_test, ratingPredict))

Accuracy of SVM of lifetime and salary and grades: 0.35641298128014387
Accuracy of SVM of lifetime and salary and grades: 0.43232157506152585


# merge on professor and course, no longer worried about semester

# One row per (professor, course) with the mean rating and the number of ratings.
# Grouping replaces the previous run-based loop, which skipped the first review of every new
# professor and never wrote out the last professor in reviews_df.
# Groups are listed in order of first appearance (sort=False).
avg_reviews_df = (
    reviews_df
    .groupby(["name", "course"], sort=False)
    .agg(
        slug=("slug", "first"),            # first slug seen for the pair
        average_rating=("rating", "mean"),
        num_ratings=("rating", "count"),
    )
    .reset_index()
)
avg_reviews_df = avg_reviews_df[["name", "course", "slug", "average_rating", "num_ratings"]]

avg_reviews_df.head()


# merge on professor and course, no longer worried about semester

# grade points awarded for each letter column of grades_df
gpa = {"A+":4.0, "A":4.0, "A-":3.7,"B+":3.3, "B":3.0, "B-":2.7, "C+":2.3, "C":2.0, "C-":1.7, "D+":1.3, "D":1.0, "D-":0.7, "F":0.0}

# Per grades_df row: total grade points and total number of letter grades
_letter_columns = list(gpa)
_letter_counts = grades_df[_letter_columns]
_row_totals = pd.DataFrame({
    "name": grades_df["professor"],
    "course": grades_df["course"],
    "points": _letter_counts.mul(pd.Series(gpa), axis="columns").sum(axis=1),
    "num_students": _letter_counts.sum(axis=1),
})

# Sum per (professor, course). Grouping replaces the previous run-based loop, which skipped the first
# row of every new professor and never wrote out the last professor in grades_df.
_pair_totals = _row_totals.groupby(["name", "course"], sort=False, as_index=False)[["points", "num_students"]].sum()
# pairs without any letter grade have no defined GPA and are left out
_pair_totals = _pair_totals[_pair_totals["num_students"] != 0]

avg_grades_df = pd.DataFrame({
    "name": _pair_totals["name"],
    "course": _pair_totals["course"],
    "average_gpa": _pair_totals["points"] / _pair_totals["num_students"],
    "num_students": _pair_totals["num_students"],
}).reset_index(drop=True)

avg_grades_df.head()


# inner join on name and course: keeps only the professor/course pairs present in both tables
both_avg_df = avg_reviews_df.merge(avg_grades_df, on=["name", "course"], how="inner")

both_avg_df.head()


# Regression of average rating vs average gpa
x = both_avg_df["average_rating"]
y = both_avg_df["average_gpa"]
reg = linear_model.LinearRegression()

# the model is given one single-element row per observation
x_train = [[rating] for rating in x.values]
y_train = [[grade] for grade in y.values]
regfit = reg.fit(x_train, y_train)
y_pred = reg.predict(x_train)
plt.close()
# observations as triangles, fitted line on top
plt.plot(x, y, "^")
plt.plot(x_train, y_pred)
plt.gca().set(title="Average Rating vs Average GPA", ylabel="Average GPA", xlabel="Average Rating")
plt.show()


# getting rid of professor/class combinations with less than 5 reviews

# A single boolean filter replaces the cell-by-cell copy; the result keeps its columns even when
# no row qualifies.
_kept_columns = ["name", "slug", "course", "average_rating", "num_ratings", "average_gpa", "num_students"]
five_or_more_df = both_avg_df.loc[both_avg_df["num_ratings"] >= 5, _kept_columns].reset_index(drop=True)

five_or_more_df.head()


# Regression of average rating vs average gpa, restricted to five_or_more_df
x = five_or_more_df["average_rating"]
y = five_or_more_df["average_gpa"]

reg = linear_model.LinearRegression()
# the model is given one single-element row per observation
x_train = [[rating] for rating in x.values]
y_train = [[grade] for grade in y.values]
regfit = reg.fit(x_train, y_train)
y_pred = reg.predict(x_train)
# observations as triangles, fitted line on top
plt.plot(x, y, "^")
plt.plot(x_train, y_pred)
plt.gca().set(title="Average Rating vs Average GPA (5 Reviews or More)", ylabel="Average GPA", xlabel="Average Rating")
plt.show()


# gets all of the professors that teach multiple classes
# multi_prof maps a professor's name to one
# [course, average_rating, num_ratings, average_gpa, num_students] list per class.
# Grouping by name replaces the previous run-based loop, whose stored lists lost the professor's
# first class, picked up the first class of the next professor, and never included the last professor.
_class_columns = ["course", "average_rating", "num_ratings", "average_gpa", "num_students"]
multi_prof = {}
for _prof_name, _prof_rows in five_or_more_df.groupby("name", sort=False):
    # professors with a single class are not of interest here
    if len(_prof_rows) > 1:
        multi_prof[_prof_name] = _prof_rows[_class_columns].values.tolist()

# Put these professors in a dataframe, sorted by the number of classes they teach.
# Building the frame from complete columns avoids growing it cell by cell and
# guarantees both columns exist even when no professor qualifies (sorting a
# column-less frame by "num_classes" would raise KeyError).
multi_prof_df = pd.DataFrame(
    {
        "name": list(multi_prof),
        "num_classes": [len(class_list) for class_list in multi_prof.values()],
    }
)

multi_prof_df = multi_prof_df.sort_values(by="num_classes", ascending=False, ignore_index=True)

# Names of the (up to) 14 professors with the most classes. head() simply
# returns fewer names when fewer than 14 professors qualify, whereas indexing
# labels 0..13 one by one raises KeyError in that case.
most_classes_profs = multi_prof_df["name"].head(14).tolist()

# Copy over the rows of those professors in a single vectorised selection.
prof_columns = ["name", "course", "average_rating", "num_ratings",
                "average_gpa", "num_students"]
teaches_most = five_or_more_df["name"].isin(most_classes_profs)
most_classes_profs_df = five_or_more_df.loc[teaches_most, prof_columns].reset_index(drop=True)

most_classes_profs_df.head()


# One figure per selected professor: every class is a labelled marker and a
# regression line of GPA on rating is drawn through them.
for prof_name in most_classes_profs:
    prof_classes = multi_prof.get(prof_name)

    # each stored record is [course, rating, num_ratings, gpa, num_students]
    courses = [record[0] for record in prof_classes]
    ratings = [record[1] for record in prof_classes]
    gpas = [record[3] for record in prof_classes]

    # plot the points one at a time so each class gets its own legend entry
    for course, rating, gpa in zip(courses, ratings, gpas):
        plt.plot(rating, gpa, "^", label=course)

    plt.xlabel("Average Rating")
    plt.ylabel("Average GPA")
    plt.title("Average Rating vs Average GPA (Classes Taught by " + prof_name + ")")
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    # linear regression for this professor
    ratings = np.array(ratings).reshape(-1, 1)
    reg = linear_model.LinearRegression()
    regfit = reg.fit(ratings, gpas)

    # evaluate the line far beyond the axes, then clip the view to the data range
    X = np.linspace(-100, 100)
    y_pred = reg.predict(X.reshape(-1, 1))
    plt.plot(X, y_pred)
    plt.xlim(0.5, 5.5)
    plt.ylim(-0.5, 4.5)
    plt.show()


# Map every class taught by more than one professor to the list of those
# professors, each stored as
# [name, average_rating, num_ratings, average_gpa, num_students].
#
# groupby hands us exactly the rows that belong to one course. The earlier
# hand-written scan appended the current row *before* comparing courses and
# then cleared the buffer, so each stored list was missing the course's first
# professor, contained the first professor of the *next* course, and the last
# course in the frame was never stored at all.
prof_fields = ["name", "average_rating", "num_ratings", "average_gpa", "num_students"]

# The sort by course (and the sort back by name below) is kept so that the
# dictionary order and the row order of five_or_more_df seen by later cells
# stay what they were.
five_or_more_df = five_or_more_df.sort_values(by="course", ignore_index=True)

multi_course = {
    course_code: course_rows[prof_fields].values.tolist()
    for course_code, course_rows in five_or_more_df.groupby("course", sort=False)
    if len(course_rows) > 1
}

five_or_more_df = five_or_more_df.sort_values(by="name", ignore_index=True)

# Store those classes in a dataframe, sorted by the number of professors that
# have taught each one. Building the frame from complete columns avoids growing
# it cell by cell and guarantees both columns exist even when no class
# qualifies (sorting a column-less frame by "num_profs" would raise KeyError).
multi_course_df = pd.DataFrame(
    {
        "course": list(multi_course),
        "num_profs": [len(prof_list) for prof_list in multi_course.values()],
    }
)

multi_course_df = multi_course_df.sort_values(by="num_profs", ascending=False, ignore_index=True)

# Keep only the (up to) 10 courses taught by the most professors. head() simply
# returns fewer courses when fewer than 10 qualify, whereas indexing labels
# 0..9 one by one raises KeyError in that case.
most_profs_courses = multi_course_df["course"].head(10).tolist()

# Copy over the rows of those courses in a single vectorised selection.
course_columns = ["name", "course", "average_rating", "num_ratings",
                  "average_gpa", "num_students"]
is_top_course = five_or_more_df["course"].isin(most_profs_courses)
most_profs_courses_df = five_or_more_df.loc[is_top_course, course_columns].reset_index(drop=True)

most_profs_courses_df.head()


# One figure per selected course: every professor is a labelled marker and a
# regression line of GPA on rating is drawn through them.
for course_code in most_profs_courses:
    course_profs = multi_course.get(course_code)

    # each stored record is [name, rating, num_ratings, gpa, num_students]
    profs = [record[0] for record in course_profs]
    ratings = [record[1] for record in course_profs]
    gpas = [record[3] for record in course_profs]

    # plot the points one at a time so each professor gets their own legend entry
    for prof, rating, gpa in zip(profs, ratings, gpas):
        plt.plot(rating, gpa, "^", label=prof)

    plt.xlabel("Average Rating")
    plt.ylabel("Average GPA")
    plt.title("Average Rating vs Average GPA (Different Professors Teaching " + course_code + ")")
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    # linear regression for gpa vs rating
    ratings = np.array(ratings).reshape(-1, 1)
    reg = linear_model.LinearRegression()
    regfit = reg.fit(ratings, gpas)

    # evaluate the line far beyond the axes, then clip the view to the data range
    X = np.linspace(-100, 100)
    y_pred = reg.predict(X.reshape(-1, 1))
    plt.plot(X, y_pred)
    plt.xlim(0.5, 5.5)
    plt.ylim(-0.5, 4.5)
    plt.show()


import nltk
# Imported explicitly here: the name is not brought into scope by any import
# visible at the top of the file, so the cell would otherwise depend on an
# import made elsewhere.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER needs its lexicon available locally before the analyzer is created
nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()

# reviews without any text cannot be scored
reviews_df = reviews_df.dropna(subset=["review"])

# Compound sentiment score for each review, computed for the whole column at
# once instead of writing one cell per iteration with `.at`.
reviews_df["sentiment"] = reviews_df["review"].apply(
    lambda review_text: sia.polarity_scores(review_text)["compound"]
)

# Average rating and average compound sentiment for each professor/class
# combination: one output row per (name, course) pair.
#
# groupby counts every review. The earlier hand-written scan discarded the
# first review of each professor after the first one (the branch that detected
# a new name reset its state without recording the current row), never wrote
# out the last professor, and read label 0 of reviews_df, which is not
# guaranteed to exist once rows without review text have been dropped.
#
# sort=False keeps the pairs in order of first appearance. dropna=False keeps
# reviews that have no course, grouped together per professor, rather than
# silently excluding them.
average_sentiment_df = (
    reviews_df
    .groupby(["name", "course"], sort=False, dropna=False)
    .agg(
        slug=("slug", "first"),
        average_rating=("rating", "mean"),
        num_ratings=("rating", "size"),
        average_sentiment=("sentiment", "mean"),
    )
    .reset_index()
)

average_sentiment_df.head()


# Scatter of average rating against average sentiment with a fitted regression line
x = average_sentiment_df["average_rating"]
y = average_sentiment_df["average_sentiment"]

# the estimator is given 2-D inputs: one single-value row per observation
x_train = x.values.reshape(-1, 1)
y_train = y.values.reshape(-1, 1)

reg = linear_model.LinearRegression()
regfit = reg.fit(x_train, y_train)
y_pred = reg.predict(x_train)

plt.plot(x, y, "^")
plt.plot(x_train, y_pred)
plt.xlabel("Average Rating")
plt.ylabel("Average Sentiment")
plt.title("Average Rating vs Average Sentiment")
plt.show()


# gets the average sentiment of a professor across all of their classes

def combine(group):
    """Collapse one group of rows into the plain mean of three columns.

    Returns a Series indexed by 'average_rating', 'num_ratings' and
    'average_sentiment'; each entry is the sum of that column's values in
    `group` divided by the number of rows. Note that 'num_ratings' is
    therefore the mean of the per-row counts, not their total.
    """
    averaged = {}
    for column in ("average_rating", "num_ratings", "average_sentiment"):
        values = group[column].tolist()
        averaged[column] = sum(values) / len(values)
    return pd.Series(averaged)

# one row per professor, averaging over all of their classes
per_prof_groups = average_sentiment_df.groupby(['name'])
average_prof_df = per_prof_groups.apply(combine).reset_index()

# Reduce every name to "FIRST LAST" in upper case, built from its first and
# last whitespace-separated tokens; the frame is joined on this column later.
first_tokens = average_prof_df["name"].apply(lambda full_name: full_name.split()[0].upper())
last_tokens = average_prof_df["name"].apply(lambda full_name: full_name.split()[-1].upper())
average_prof_df["name"] = first_tokens + " " + last_tokens

average_prof_df.head()


# Read our existing combined data and reduce it to one average salary per professor.

combined_data_df = pd.read_csv("src/2_clean_data/final_combine_data.csv")

# only the name and the salary list are needed from here on
combined_data_df = combined_data_df.drop(columns=["course", "semester", "average_rating", "num_reviews", "average_gpa",
                                                  "num_students", "years_taught", "departments", "real_name(s)", "_merge"])


def _average_salary(salaries_text):
    """Return the mean of a stringified salary list such as "[48000.0, 48960.0]".

    The CSV stores each list as text. literal_eval only accepts Python
    literals, so unlike eval it cannot execute code that ends up in the file.
    An empty list yields NaN instead of raising.
    """
    salaries = literal_eval(salaries_text)
    if not salaries:
        return np.nan
    return sum(salaries) / len(salaries)


# compute the average salary for every row in one pass over the column
combined_data_df["average_salary"] = combined_data_df["salaries"].apply(_average_salary)

combined_data_df = combined_data_df.drop(columns=["salaries"])

combined_data_df.head()


# Combine sentiment and salary: an inner join keeps only the names that are
# present in both frames.
sal_sent_df = average_prof_df.merge(combined_data_df, on=["name"], how="inner")

sal_sent_df.head()


# Scatter of average salary against average sentiment with a fitted regression line
x = sal_sent_df["average_salary"]
y = sal_sent_df["average_sentiment"]

# the estimator is given 2-D inputs: one single-value row per observation
x_train = x.values.reshape(-1, 1)
y_train = y.values.reshape(-1, 1)

reg = linear_model.LinearRegression()
regfit = reg.fit(x_train, y_train)
y_pred = reg.predict(x_train)

plt.plot(x, y, "^")
plt.plot(x_train, y_pred)
plt.xlabel("Average Salary")
plt.ylabel("Average Sentiment")
plt.title("Average Salary vs Average Sentiment")
plt.show()

	name	slug	type	course	rating	review	date
0	A Anthony	anthony	professor	AMST203	1.0	By far the worst professor I’ve ever had, and ...	2018-08-17
1	A Kruglanski	kruglanski	professor	PSYC489H	2.0	DO NOT TAKE PSYC489H "Motivated Social Cogniti...	2015-09-07
2	A Sharma	sharma_a	professor	ASTR300	2.0	Very boring, it's hard to maintain your focus ...	2019-04-04
3	A Sharma	sharma_a	professor	ASTR300	1.0	You'll pass but this class will be the most bo...	2019-05-26
4	A Sharma	sharma_a	professor	ASTR300	1.0	Rather difficult course. Class is extremely bo...	2019-12-08

	Employee	years_taught	salaries	departments	name
0	A Karim, Eaman	[2018, 2019]	[48000.0, 48960.0]	['ENGR', 'ENGR']	EAMAN KARIM
1	A'Hearn, Michael F.	[2013, 2014, 2015, 2016, 2017]	[125817.0, 130849.69, 145530.01, 155925.01, 13...	['CMNS', 'CMNS', 'CMNS', 'CMNS', 'CMNS']	MICHAEL A'HEARN
2	AMIN, MOHAMMED NURUL	[2015]	[46500.0]	['CMNS']	MOHAMMED AMIN
3	Aarhus, William H	[2016, 2017, 2018, 2019, 2020, 2021, 2022]	[75000.0, 75750.0, 75750.0, 77265.0, 80780.68,...	['SVPA', 'SVPA', 'SVPA', 'SVPA', 'SVPA', 'EXST...	WILLIAM AARHUS
4	Abadi, Berhane	[2022]	[31278.0]	['VPSA']	BERHANE ABADI

	name	slug	type	course	rating	review	date	year	season
0	A Anthony	anthony	professor	AMST203	1.0	By far the worst professor I’ve ever had, and ...	2018-08-17	2018	summer
1	A Kruglanski	kruglanski	professor	PSYC489H	2.0	DO NOT TAKE PSYC489H "Motivated Social Cogniti...	2015-09-07	2015	fall
2	A Sharma	sharma_a	professor	ASTR300	2.0	Very boring, it's hard to maintain your focus ...	2019-04-04	2019	spring
3	A Sharma	sharma_a	professor	ASTR300	1.0	You'll pass but this class will be the most bo...	2019-05-26	2019	spring
4	A Sharma	sharma_a	professor	ASTR300	1.0	Rather difficult course. Class is extremely bo...	2019-12-08	2019	fall

	professor	course	year	season	average_gpa	num_students
0	A Anthony	AMST202	2016	fall	2.462500	24.0
1	A Anthony	AMST202	2017	spring	2.934783	23.0
2	A Anthony	AMST203	2017	fall	2.796429	28.0
3	A Anthony	AMST203	2018	fall	3.204651	43.0
4	A Anthony	AMST203	2018	spring	2.476190	21.0

	name	course	year	season	average_rating	num_reviews	average_gpa	num_students
0	A Anthony	AMST203	2018	summer	1.0	1.0	NaN	NaN
1	A Kruglanski	PSYC489H	2015	fall	2.0	1.0	NaN	NaN
2	A Sharma	ASTR300	2019	fall	1.0	1.0	NaN	NaN
3	A Sharma	ASTR300	2019	spring	1.5	2.0	2.850877	57.0
4	A Sharma	ASTR300	2020	fall	2.0	1.0	NaN	NaN

Final Tutorial: Analysis of Professor Performance at UMD¶

Table of Contents ¶

Introduction ¶

Imports ¶

Data Collection ¶

Data Collection: PlanetTerp ¶

Diamondback ¶

PlanetTerp review data¶

PlanetTerp grade data¶

Grouped Diamondback salary data¶

Combining PlanetTerp Data ¶

Data Cleaning ¶

Duplicates in Diamondback ¶

Matching Names ¶

Getting Departments ¶

Missing Professors ¶

Professors with Missing Years ¶

Exploratory Analysis ¶

Analysis, Hypothesis Testing, and ML ¶

Independent Variables vs Dependent Variables ¶

Grades vs Reviews ¶

Sentiment Analysis ¶

Conclusions, Interpretations, and Insights ¶

	professor	course	semester	section	A+	A	A-	B+	B	B-	C+	C	C-	D+	D-	F	W	Other
0	A Anthony	AMST202	201608	0101	1.0	2.0	1.0	2.0	5.0	1.0	2.0	3.0	5.0	0.0	0.0	2.0	4.0	0.0
1	A Anthony	AMST202	201701	0101	0.0	1.0	4.0	3.0	3.0	7.0	3.0	2.0	0.0	0.0	0.0	0.0	1.0	1.0
2	A Anthony	AMST203	201708	FCH1	0.0	2.0	1.0	1.0	6.0	2.0	1.0	0.0	0.0	0.0	0.0	0.0	4.0	0.0
3	A Anthony	AMST203	201708	FCH2	0.0	0.0	1.0	4.0	3.0	0.0	0.0	4.0	1.0	1.0	1.0	0.0	2.0	0.0
4	A Anthony	AMST203	201801	0201	1.0	3.0	1.0	2.0	4.0	0.0	2.0	3.0	1.0	0.0	2.0	2.0	6.0	0.0

	name	course	semester	average_rating	num_reviews	average_gpa	num_students
0	A ANTHONY	[AMST203, AMST202, AMST202, AMST203, AMST203, ...	[summer 2018, fall 2016, spring 2017, fall 201...	[1.0, nan, nan, nan, nan, nan]	[1.0, nan, nan, nan, nan, nan]	[nan, 2.46, 2.93, 2.8, 3.2, 2.48]	[0, 24, 23, 28, 43, 21]
1	A KRUGLANSKI	[PSYC489H, PSYC489H, PSYC489H, PSYC489T, PSYC4...	[fall 2015, spring 2014, spring 2015, spring 2...	[2.0, nan, nan, nan, nan, nan, nan, nan, nan, ...	[1.0, nan, nan, nan, nan, nan, nan, nan, nan, ...	[nan, 3.51, 3.55, 3.89, 3.46, 3.56, 3.52, 3.45...	[0, 16, 8, 17, 20, 14, 19, 24, 10, 19, 3, 10, ...
2	A SHARMA	[ASTR300, ASTR300, ASTR300, ASTR300, ASTR300, ...	[fall 2019, spring 2019, fall 2020, winter 202...	[1.0, 1.5, 2.0, 3.0, nan, nan, nan, nan]	[1.0, 2.0, 1.0, 1.0, nan, nan, nan, nan]	[nan, 2.85, nan, nan, 2.92, 2.98, 2.69, 3.28]	[0, 57, 0, 0, 51, 59, 47, 58]
3	A.U. SHANKAR	[CMSC216, CMSC216, CMSC216, CMSC216, CMSC216, ...	[fall 2017, spring 2018, winter 2018, fall 201...	[1.0, 2.0, 1.0, 1.0, 3.0, 2.0, 2.2, 2.66666666...	[1.0, 1.0, 1.0, 2.0, 4.0, 2.0, 5.0, 3.0, 2.0, ...	[2.62, nan, nan, 1.99, 2.23, 2.7, 2.32, nan, 3...	[120, 0, 0, 114, 125, 55, 102, 0, 33, 39, 0, 3...
4	AARON BARTLETT	[ENGL101, ENGL265, ENGL265, ENGL101, ENGL243]	[fall 2020, fall 2022, spring 2023, fall 2019,...	[1.0, 1.0, 4.0, nan, nan]	[1.0, 2.0, 1.0, nan, nan]	[nan, 3.3, nan, 2.66, 3.37]	[0, 29, 0, 19, 18]

	name	course	average_gpa	num_students
0	A Anthony	AMST202	2.693617	47.0
1	A Anthony	AMST203	2.914130	92.0
2	A Kruglanski	PSYC489H	3.550000	8.0
3	A Kruglanski	PSYC489T	3.561702	94.0
4	A Kruglanski	PSYC604	3.544828	29.0

	name	slug	course	average_rating	num_ratings	average_gpa	num_students
0	A.U. Shankar	shankar_a.u.	CMSC216	2.222222	18.0	2.357948	692.0
1	A.U. Shankar	shankar_a.u.	CMSC412	3.200000	5.0	2.809319	279.0
2	Aaron Finkle	finkle	ECON306	2.500000	8.0	2.706983	2678.0
3	Aaron Swanlek	swanlek	COMM107	5.000000	7.0	3.938579	394.0
4	Abigail Nicolas	nicolas_abigail	PSYC221	5.000000	8.0	3.557846	325.0

	name	course	average_rating	num_ratings	average_gpa	num_students
0	Bonnie Dixon	BCHM463	2.666667	12.0	2.769094	1291.0
1	Bonnie Dixon	CHEM131	3.842105	19.0	2.707407	243.0
2	Bonnie Dixon	CHEM134	2.166667	6.0	2.723111	225.0
3	Bonnie Dixon	CHEM135	3.758621	29.0	2.612654	893.0
4	Bonnie Dixon	CHEM231	4.205882	68.0	2.730612	2829.0

	name	average_rating	num_ratings	average_sentiment
0	A ANTHONY	1.000000	1.0	-0.493900
1	A SHARMA	2.000000	4.0	-0.494300
2	A.U. SHANKAR	2.711111	11.5	0.174226
3	AARON BARTLETT	2.000000	3.0	0.171600
4	AARON FINKLE	2.500000	8.0	0.229350

	name	average_salary
0	A SHARMA	86125.633000
1	AARON FINKLE	72452.691667
2	AARON HOOD	54602.026667
3	AARON SWANLEK	41046.400000
4	ABDIRISAK MOHAMED	17882.856667

	Major	Department
0	BMAC	BMGT
1	PMAM	ENGR
2	Z115	ENGR
3	ENAE	ENGR
4	PMAE	ENGR

	name	course	average_rating	num_ratings	average_gpa	num_students
0	Aaron Swanlek	COMM107	5.000000	7.0	3.938579	394.0
1	Alice Mignerey	CHEM131	2.023256	43.0	2.752637	1896.0
2	Amanda Allen	ENGL101	4.111111	9.0	3.275768	879.0
3	Amanda Schech	CHEM131	3.266667	15.0	2.712098	1463.0
4	Amanda Schech	CHEM135	4.166667	6.0	2.678378	37.0