Practice K-Nearest Neighbors using a dataset consisting of information on private and non-private universities.
Dataset source: https://www.kaggle.com/faressayah/college-data
Data Dictionary:
private - private (yes) or not private (no)
apps - number of applications received
accept - number of applications accepted
enroll - number of new students enrolled
top10perc - percent new students from top 10% of high school
top25perc - percent new students from top 25% of high school
f_undergrad - number of full-time undergraduates
p_undergrad - number of part-time undergraduates
outstate - out of state tuition
room_board - room and board costs
books - estimated book costs
personal - estimated personal spending
phd - percent of faculty with PhDs
terminal - percent of faculty with terminal degree
s_f_ratio - student to faculty ratio
perc_alumni - percent alumni who donate
expend - instructional expenditure per student
grad_rate - graduation rate
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
# Load the college dataset from data.csv and preview its first five rows.
# NOTE(review): the bare expressions below (head / shape / isnull) only display
# output in a notebook or REPL; they have no visible effect when run as a script.
data = pd.read_csv('data.csv')
data.head()
# Dimensions of the frame as (rows, columns).
data.shape
# Count missing values per column to confirm there are no nulls to handle.
data.isnull().sum()
# Pie chart showing the percentage of private vs. not-private universities.
# value_counts() orders categories by frequency, so the slice labels are derived
# from its index instead of a fixed list; a hard-coded ['Private', 'Not Private']
# is only correct while private schools happen to be the majority class.
count = data['private'].value_counts()
# NOTE(review): assumes private schools are encoded as a "yes"-like value
# (per the data dictionary: yes = private, no = not private) — confirm against the CSV.
private_values = {'yes', 'y', 'true', '1'}
labels = [
    'Private' if str(value).strip().lower() in private_values else 'Not Private'
    for value in count.index
]
plt.figure(figsize=(10, 5))
plt.pie(count, labels=labels, autopct="%.1f%%", startangle=90)
plt.title('Percent of Private vs Not Private Universities', fontsize=15)
plt.show()
# Two stacked scatter panels: applications -> acceptances on top and
# acceptances -> enrollments below, with points colored by institution type.
fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(8, 8))
panels = [
    (ax_top, 'apps', 'accept', 'Application vs Acceptances'),
    (ax_bottom, 'accept', 'enroll', 'Acceptance vs Enrollment'),
]
for ax, x_col, y_col, panel_title in panels:
    ax.set_title(panel_title, fontsize=15)
    sns.scatterplot(x=x_col, y=y_col, data=data, hue='private', ax=ax)
    ax.legend(loc='upper right', title='Private')
fig.tight_layout(pad=1.0)
plt.show()
# Distribution of full-time undergraduate enrollment, split by institution type.
# Public schools seem to have larger enrollments, while most private schools
# have 5,000 or fewer full-time undergraduates.
plt.figure(figsize=(9, 5))
sns.histplot(data, x='f_undergrad', hue='private')
plt.title('Full-Time Undergraduates Distribution', fontsize=15)
# Axis label previously read "Undegraduates" (typo).
plt.xlabel('Number of Full-Time Undergraduates')
plt.show()
# Student/faculty ratio against full-time undergraduate enrollment,
# with points colored by institution type.
fig, ax = plt.subplots(figsize=(9, 5))
sns.scatterplot(data=data, x='s_f_ratio', y='f_undergrad', hue='private', ax=ax)
ax.set_title('Full Time Undergraduates vs Student/Faculty Ratio', fontsize=15)
ax.set_xlabel('Student/Faculty Ratio')
ax.set_ylabel('Full Time Undergraduates')
ax.legend(loc='upper right', title='Private')
plt.show()
# Histogram of out-of-state tuition, split by institution type.
fig, ax = plt.subplots(figsize=(9, 5))
sns.histplot(data=data, x='outstate', hue='private', ax=ax)
ax.set_title('Out of State Tuition Distribution', fontsize=15)
ax.set_xlabel('Out of State Tuition')
plt.show()
# Cap graduation rates at 100 before plotting: any value above 100 in the
# column is overwritten in place with 100.
above_cap = data['grad_rate'] > 100
data.loc[above_cap, 'grad_rate'] = 100
# Histogram of the (capped) graduation rate, split by institution type.
fig, ax = plt.subplots(figsize=(9, 5))
sns.histplot(data=data, x='grad_rate', hue='private', ax=ax)
ax.set_title('Graduation Rate By Institution Type', fontsize=15)
ax.set_xlabel('Graduation Rate')
plt.show()
# Graduation rate against full-time undergraduate enrollment, with points
# colored by institution type.
plt.figure(figsize=(9, 5))
# Title previously read "Undegrads" (typo).
plt.title('Full Time Undergrads vs Graduation Rate', fontsize=15)
sns.scatterplot(x='grad_rate', y='f_undergrad', data=data, hue='private')
plt.xlabel('Graduation Rate %')
plt.ylabel('Full Time Undergraduates')
plt.legend(loc='upper right', title='Private')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# The target is the `private` column; the features are every other column.
y = data['private']
X = data.drop(columns=['private'])
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Reporting helpers: confusion-matrix heatmap, accuracy score, and
# classification report for whatever fitted model is passed in.
def _report_split(header, y_true, y_pred):
    """Print and plot the evaluation summary for a single data split.

    Args:
        header: Banner line printed before the metrics.
        y_true: Ground-truth labels for the split.
        y_pred: Labels the model predicted for the same rows.
    """
    print(header)
    clf_report = pd.DataFrame(
        classification_report(y_true, y_pred, output_dict=True)
    )
    conf_matrix = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix: \n')
    # fmt='d' renders the cell counts as plain integers; seaborn's default
    # '.2g' annotation format shows counts of 100 or more as e.g. 1.5e+02.
    sns.heatmap(conf_matrix, annot=True, fmt='d')
    plt.show()
    print(f'Accuracy Score:\n{accuracy_score(y_true, y_pred):.4f}')
    print(f'\nClassification Report:\n{clf_report}')


def results(model, X_train, X_test, y_train, y_test):
    """Report how a fitted classifier performs on the train and test splits.

    For each split this prints a banner, shows the confusion matrix as a
    heatmap, and prints the accuracy score (4 decimals) and the
    classification report as a DataFrame.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_train: Feature rows the model was trained on.
        X_test: Held-out feature rows.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.

    Returns:
        None. All output is printed or plotted.
    """
    _report_split('TRAINING RESULTS: \n', y_train, model.predict(X_train))
    _report_split('\nTESTING RESULTS: \n', y_test, model.predict(X_test))
# Baseline model: k-nearest neighbors with k=2, fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
results(knn, X_train, X_test, y_train, y_test)
# Quick look at the first five test-set predictions.
first_predictions = knn.predict(X_test)[:5]
print(first_predictions)
# Sweep k over 2..39 (range() excludes 40), fitting a fresh model for each k
# and recording its accuracy on the test split.
# NOTE(review): scoring candidate k values on the test split means the chosen k
# has effectively seen the test data; cross-validating on the training split
# would avoid that — confirm whether this is intended.
k_values = range(2, 40)
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scores.append(accuracy_score(y_test, knn.predict(X_test)))
# Plot accuracy against k using the same range the loop iterated over.
plt.figure(figsize=(9, 5))
plt.plot(k_values, scores)
plt.title('Accuracy vs KNN')
plt.xlabel("KNN")
plt.ylabel("Accuracy")
plt.show()
# Refit with k=5, a candidate chosen from the accuracy-vs-k plot.
knn2 = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
results(knn2, X_train, X_test, y_train, y_test)
# Quick look at the first five test-set predictions.
print(knn2.predict(X_test)[:5])
# Refit once more with k=7 and report train/test performance for comparison.
knn3 = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
results(knn3, X_train, X_test, y_train, y_test)
Here we see our testing accuracy becoming slightly greater than the training accuracy, which means that overfitting could be occurring (or about to occur), so I probably would not go higher than k=7.