Project Purpose

Practice K Nearest Neighbors using a dataset consisting of information on Private and Non-Private Universities.

Dataset source: https://www.kaggle.com/faressayah/college-data

Data Dictionary:

private - private (yes) or not private (no)
apps - number of applications received
accept - number of applications accepted
enroll - number of new students enrolled
top10perc - percent new students from top 10% of high school
top25perc - percent new students from top 25% of high school
f_undergrad - number of full-time undergraduates
p_undergrad - number of part-time undergraduates
outstate - out of state tuition
room_board - room and board costs
books - estimated book costs
personal - estimated personal spending
phd - percent of faculty with PhDs
terminal - percent of faculty with terminal degree
s_f_ratio - student to faculty ratio
perc_alumni - percent alumni who donate
expend - instructional expenditure per student
grad_rate - graduation rate

Importing/Cleaning Data

#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#load the college dataset from data.csv (path is relative to the working directory)
#and preview the first 5 rows
data = pd.read_csv('data.csv')
data.head()

#shape of the loaded frame as (number of rows, number of columns)
data.shape

(777, 18)

#count missing values per column to confirm there is nothing to impute or drop
data.isna().sum()

private        0
apps           0
accept         0
enroll         0
top10perc      0
top25perc      0
f_undergrad    0
p_undergrad    0
outstate       0
room_board     0
books          0
personal       0
phd            0
terminal       0
s_f_ratio      0
perc_alumni    0
expend         0
grad_rate      0
dtype: int64

Let's get started with some Exploratory Data Analysis

#pie chart to see percentage of private vs not private universities in the dataset
count = data['private'].value_counts()
#value_counts() orders the categories by frequency, so build each wedge label from the
#category it actually counts instead of relying on a hard-coded label order
label_names = {'Yes': 'Private', 'No': 'Not Private'}
labels = [label_names.get(category, category) for category in count.index]
plt.figure(figsize=(10,5))
plt.pie(count, labels=labels, autopct="%.1f%%", startangle=90)
plt.title('Percent of Private vs Not Private Universities', fontsize=15)
plt.show()

#applications vs acceptances and acceptances vs enrollments, one stacked panel each
panels = [
    ('Application vs Acceptances', 'apps', 'accept'),
    ('Acceptance vs Enrollment', 'accept', 'enroll'),
]

plt.figure(figsize=(8,8))
for row, (panel_title, x_col, y_col) in enumerate(panels, start=1):
    plt.subplot(len(panels), 1, row)
    plt.title(panel_title, fontsize=15)
    sns.scatterplot(data=data, x=x_col, y=y_col, hue='private')
    plt.legend(title='Private', loc='upper right')

plt.tight_layout(pad=1.0)
plt.show()

#Total full-time undergrad students: public schools seem to have larger numbers, while most
#private schools have 5,000 or fewer
plt.figure(figsize=(9,5))
sns.histplot(data, x='f_undergrad', hue='private')
plt.title('Full-Time Undergraduates Distribution', fontsize=15)
plt.xlabel('Number of Full-Time Undergraduates')
plt.show()

#student/faculty ratio against full-time undergraduate headcount, coloured by institution type
plt.figure(figsize=(9,5))
sns.scatterplot(data=data, x='s_f_ratio', y='f_undergrad', hue='private')
plt.title('Full Time Undergraduates vs Student/Faculty Ratio', fontsize=15)
plt.xlabel('Student/Faculty Ratio')
plt.ylabel('Full Time Undergraduates')
plt.legend(title='Private', loc='upper right')
plt.show()

#distribution of out-of-state tuition, split by institution type
plt.figure(figsize=(9,5))
sns.histplot(data=data, x='outstate', hue='private')
plt.xlabel('Out of State Tuition')
plt.title('Out of State Tuition Distribution', fontsize=15)
plt.show()

#graduation rates recorded above 100 are capped at 100 (in place) before plotting
over_100 = data['grad_rate'] > 100
data.loc[over_100, 'grad_rate'] = 100

#distribution of graduation rate, split by institution type
plt.figure(figsize=(9,5))
sns.histplot(data=data, x='grad_rate', hue='private')
plt.xlabel('Graduation Rate')
plt.title('Graduation Rate By Institution Type', fontsize=15)
plt.show()

#graduation rate against full-time undergraduate headcount, coloured by institution type
plt.figure(figsize=(9,5))
plt.title('Full Time Undergrads vs Graduation Rate',fontsize=15)
sns.scatterplot(x='grad_rate', y='f_undergrad', data=data, hue='private')
plt.xlabel('Graduation Rate %')
plt.ylabel('Full Time Undergraduates')
plt.legend(loc='upper right', title='Private')
plt.show()

KNN

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

#the target y is just the private column; the features X are every other column
y = data['private']
X = data.drop(columns=['private'])

#holding out 30% of the rows for testing and training on the remaining 70% (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

def _plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of y_true vs y_pred as an annotated heatmap."""
    conf_matrix = confusion_matrix(y_true, y_pred)
    # fmt='d' prints the integer counts in full; seaborn's default '.2g'
    # would render a count such as 395 as 4e+02.
    sns.heatmap(conf_matrix, annot=True, fmt='d')
    # confusion_matrix puts true labels on rows and predicted labels on columns.
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


#defining a function to give me the confusion matrix heatmap, accuracy score, and classification report for 
#whatever model you put in 
def results(model, X_train, X_test, y_train, y_test):
    """Print and plot evaluation results for a fitted classifier on both splits.

    For the training split and then the testing split this shows a confusion
    matrix heatmap, prints the accuracy score to 4 decimal places and prints
    the classification report as a DataFrame.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature sets passed to ``model.predict``.
    y_train, y_test : true labels for ``X_train`` and ``X_test``.

    Returns
    -------
    None
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print('TRAINING RESULTS: \n')
    clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))
    print('Confusion Matrix: \n')
    _plot_confusion_matrix(y_train, y_train_pred)

    print(f'Accuracy Score:\n{accuracy_score(y_train, y_train_pred):.4f}')
    print(f'\nClassificaton Report:\n{clf_report}')

    print('\nTESTING RESULTS: \n')
    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
    print('Confusion Matrix: \n')
    _plot_confusion_matrix(y_test, y_test_pred)

    print(f'Accuracy Score:\n{accuracy_score(y_test, y_test_pred):.4f}')
    print(f'\nClassification Report:\n{clf_report}')

#baseline model: k=2 nearest neighbours, fitted on the training split and then evaluated
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
results(knn, X_train, X_test, y_train, y_test)

TRAINING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.9613

Classificaton Report:
                   No         Yes  accuracy   macro avg  weighted avg
precision    0.875740    1.000000  0.961326    0.937870      0.966132
recall       1.000000    0.946835  0.961326    0.973418      0.961326
f1-score     0.933754    0.972692  0.961326    0.953223      0.962079
support    148.000000  395.000000  0.961326  543.000000    543.000000

TESTING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.8974

Classification Report:
                  No         Yes  accuracy   macro avg  weighted avg
precision   0.770270    0.956250  0.897436    0.863260      0.905384
recall      0.890625    0.900000  0.897436    0.895312      0.897436
f1-score    0.826087    0.927273  0.897436    0.876680      0.899598
support    64.000000  170.000000  0.897436  234.000000    234.000000

#spot-check: predicted labels for the first five rows of the test split
print(knn.predict(X_test)[:5])

['No' 'No' 'No' 'Yes' 'Yes']

Getting the Accuracy Scores for different KNN Values

#scoring every k from 2 through 39 on the test split and collecting the accuracy scores in a list
#NOTE(review): k is being chosen on the same test split used for the final report, which can make
#the reported test accuracy optimistic - consider cross-validating on the training split instead
k_values = range(2, 40)

#each candidate model lives only inside the comprehension, so the knn model fitted
#earlier is not overwritten by the last k tried
scores = [
    accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test),
    )
    for k in k_values
]

plt.figure(figsize=(9, 5))
plt.plot(k_values, scores)
plt.title('Accuracy vs KNN')
plt.ylabel("Accuracy")
plt.xlabel("KNN")
plt.show()

Retraining to try different KNN Values

#second model: k=5, fitted on the same training split and evaluated the same way
knn2 = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
results(knn2, X_train, X_test, y_train, y_test)

TRAINING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.9521

Classificaton Report:
                   No         Yes  accuracy   macro avg  weighted avg
precision    0.912162    0.967089  0.952118    0.939625      0.952118
recall       0.912162    0.967089  0.952118    0.939625      0.952118
f1-score     0.912162    0.967089  0.952118    0.939625      0.952118
support    148.000000  395.000000  0.952118  543.000000    543.000000

TESTING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.9316

Classification Report:
                  No         Yes  accuracy   macro avg  weighted avg
precision   0.887097    0.947674  0.931624    0.917386      0.931106
recall      0.859375    0.958824  0.931624    0.909099      0.931624
f1-score    0.873016    0.953216  0.931624    0.913116      0.931281
support    64.000000  170.000000  0.931624  234.000000    234.000000

#spot-check: predicted labels for the first five rows of the test split
print(knn2.predict(X_test)[:5])

['No' 'No' 'No' 'Yes' 'Yes']

#third model: k=7, fitted on the same training split and evaluated the same way
knn3 = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
results(knn3, X_train, X_test, y_train, y_test)

TRAINING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.9429

Classificaton Report:
                   No         Yes  accuracy   macro avg  weighted avg
precision    0.914894    0.952736   0.94291    0.933815      0.942422
recall       0.871622    0.969620   0.94291    0.920621      0.942910
f1-score     0.892734    0.961104   0.94291    0.926919      0.942469
support    148.000000  395.000000   0.94291  543.000000    543.000000

TESTING RESULTS: 

Confusion Matrix:

Accuracy Score:
0.9444

Classification Report:
                  No         Yes  accuracy   macro avg  weighted avg
precision   0.904762    0.959064  0.944444    0.931913      0.944212
recall      0.890625    0.964706  0.944444    0.927665      0.944444
f1-score    0.897638    0.961877  0.944444    0.929757      0.944307
support    64.000000  170.000000  0.944444  234.000000    234.000000

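Here we see our testing accuracy becoming a bit greater than the training accuracy. This is not a sign of overfitting (overfitting shows up as training accuracy well above testing accuracy); with only 234 test rows it is most likely noise from this particular split. Larger k values smooth the model further and risk underfitting, so I probably would not go higher than k=7.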

	private	apps	accept	enroll	top10perc	top25perc	f_undergrad	p_undergrad	outstate	room_board	books	personal	phd	terminal	s_f_ratio	perc_alumni	expend	grad_rate
0	Yes	1660	1232	721	23	52	2885	537	7440	3300	450	2200	70	78	18.1	12	7041	60
1	Yes	2186	1924	512	16	29	2683	1227	12280	6450	750	1500	29	30	12.2	16	10527	56
2	Yes	1428	1097	336	22	50	1036	99	11250	3750	400	1165	53	66	12.9	30	8735	54
3	Yes	417	349	137	60	89	510	63	12960	5450	450	875	92	97	7.7	37	19016	59
4	Yes	193	146	55	16	44	249	869	7560	4120	800	1500	76	72	11.9	2	10922	15