Import packages
In [ ]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
Task 1 - Import train and test set of success data
In [ ]:
train_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/train_data_success.csv")
test_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/test_data_success.csv")
In [ ]:
train_data.head()
Out[ ]:
|   | Unnamed: 0 | age  | interest  | success | age_groups |
|---|------------|------|-----------|---------|------------|
| 0 | 224        | 35.0 | 69.922636 | 1       | 3          |
| 1 | 38         | 13.0 | 18.069521 | 0       | 1          |
| 2 | 200        | 9.0  | 18.603253 | 0       | 1          |
| 3 | 122        | 34.0 | 30.049936 | 0       | 2          |
| 4 | 73         | 23.0 | 41.132264 | 0       | 2          |
In [ ]:
test_data.head()
Out[ ]:
|   | Unnamed: 0 | age  | interest  | success | age_groups |
|---|------------|------|-----------|---------|------------|
| 0 | 54         | 26.0 | 46.679500 | 1       | 2          |
| 1 | 153        | 30.0 | 30.941048 | 0       | 2          |
| 2 | 288        | 16.0 | 24.010528 | 0       | 1          |
| 3 | 165        | 24.0 | 65.720663 | 1       | 2          |
| 4 | 3          | 7.0  | 25.704665 | 1       | 1          |
Divide the columns into features and labels.
In [ ]:
# columns 1 and 2 are 'age' and 'interest'
success_features_train = train_data.iloc[:, [1, 2]]
success_label_train = train_data['success']
success_features_test = test_data.iloc[:, [1, 2]]
success_label_test = test_data['success']
Task 2 - KNN classifier
Import the packages.
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
Set the parameters and fit the classifier.
In [ ]:
neigh = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
# fit the model on the training features and labels
neigh.fit(success_features_train, success_label_train)
Out[ ]:
KNeighborsClassifier(metric='euclidean')
Make predictions for the test set.
In [ ]:
# make prediction (the variable name knn_pred is our choice)
knn_pred = neigh.predict(success_features_test)
In [ ]:
# print the first ten predictions (the output shows exactly ten values)
knn_pred[:10]
Out[ ]:
array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])
Show the confusion matrix.
In [ ]:
# confusion matrix: rows are true labels, columns are predicted labels
print(metrics.confusion_matrix(success_label_test, knn_pred))
[[33  7]
 [ 3 47]]
Calculate the accuracy measures.
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.8703703703703703 Recall: 0.94 Accuracy: 0.8888888888888888
ROC curve and AUC score
In [ ]:
#make the roc curve
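One possible completion, as a sketch: `knn_prob` is the name the Task 5 summary cell uses, and `fpr`/`tpr` feed the plot in the next cell; `metrics.roc_curve` also returns the decision thresholds.
knn_prob = neigh.predict_proba(success_features_test)  # per-class probabilities
fpr, tpr, thresholds = metrics.roc_curve(success_label_test, knn_prob[:, 1])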
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr, linewidth=2.0)
# plot the random classifier line
x = np.linspace(0, 1, 100)
y = x
plt.plot(x, y, label="random classifier")
plt.legend()  # without this the label above is never shown
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, knn_prob[:, 1])
Out[ ]:
0.92475
Task 3 - Decision tree
Import the package
In [ ]:
from sklearn import tree
Create the classifier
In [ ]:
# create the classifier; max_depth=3 is inferred from the tree plotted below, other settings assumed default
tree_clf = tree.DecisionTreeClassifier(max_depth=3)
Train it on the training data
In [ ]:
# fit the model
tree_clf.fit(success_features_train, success_label_train)
Confusion matrix and accuracy measures
In [ ]:
# confusion matrix
[[32  8]
 [ 2 48]]
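A sketch of the code this cell needs; `tree_pred` is a name chosen here (the original does not show it), and the following cells use it too.
tree_pred = tree_clf.predict(success_features_test)  # test-set predictions
print(metrics.confusion_matrix(success_label_test, tree_pred))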
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.8571428571428571 Recall: 0.96 Accuracy: 0.8888888888888888
ROC curve and AUC score
In [ ]:
#make the roc curve
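As with kNN, a sketch: `tree_prob` is the name used in Task 5, and `fpr_tree`/`tpr_tree` are the names the plot below expects.
tree_prob = tree_clf.predict_proba(success_features_test)
fpr_tree, tpr_tree, thresholds_tree = metrics.roc_curve(success_label_test, tree_prob[:, 1])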
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr_tree, tpr_tree, linewidth=2.0)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, tree_prob[:, 1])
Out[ ]:
0.94775
Plot the tree
In [ ]:
tree.plot_tree(tree_clf)  # tree_clf is the name of the model we made; change if necessary
Out[ ]:
[Text(0.4583333333333333, 0.875, 'X[1] <= 44.062\ngini = 0.489\nsamples = 207\nvalue = [88, 119]'),
 Text(0.25, 0.625, 'X[0] <= 34.5\ngini = 0.271\nsamples = 99\nvalue = [83, 16]'),
 Text(0.16666666666666666, 0.375, 'X[1] <= 40.361\ngini = 0.107\nsamples = 88\nvalue = [83, 5]'),
 Text(0.08333333333333333, 0.125, 'gini = 0.049\nsamples = 80\nvalue = [78, 2]'),
 Text(0.25, 0.125, 'gini = 0.469\nsamples = 8\nvalue = [5, 3]'),
 Text(0.3333333333333333, 0.375, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]'),
 Text(0.6666666666666666, 0.625, 'X[1] <= 48.217\ngini = 0.088\nsamples = 108\nvalue = [5, 103]'),
 Text(0.5, 0.375, 'X[0] <= 34.5\ngini = 0.426\nsamples = 13\nvalue = [4, 9]'),
 Text(0.4166666666666667, 0.125, 'gini = 0.494\nsamples = 9\nvalue = [4, 5]'),
 Text(0.5833333333333334, 0.125, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'),
 Text(0.8333333333333334, 0.375, 'X[1] <= 56.947\ngini = 0.021\nsamples = 95\nvalue = [1, 94]'),
 Text(0.75, 0.125, 'gini = 0.124\nsamples = 15\nvalue = [1, 14]'),
 Text(0.9166666666666666, 0.125, 'gini = 0.0\nsamples = 80\nvalue = [0, 80]')]
[rendered tree plot]
In [ ]:
import graphviz
dot_data = tree.export_graphviz(tree_clf)
graph = graphviz.Source(dot_data)
graph.render("success")
# class_names must be in ascending label order: class 0 = unsuccessful, class 1 = successful
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names=["age", "interest"],
                                class_names=["unsuccessful", "successful"],
                                filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[ ]:
[rendered graphviz decision tree]
Task 4 - SVM
In [ ]:
from sklearn import svm
Visualize the data.
In [ ]:
plt.scatter(success_features_train.iloc[:, 0], success_features_train.iloc[:, 1], c=success_label_train, cmap='winter')
Out[ ]:
<matplotlib.collections.PathCollection at 0x7fb7aac30fd0>
In [ ]:
plt.scatter(success_features_test.iloc[:, 0], success_features_test.iloc[:, 1], c=success_label_test, cmap='winter')
Out[ ]:
<matplotlib.collections.PathCollection at 0x7fb7a9a9ee50>
Train the model, make predictions, show the confusion matrix, and calculate the accuracy measures.
In [ ]:
# model, fit, prediction and confusion matrix
[[32  8]
 [ 8 42]]
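A sketch of the whole cell. The names `svm_clf` and `svm_pred` are chosen here; Task 5 calls `predict_proba`, which on an `SVC` only works with `probability=True`. The kernel and other hyperparameters are assumptions (defaults shown).
svm_clf = svm.SVC(probability=True)  # probability=True so predict_proba works in Task 5
svm_clf.fit(success_features_train, success_label_train)
svm_pred = svm_clf.predict(success_features_test)
print(metrics.confusion_matrix(success_label_test, svm_pred))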
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.84 Recall: 0.84 Accuracy: 0.8222222222222222
ROC curve and AUC score
In [ ]:
#make the roc curve
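Again a sketch: `svm_prob` is the name Task 5 uses, and `fpr_svm`/`tpr_svm` are the names in the plot below.
svm_prob = svm_clf.predict_proba(success_features_test)
fpr_svm, tpr_svm, thresholds_svm = metrics.roc_curve(success_label_test, svm_prob[:, 1])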
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr_svm, tpr_svm, linewidth=2.0)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, svm_prob[:, 1])
Out[ ]:
0.9259999999999999
Task 5 - Summary
In [ ]:
print("AUC kNN:", metrics.roc_auc_score(success_label_test,knn_prob[:,1], average='macro', sample_weight=None))
print("AUC decision tree:", metrics.roc_auc_score(success_label_test,tree_prob[:,1], average='macro', sample_weight=None))
print("AUC SVM:", metrics.roc_auc_score(success_label_test,svm_prob[:,1], average='macro', sample_weight=None))
AUC kNN: 0.92475 AUC decision tree: 0.94775 AUC SVM: 0.9259999999999999
Task 6 - If you still have some time - Multiclass classification
In [ ]:
mobile_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/train_mobile.csv")
In [ ]:
#Data head and number of NaN-s
2000
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64
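A sketch consistent with the output above: 2000 is the row count, followed by the per-column NaN counts.
print(len(mobile_data))          # number of rows
print(mobile_data.isna().sum())  # NaN count per column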
In [ ]:
# columns
mobile_data.columns
Out[ ]:
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], dtype='object')
In [ ]:
# count the number of price ranges
print(mobile_data['price_range'].value_counts())
1    500
2    500
3    500
0    500
Name: price_range, dtype: int64
In [ ]:
#create feature and label space
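A sketch; the names `mobile_features` and `mobile_label` are chosen here.
mobile_features = mobile_data.drop('price_range', axis=1)  # every column except the label
mobile_label = mobile_data['price_range']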
In [ ]:
#split train and test data (use sklearn.model_selection)
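A sketch. `test_size=0.2` follows from the confusion matrix below, which sums to 400 rows (one fifth of the 2000-row set); the variable names and the random seed are assumptions.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mobile_features, mobile_label, test_size=0.2, random_state=42)  # seed assumed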
In [ ]:
#predict with decision tree classifier (also create the model)
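A sketch; `mobile_tree` and `mobile_pred` are names chosen here, with default hyperparameters assumed.
mobile_tree = tree.DecisionTreeClassifier()
mobile_tree.fit(X_train, y_train)
mobile_pred = mobile_tree.predict(X_test)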
In [ ]:
# confusion matrix
cm = metrics.confusion_matrix(y_test, mobile_pred)
print(cm)
[[87  7  0  0]
 [ 7 96  2  0]
 [ 0 21 62 13]
 [ 0  0 11 94]]
In [ ]:
# accuracy score (use sklearn's built-in accuracy function)
print("Accuracy:", metrics.accuracy_score(y_test, mobile_pred))
Accuracy: 0.8475
In [ ]:
#make and test the trees for different depths
1 depth tree: 0.4975
2 depth tree: 0.785
3 depth tree: 0.79
4 depth tree: 0.8275
5 depth tree: 0.8225
6 depth tree: 0.835
7 depth tree: 0.84
8 depth tree: 0.8425
9 depth tree: 0.84
10 depth tree: 0.825
11 depth tree: 0.815
12 depth tree: 0.8275
13 depth tree: 0.835
14 depth tree: 0.825
15 depth tree: 0.815
16 depth tree: 0.8325
17 depth tree: 0.815
18 depth tree: 0.8175
19 depth tree: 0.8225
20 depth tree: 0.83
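A loop shaped like the output above, as a sketch (using the assumed split from earlier):
for depth in range(1, 21):
    depth_tree = tree.DecisionTreeClassifier(max_depth=depth)
    depth_tree.fit(X_train, y_train)
    print(depth, "depth tree:", metrics.accuracy_score(y_test, depth_tree.predict(X_test)))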
In [ ]: