Import packages
In [ ]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
Task 1 - Import train and test set of success data
In [ ]:
train_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/train_data_success.csv")
test_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/test_data_success.csv")
In [ ]:
train_data.head()
Out[ ]:
|   | Unnamed: 0 | age  | interest  | success | age_groups |
|---|------------|------|-----------|---------|------------|
| 0 | 224        | 35.0 | 69.922636 | 1       | 3          |
| 1 | 38         | 13.0 | 18.069521 | 0       | 1          |
| 2 | 200        | 9.0  | 18.603253 | 0       | 1          |
| 3 | 122        | 34.0 | 30.049936 | 0       | 2          |
| 4 | 73         | 23.0 | 41.132264 | 0       | 2          |
In [ ]:
test_data.head()
Out[ ]:
|   | Unnamed: 0 | age  | interest  | success | age_groups |
|---|------------|------|-----------|---------|------------|
| 0 | 54         | 26.0 | 46.679500 | 1       | 2          |
| 1 | 153        | 30.0 | 30.941048 | 0       | 2          |
| 2 | 288        | 16.0 | 24.010528 | 0       | 1          |
| 3 | 165        | 24.0 | 65.720663 | 1       | 2          |
| 4 | 3          | 7.0  | 25.704665 | 1       | 1          |
Divide the columns into features and labels.
In [ ]:
# columns 1 and 2 are 'age' and 'interest'
success_features_train = train_data.iloc[:, [1, 2]]
success_label_train = train_data['success']
success_features_test = test_data.iloc[:, [1, 2]]
success_label_test = test_data['success']
Task 2 - KNN classifier
Import the packages.
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
Set the parameters and fit the classifier.
In [ ]:
neigh = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
# fit the model on the training features and labels
neigh.fit(success_features_train, success_label_train)
Out[ ]:
KNeighborsClassifier(metric='euclidean')
Make predictions for the test set.
In [ ]:
# make prediction (the variable name knn_pred is our choice)
knn_pred = neigh.predict(success_features_test)
In [ ]:
# print the first ten predictions (the output shows exactly ten values)
knn_pred[:10]
Out[ ]:
array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])
Show the confusion matrix.
In [ ]:
# confusion matrix: rows are true labels, columns are predicted labels
print(metrics.confusion_matrix(success_label_test, knn_pred))
[[33  7]
 [ 3 47]]
Calculate the accuracy measures.
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.8703703703703703 Recall: 0.94 Accuracy: 0.8888888888888888
ROC curve and AUC score
In [ ]:
#make the roc curve
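One possible completion, as a sketch: `knn_prob` is the name the Task 5 summary cell uses, and `fpr`/`tpr` feed the plot in the next cell; `metrics.roc_curve` also returns the decision thresholds.
knn_prob = neigh.predict_proba(success_features_test)  # per-class probabilities
fpr, tpr, thresholds = metrics.roc_curve(success_label_test, knn_prob[:, 1])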
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr, tpr, linewidth=2.0)
# plot the random classifier line
x = np.linspace(0, 1, 100)
y = x
plt.plot(x, y, label="random classifier")
plt.legend()  # without this the label above is never shown
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, knn_prob[:, 1])
Out[ ]:
0.92475
Task 3 - Decision tree
Import the package
In [ ]:
from sklearn import tree
Create the classifier
In [ ]:
# create the classifier; max_depth=3 is inferred from the tree plotted below, other settings assumed default
tree_clf = tree.DecisionTreeClassifier(max_depth=3)
Train it on the training data
In [ ]:
# fit the model
tree_clf.fit(success_features_train, success_label_train)
Confusion matrix and accuracy measures
In [ ]:
# confusion matrix
[[32  8]
 [ 2 48]]
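A sketch of the code this cell needs; `tree_pred` is a name chosen here (the original does not show it), and the following cells use it too.
tree_pred = tree_clf.predict(success_features_test)  # test-set predictions
print(metrics.confusion_matrix(success_label_test, tree_pred))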
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.8571428571428571 Recall: 0.96 Accuracy: 0.8888888888888888
ROC curve and AUC score
In [ ]:
#make the roc curve
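As with kNN, a sketch: `tree_prob` is the name used in Task 5, and `fpr_tree`/`tpr_tree` are the names the plot below expects.
tree_prob = tree_clf.predict_proba(success_features_test)
fpr_tree, tpr_tree, thresholds_tree = metrics.roc_curve(success_label_test, tree_prob[:, 1])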
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr_tree, tpr_tree, linewidth=2.0)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, tree_prob[:, 1])
Out[ ]:
0.94775
Plot the tree
In [ ]:
tree.plot_tree(tree_clf)  # tree_clf is the name of the model we made; change if necessary
Out[ ]:
[Text(0.4583333333333333, 0.875, 'X[1] <= 44.062\ngini = 0.489\nsamples = 207\nvalue = [88, 119]'),
 Text(0.25, 0.625, 'X[0] <= 34.5\ngini = 0.271\nsamples = 99\nvalue = [83, 16]'),
 Text(0.16666666666666666, 0.375, 'X[1] <= 40.361\ngini = 0.107\nsamples = 88\nvalue = [83, 5]'),
 Text(0.08333333333333333, 0.125, 'gini = 0.049\nsamples = 80\nvalue = [78, 2]'),
 Text(0.25, 0.125, 'gini = 0.469\nsamples = 8\nvalue = [5, 3]'),
 Text(0.3333333333333333, 0.375, 'gini = 0.0\nsamples = 11\nvalue = [0, 11]'),
 Text(0.6666666666666666, 0.625, 'X[1] <= 48.217\ngini = 0.088\nsamples = 108\nvalue = [5, 103]'),
 Text(0.5, 0.375, 'X[0] <= 34.5\ngini = 0.426\nsamples = 13\nvalue = [4, 9]'),
 Text(0.4166666666666667, 0.125, 'gini = 0.494\nsamples = 9\nvalue = [4, 5]'),
 Text(0.5833333333333334, 0.125, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]'),
 Text(0.8333333333333334, 0.375, 'X[1] <= 56.947\ngini = 0.021\nsamples = 95\nvalue = [1, 94]'),
 Text(0.75, 0.125, 'gini = 0.124\nsamples = 15\nvalue = [1, 14]'),
 Text(0.9166666666666666, 0.125, 'gini = 0.0\nsamples = 80\nvalue = [0, 80]')]
[rendered tree plot]
In [ ]:
import graphviz
dot_data = tree.export_graphviz(tree_clf)
graph = graphviz.Source(dot_data)
graph.render("success")
# class_names must be in ascending label order: class 0 = unsuccessful, class 1 = successful
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names=["age", "interest"],
                                class_names=["unsuccessful", "successful"],
                                filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[ ]:
[rendered graphviz decision tree]
Task 4 - SVM
In [ ]:
from sklearn import svm
Visualize the data.
In [ ]:
plt.scatter(success_features_train.iloc[:, 0], success_features_train.iloc[:, 1], c=success_label_train, cmap='winter')
Out[ ]:
<matplotlib.collections.PathCollection at 0x7fb7aac30fd0>
In [ ]:
plt.scatter(success_features_test.iloc[:, 0], success_features_test.iloc[:, 1], c=success_label_test, cmap='winter')
Out[ ]:
<matplotlib.collections.PathCollection at 0x7fb7a9a9ee50>
Train the model, make predictions, show the confusion matrix, and calculate the accuracy measures.
In [ ]:
# model, fit, prediction and confusion matrix
[[32  8]
 [ 8 42]]
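A sketch of the whole cell. The names `svm_clf` and `svm_pred` are chosen here; Task 5 calls `predict_proba`, which on an `SVC` only works with `probability=True`. The kernel and other hyperparameters are assumptions (defaults shown).
svm_clf = svm.SVC(probability=True)  # probability=True so predict_proba works in Task 5
svm_clf.fit(success_features_train, success_label_train)
svm_pred = svm_clf.predict(success_features_test)
print(metrics.confusion_matrix(success_label_test, svm_pred))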
In [ ]:
print("Precision: ")
print("Recall: ")
print("Accuracy: " )
Precision: 0.84 Recall: 0.84 Accuracy: 0.8222222222222222
ROC curve and AUC score
In [ ]:
#make the roc curve
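Again a sketch: `svm_prob` is the name Task 5 uses, and `fpr_svm`/`tpr_svm` are the names in the plot below.
svm_prob = svm_clf.predict_proba(success_features_test)
fpr_svm, tpr_svm, thresholds_svm = metrics.roc_curve(success_label_test, svm_prob[:, 1])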
In [ ]:
plt.figure(figsize=(5, 5))
plt.plot(fpr_svm, tpr_svm, linewidth=2.0)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
Out[ ]:
(0.0, 1.0)
In [ ]:
# auc score
metrics.roc_auc_score(success_label_test, svm_prob[:, 1])
Out[ ]:
0.9259999999999999
Task 5 - Summary
In [ ]:
print("AUC kNN:", metrics.roc_auc_score(success_label_test,knn_prob[:,1], average='macro', sample_weight=None))
print("AUC decision tree:", metrics.roc_auc_score(success_label_test,tree_prob[:,1], average='macro', sample_weight=None))
print("AUC SVM:", metrics.roc_auc_score(success_label_test,svm_prob[:,1], average='macro', sample_weight=None))
AUC kNN: 0.92475 AUC decision tree: 0.94775 AUC SVM: 0.9259999999999999
Task 6 - If you still have some time - Multiclass classification
In [ ]:
mobile_data = pd.read_csv("https://sxbin.gay/u/Joryn/PythonAi%20-%201/train_mobile.csv")
In [ ]:
#Data head and number of NaN-s
2000
battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64
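A sketch consistent with the output above: 2000 is the row count, followed by the per-column NaN counts.
print(len(mobile_data))          # number of rows
print(mobile_data.isna().sum())  # NaN count per column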
In [ ]:
# columns
mobile_data.columns
Out[ ]:
Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range'], dtype='object')
In [ ]:
# count the number of price ranges
print(mobile_data['price_range'].value_counts())
1    500
2    500
3    500
0    500
Name: price_range, dtype: int64
In [ ]:
#create feature and label space
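A sketch; the names `mobile_features` and `mobile_label` are chosen here.
mobile_features = mobile_data.drop('price_range', axis=1)  # every column except the label
mobile_label = mobile_data['price_range']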
In [ ]:
#split train and test data (use sklearn.model_selection)
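A sketch. `test_size=0.2` follows from the confusion matrix below, which sums to 400 rows (one fifth of the 2000-row set); the variable names and the random seed are assumptions.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mobile_features, mobile_label, test_size=0.2, random_state=42)  # seed assumed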
In [ ]:
#predict with decision tree classifier (also create the model)
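A sketch; `mobile_tree` and `mobile_pred` are names chosen here, with default hyperparameters assumed.
mobile_tree = tree.DecisionTreeClassifier()
mobile_tree.fit(X_train, y_train)
mobile_pred = mobile_tree.predict(X_test)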
In [ ]:
# confusion matrix
cm = metrics.confusion_matrix(y_test, mobile_pred)
print(cm)
[[87  7  0  0]
 [ 7 96  2  0]
 [ 0 21 62 13]
 [ 0  0 11 94]]
In [ ]:
# accuracy score (use sklearn's built-in accuracy function)
print("Accuracy:", metrics.accuracy_score(y_test, mobile_pred))
Accuracy: 0.8475
In [ ]:
#make and test the trees for different depths
1 depth tree: 0.4975
2 depth tree: 0.785
3 depth tree: 0.79
4 depth tree: 0.8275
5 depth tree: 0.8225
6 depth tree: 0.835
7 depth tree: 0.84
8 depth tree: 0.8425
9 depth tree: 0.84
10 depth tree: 0.825
11 depth tree: 0.815
12 depth tree: 0.8275
13 depth tree: 0.835
14 depth tree: 0.825
15 depth tree: 0.815
16 depth tree: 0.8325
17 depth tree: 0.815
18 depth tree: 0.8175
19 depth tree: 0.8225
20 depth tree: 0.83
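A loop shaped like the output above, as a sketch (using the assumed split from earlier):
for depth in range(1, 21):
    depth_tree = tree.DecisionTreeClassifier(max_depth=depth)
    depth_tree.fit(X_train, y_train)
    print(depth, "depth tree:", metrics.accuracy_score(y_test, depth_tree.predict(X_test)))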
In [ ]: