I am following a class where the teacher writes code that evaluates 7 different Machine Learning classification algorithms, and I tried to do it in a more automated way so that the results would land directly in an Excel file.
At a glance the results look fairly similar. However, I noticed that the accuracy_score values in the teacher's code have 4 decimal places, while mine have only 3.
I searched the code for anything that might have been done differently, but I just couldn't find it; the values coming out of my algorithms are simply less precise. I will post the two codes below, and if anyone could help me I would be very grateful!
This is mine:
import pandas as pd
import sklearn as sk
import numpy as np
n_estimators= 30
criterion = 'gini'
n_neighbors = 5
C = 2
max_iter = 1000
tol = 0.0000001
n_splits = 10
## PREPROCESSING THAT WILL GENERATE: previsores (type: array) and classe (type: series)
dados = pd.read_csv(r'C:\Users\andrr\Desktop\Machine Learning\Bases de Dados\base01.csv')
classe = dados.iloc[:,4]
previsores = dados.iloc[:,1:4]
age_mean = previsores['age'].loc[previsores['age'] > 0].mean()
previsores.loc[previsores['age'] < 0, 'age'] = age_mean  # single .loc assignment avoids the chained-indexing pitfall
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(previsores)
previsores = imp.transform(previsores)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
model1 = GaussianNB()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy')
model4 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
model5 = LogisticRegression()
model6 = SVC(C=C)
model7 = MLPClassifier(verbose = True, max_iter=max_iter, tol=tol, solver = 'adam', activation = 'relu',momentum=0.9, learning_rate_init= 0.01)
b = np.zeros(shape=(previsores.shape[0], 1))  # (2000, 1) matrix of zeros (defined but never used below)
lista_de_modelos = [model1, model2, model3, model4, model5, model6, model7]
matriz_final=[]
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
resultados = []
for modelo in lista_de_modelos:
    model = modelo
    resultados_do_modelo = []
    for n in range(0, 30):
        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=n)
        for indice_treinamento, indice_teste in kfold.split(previsores, np.zeros(shape=(classe.shape[0], 1))):
            model.fit(previsores[indice_treinamento], classe[indice_treinamento])
            previsoes = model.predict(previsores[indice_teste])
            precisao = accuracy_score(classe[indice_teste], previsoes)
            print(precisao)
            precisao = str(precisao).replace('.', ',')
            resultados_do_modelo.append(precisao)
    resultados.append(resultados_do_modelo)
m2 = pd.DataFrame(resultados)
m2 = m2.transpose()
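Since the stated goal is to have these results land directly in Excel, here is a minimal sketch of the missing export step; the file name and column labels are my assumptions, and to_excel needs the openpyxl package installed:

m2.columns = ['GaussianNB', 'DecisionTree', 'RandomForest', 'KNN',
              'LogisticRegression', 'SVC', 'MLP']  # assumed labels, in lista_de_modelos order
m2.to_excel('resultados.xlsx', index=False)  # hypothetical file name; requires openpyxl

Note that because each precisao was converted to a comma string, the Excel column will contain text rather than numbers; appending the raw floats and letting Excel's locale handle the decimal separator is an alternative.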
And this is the teacher's (yes, he had to uncomment each of the models, one at a time, in order to check each accuracy):
import pandas as pd
base = pd.read_csv('credit_data.csv')
base.loc[base.age < 0, 'age'] = 40.92
previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values
from sklearn.preprocessing import Imputer  # old scikit-learn API; removed in 0.22 in favor of sklearn.impute.SimpleImputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(previsores[:, 1:4])
previsores[:, 1:4] = imputer.transform(previsores[:, 1:4])
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
resultados30 = []
for i in range(30):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    resultados1 = []
    for indice_treinamento, indice_teste in kfold.split(previsores, np.zeros(shape=(classe.shape[0], 1))):
        #classificador = GaussianNB()
        #classificador = DecisionTreeClassifier()
        #classificador = LogisticRegression()
        #classificador = SVC(kernel = 'rbf', random_state = 1, C = 2.0)
        #classificador = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p = 2)
        #classificador = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
        classificador = MLPClassifier(verbose = True, max_iter = 1000,
                                      tol = 0.000010, solver='adam',
                                      hidden_layer_sizes=(100), activation = 'relu',
                                      batch_size=200, learning_rate_init=0.001)
        classificador.fit(previsores[indice_treinamento], classe[indice_treinamento])
        previsoes = classificador.predict(previsores[indice_teste])
        precisao = accuracy_score(classe[indice_teste], previsoes)
        resultados1.append(precisao)
    resultados1 = np.asarray(resultados1)
    media = resultados1.mean()
    resultados30.append(media)
resultados30 = np.asarray(resultados30)
resultados30.mean()
for i in range(resultados30.size):
    print(str(resultados30[i]).replace('.', ','))
I tried to see whether this difference in precision came from some mistake, but I couldn't find any. If someone could tell me where the mistake is, or what could be changed, I would be very grateful!
It may be that the print formatting is different. If you compared a result in your DataFrame with one from the teacher's output, which is printed in another format, the default number of decimal places shown can indeed differ.
– Jorge Mendes
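Building on the comment above, one possible source of the difference (sketched below, not taken from either script) is what each loop records: the question's code appends each individual fold score, while the teacher's code appends the mean of the 10 fold scores for each seed. Assuming the 2000-row base mentioned in the code comment, each test fold has about 200 rows, so a single fold score is a multiple of 1/200 (at most 3 decimal places), while a mean over 10 equal folds is a multiple of 1/2000 (up to 4):

# Illustrative only; the hit counts are made up.
fold_score = 185 / 200      # one fold of a 2000-row base -> 0.925 (3 decimals)
mean_score = 1847 / 2000    # mean over 10 such folds     -> 0.9235 (4 decimals)
print(str(fold_score).replace('.', ','))   # prints 0,925
print(str(mean_score).replace('.', ','))   # prints 0,9235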