0
I’m trying to work with predictions, Machine Learning, but I’ve been having problems with prediction from the training saved, according to the following error:
predict(file_name)
File "C:/Python/categoriesTest/CategoriaTestFunctions.py", line 129, in predict
result = loaded_model.predict(X.reshape(1, -1))
File "C:\Program Files (x86)\Python37-32\lib\site-packages\sklearn\linear_model\base.py", line 324, in predict
scores = self.decision_function(X)
File "C:\Program Files (x86)\Python37-32\lib\site-packages\sklearn\linear_model\base.py", line 305, in decision_function
% (X.shape[1], n_features))
ValueError: X has 2 features per sample; expecting 3147
Code I use:
# -*- coding: utf8 -*-
from time import time
import os.path
import pandas as pd
import psycopg2
import numpy as np
import re
import string
import sklearn
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors
import pickle
from sklearn.externals import joblib
conn = psycopg2.connect(host="10....", port="5432", database="app", user="app", password="app")
def preprocessor(df):
df.columns = ['texto', 'cat']
# print(df) #imprimindo o dataframe
# Pegando os textos puros da coluna texto, para normalização.
# textos puros terão de ter um índice
df['texto'] = df['texto'].astype(str).str.replace('-', '')
df['texto'] = df['texto'].astype(str).str.replace('/', '')
df['texto'] = df['texto'].astype(str).str.replace('+', '')
df['texto'] = df['texto'].astype(str).str.replace('ões', '')
df['texto'] = df['texto'].astype(str).str.replace(';', '')
df['texto'] = df['texto'].astype(str).str.replace('#', '')
df['texto'] = df['texto'].astype(str).str.replace('~', '')
df['texto'] = df['texto'].astype(str).str.replace(':', '')
df['texto'] = df['texto'].astype(str).str.lower().str.split()
stop = stopwords.words("portuguese")
textosPuros = df['texto'].apply(lambda x: [w for w in x if not w in stop])
return textosPuros
def preprocessorNew(textNew):
# print(df) #imprimindo o dataframe
words = textNew.lower().split()
stop = stopwords.words("portuguese")
# words = ' '.join([w for w in words if not w in stop])
return words
def dictionary(texto):
dicionario = set()
for lista in texto:
dicionario.update(lista)
return dicionario
def defineDictionaryPosition(dicionario):
totalDePalavras = len(dicionario)
tuplas = zip(dicionario, np.arange(totalDePalavras))
tradutor = {palavra: indice for palavra, indice in tuplas}
print("Total de palavras: ")
print(totalDePalavras)
return tradutor
def vectorize_text(text, tradutor):
vector = [0] * len(tradutor)
for palavra in text:
if palavra in tradutor:
position = tradutor[palavra]
vector[position] += 1
return vector
def train(texts, tradutor, marcas):
print("Training.....")
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='portuguese')
vetoresDeTexto = [vectorize_text(texto, tradutor) for texto in texts]
# Define o conjunto de dados X
X = np.array(vetoresDeTexto)
# Define o conjunto de dados Y (labels)
Y = np.array(marcas.tolist())
# Define porcentagem do treino
porcentagem_de_treino = 0.8
# Separa o tamanho do treino a partir da porcentagem
tamanho_do_treino = int(porcentagem_de_treino * len(Y))
# O restante fica para a validacao
tamanho_de_validacao = (len(Y) - tamanho_do_treino)
# Separa os dados de treino
treino_dados = X[0:tamanho_do_treino]
# Separa as marcacoes de treino
treino_marcacoes = Y[0:tamanho_do_treino]
# Separa os dados de validacao
validacao_dados = X[tamanho_do_treino:]
# Separa as marcacoes de validacao
validacao_marcacoes = Y[tamanho_do_treino:]
print("Validacao Marcacoes: ")
print(validacao_marcacoes)
clf = LogisticRegression() # MultinomialNB() obtive 62% de acerto#GaussianNB()
clf.fit(treino_dados, treino_marcacoes)
# accuracy
accuracy = clf.score(validacao_dados, validacao_marcacoes)
file_name = 'train_data.pkl'
pickle._dump(clf, open(file_name, 'wb'))
# fit_file = joblib.dump(clf, file_name)
print("Indice de acerto do algoritmo: ")
print("%.2f " % round(accuracy * 100) + "%\n")
print("End of train...")
predict(file_name)
# To get a fit_file
# return fit_file
# just a test
# HOW??
def predict(fit):
print("\nPredict......")
# new text to predict
new = preprocessorNew('new text')
X = np.array(new)
# new_text = preprocessorNew(new)
# To have the fit file \/
loaded_model = pickle.load(open(fit, 'rb'))
# how to predict this new data??
result = loaded_model.predict(X.reshape(1, -1))
print(result)
df1 = pd.read_sql(
"SELECT dsobservacaoclinica1, cdcatcategoria AS cat FROM iaconsultas limit 500",
conn)
df1.columns = ['texto', 'cat']
marca = df1['cat']
textosPuros = preprocessor(df1)
print("Texts...")
print(textosPuros)
dict = dictionary(textosPuros)
print("Dict....")
print(dict)
translate = defineDictionaryPosition(dict)
# to train
train(textosPuros, translate, marca)
# TEST
# predict(textosPuros, translate, marca)
Explaining, I get the return of the query made with two columns. Each text is linked to a category, after training, I try to insert a new data so that this is predicted, ie, know which category fits. But I have this error as return, how to solve?
Have to add some sample of the data ?
– Davi Mello
@Davimello did not understand your question. An attachment with, CSV for example?
– Henrique