Calculate Roc curve (sensitivity and specificity) with scklear

Asked

Viewed 534 times

0

Hello. How do I calculate the Roc curve in python? I already have part of the code:

Is a dataset of tweets that are classified into insecure and others. I am working with cross validation because it’s 400 tweets, 200 for each class. I’m using the technique k found validation, where k=10.

# coding=UTF-8

#importando os modulos necessarios e criando as funções para preparar os dados  

import re
from unicodedata import normalize

from nltk import FreqDist
from nltk import tokenize
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer
from sklearn import metrics
from sklearn import svm
from sklearn import tree
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def classificarDecisionTree(texto, sentimento):
    print("\nAlgoritmo decision tree")

    #cria um vetor de 1 palavra
    vetor=criarVetor1Palavra()

    #pega a frequência das palavras
    texto_freq=vetor.fit_transform(texto)

    #cria o modelo
    modelo = tree.DecisionTreeClassifier()

    # Criando uma Confusion Matrix
    avaliarModelo(modelo, texto_freq, sentimento)

#na função a seguir eu pego os vetores com as palavras, ou seja, é         passado o vetor de frequência das palavras já que o modelo não consegue trabalhar com strings  

def criarVetor1Palavra():
        #a linha abaixo traz o vetor de 1 palavra
    return CountVectorizer(analyzer="word")

def avaliarModelo(modelo, texto, sentimento):
    resultados = cross_val_predict(modelo, texto, sentimento, cv=10)

    #calcula acurácia
    acuracia=metrics.accuracy_score(sentimento, resultados)    
    print("acurácia (cross validation): {:.2f}".format(acuracia))
    print("matriz de confusão")
    print("INSEGURO - OUTRO")
    print("{}".format(metrics.confusion_matrix(sentimento, resultados.ravel())))


    #para calcular a curva de roc estou tentando fazer o seguinte:  
    #obs a variável sentimento estou considerando que é equivalente a y usado no exemplo da documentação:  

    fpr, tpr, thresholds = metrics.roc_curve(sentimento, resultados, pos_label=2)


    auc=metrics.auc(fpr, tpr)

    plot_ROC(fpr, tpr, auc)
    plt.show()

# transforma os textos da classe de sentimentos em 0 para inseguro e 1 para outros

def classeNumerica(df):
    sentimento_map = {"inseguro": 0, "outro": 1}
    return df.map(sentimento_map)


def main():
    #base de tweets classificados
    df=pd.read_csv("tweets_classificados.csv", encoding='ISO-8859-1',     sep=";", header=0)

    # pega 200 inseguros e 200 outros
    inseguros=df[df['sentimento']=="inseguro"]
    inseguros=inseguros.head(200)
    outros=df[df['sentimento']=="outro"]
    outros=outros.head(200)

    #concatena os dois dataframes
    dfn=inseguros.append(outros, ignore_index=True)


    #transforma a coluna sentimento de string para número
    dfn['sentimento']=classeNumerica(dfn['sentimento'])

    classificarDecisionTree(dfn['texto'], dfn['sentimento'])

#função a seguir retirada da resposta a baixo
def plot_ROC(falsePositiveRate, truePositiveRate, areaUnderCurve):
    fig = plt.figure()
    fig.set_size_inches(15, 5)
    rocCurve = fig.add_subplot(1, 2, 1)

    rocCurve.plot(falsePositiveRate, truePositiveRate, color = 'darkgreen',
    lw = 2, label = 'ROC curve (area = %0.2f)' % areaUnderCurve)
    rocCurve.plot([0, 1], [0, 1], color = 'navy', lw = 1, linestyle = '--')
    rocCurve.grid()
    plt.xlim( [0.0, 1.0] )
    rocCurve.set_xticks( np.arange( -0.1, 1.0, 0.1 ))
    plt.ylim( [0.0, 1.05])
    rocCurve.set_yticks( np.arange( 0, 1.05, 0.1 ) )
    plt.xlabel( 'False Positive Rate' )
    plt.ylabel( 'True Positive Rate' )
    plt.title( 'ROC' )
    rocCurve.legend( loc = "lower right" )
    return plt

main()

1 answer

1

To draw the ROC curve, I first import these Ibraries:

# importe suas outras libraries, tipo:  import numpy as np

# para calcular a ROC
from sklearn import metrics

# para desenhar a ROC
import matplotlib.pyplot as plt

Then I declare that function.

def plot_ROC( falsePositiveRate, truePositiveRate, areaUnderCurve ):
    fig = plt.figure()
    fig.set_size_inches( 15, 5 )
    rocCurve = fig.add_subplot( 1, 2, 1 )

    rocCurve.plot( falsePositiveRate, truePositiveRate, color = 'darkgreen',
             lw = 2, label = 'ROC curve (area = %0.2f)' % areaUnderCurve )
    rocCurve.plot( [0, 1], [0, 1], color = 'navy', lw = 1, linestyle = '--' )
    rocCurve.grid()
    plt.xlim( [0.0, 1.0] )
    rocCurve.set_xticks( np.arange( -0.1, 1.0, 0.1 ) )
    plt.ylim( [0.0, 1.05] )
    rocCurve.set_yticks( np.arange( 0, 1.05, 0.1 ) )
    plt.xlabel( 'False Positive Rate' )
    plt.ylabel( 'True Positive Rate' )
    plt.title( 'ROC' )
    rocCurve.legend( loc = "lower right" )
    return plt

And after declaring, I use the function like this:

# Y_test é o vetor dos valores com os quais você quer testar a previsão
# Y_probas é obtido com Y_probas = dt.predict_proba( X_test ) 
# Y_probas é o array contendo probabilidades da previsão ser 
# positiva no nível das folhas da sua árvore de decisão
fpr, tpr, thresholds = metrics.roc_curve( Y_test.values, Y_probas[:,1] )

plot_ROC( fpr, tpr, auc )
plt.show()

The example is a bit forced, but I hope it will be useful.

  • dt in # Y_probas is obtained with Y_probas = dt.predict_proba( X_test ) would be the variable with the type of the trained model?

  • dt is the decision tree after it trained with the following statement: bdt = Adaboostclassifier( ... various parameters ...) # in my case I used Adaboost which generates a tree with boosting dt = bdt.fit( X_train, Y_train )

  • I edited the question to look better in the exclections. I couldn’t plot using your answer function. Please review.

  • What if you change the method call to: Metrics.roc_curve( feeling, results[:,1] ) ? You can also try Metrics.roc_curve( sentiment.values, results[:,1] )

Browser other questions tagged

You are not signed in. Login or sign up in order to post.