Question about value extraction using Soup.findAll() in Python

Asked

Viewed 101 times

1

Good afternoon to all.

I am studying Python and learning to extract data from websites. To start, I am creating a program that extracts the data from the Lotofácil page of the Caixa lottery site and returns the numbers drawn in a given contest. From the examples I saw on the Internet, it is possible to search by tag and by `id` or `class`; however, I noticed that on this page the box that holds all the results per contest does not have one, i.e., it only has `<tr>` and `<td>` elements within a `<table>`.

I have already been able to extract the data of the entire row for a given contest; however, I am not able to process that row and keep only the contest number and the numbers drawn. My code is showing all the values in the row.

Could someone point me in the right direction?

import requests
from bs4 import BeautifulSoup

# Download the Lotofácil results history page and parse it as HTML.
req = requests.get("http://loterias.caixa.gov.br/wps/portal/loterias/landing/lotofacil/!ut/p/a1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOLNDH0MPAzcDbz8vTxNDRy9_Y2NQ13CDA0sTIEKIoEKnN0dPUzMfQwMDEwsjAw8XZw8XMwtfQ0MPM2I02-AAzgaENIfrh-FqsQ9wBmoxN_FydLAGAgNTKEK8DkRrACPGwpyQyMMMj0VAcySpRM!/dl5/d5/L2dBISEvZ0FBIS9nQSEh/pw/Z7_HGK818G0K85260Q5OIRSC42046/res/id=historicoHTML/c=cacheLevelPage/=/")
soup = BeautifulSoup(req.content, "html.parser")

# Every <tr> element found in the document.
todas_linhas = soup.find_all("tr")

# For each row, look for a 'td' cell whose text is exactly '2208'.
# find() returns None when the row has no such cell, so only rows that
# contain the cell are printed (with the full text of the row).
for tr in todas_linhas:
    temp = tr.find('td', string='2208')
    if temp is not None:
        print(tr.text)

I hope you have understood my question, and thank you for your attention.

2 answers

0


Man, that took some work, huh? Haha.

Your code is on the right track. The problem is the page itself: some rows have an extra table nested inside them (for example, for the city column), and some junk is left over when the HTML tags are converted to text.

The solution below turns the result into a DataFrame, which is ideal for analysis, filtering and plotting if you want.

Further below there is also a snippet that displays the text line by line; notice that the counter resets for each row and that junk is carried over from one line to the next.

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Lotofácil results history page.
url = "http://loterias.caixa.gov.br/wps/portal/loterias/landing/lotofacil/!ut/p/a1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOLNDH0MPAzcDbz8vTxNDRy9_Y2NQ13CDA0sTIEKIoEKnN0dPUzMfQwMDEwsjAw8XZw8XMwtfQ0MPM2I02-AAzgaENIfrh-FqsQ9wBmoxN_FydLAGAgNTKEK8DkRrACPGwpyQyMMMj0VAcySpRM!/dl5/d5/L2dBISEvZ0FBIS9nQSEh/pw/Z7_HGK818G0K85260Q5OIRSC42046/res/id=historicoHTML/c=cacheLevelPage/=/"
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")

# Every <tr> element found in the document.
todas_linhas = soup.find_all("tr")

# Build one 17-field list per valid row.
lista = []
for linha in todas_linhas:
    # Turn the row text into a list, one piece per line of text.
    partes = linha.text.split('\n')

    # Pieces 1..17 are the fields that are kept; piece 0 is always skipped
    # (presumably the empty text before the first cell -- verify on the page).
    campos = partes[1:18]

    # A row is valid only when all 17 fields exist and the first one is not
    # empty; anything past piece 17 is ignored.
    if len(campos) == 17 and campos[0] != "":
        lista.append(campos)

# Without at least one valid row there is no header to promote below.
if not lista:
    raise SystemExit("Nenhuma linha válida encontrada na página.")

# Turn the list into a dataframe
df = pd.DataFrame(lista)

# Take the first row
headers = df.iloc[0]

# Promote that row to column labels. A flat list yields a plain Index;
# wrapping the Series in a list would create a one-level MultiIndex instead.
df.columns = headers.tolist()

# Drop the first row because it became the header
df = df.iloc[1:]

# Print the first 5 records
print(df.head(5).to_string())

# To print the whole dataframe instead, use: print(df.to_string())

# Print the number of rows and columns
print()
print('Linhas e Colunas Total:')
print(df.shape)

enter image description here

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Lotofácil results history page.
url = "http://loterias.caixa.gov.br/wps/portal/loterias/landing/lotofacil/!ut/p/a1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOLNDH0MPAzcDbz8vTxNDRy9_Y2NQ13CDA0sTIEKIoEKnN0dPUzMfQwMDEwsjAw8XZw8XMwtfQ0MPM2I02-AAzgaENIfrh-FqsQ9wBmoxN_FydLAGAgNTKEK8DkRrACPGwpyQyMMMj0VAcySpRM!/dl5/d5/L2dBISEvZ0FBIS9nQSEh/pw/Z7_HGK818G0K85260Q5OIRSC42046/res/id=historicoHTML/c=cacheLevelPage/=/"
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")

todas_linhas = soup.findAll("tr")

# Debug dump: for every row, print each line of its text prefixed with the
# line's position inside that row (numbering restarts at 0 for each row).
for linha in todas_linhas:
    for posicao, pedaco in enumerate(linha.text.split('\n')):
        print(f"{posicao} - {pedaco}")

enter image description here

enter image description here

Search the contest

# Ask the user which contest to look up.
print()
print('Digite o Código do Concurso: ')
codigoConcurso = input()

# The row is selected by position: contest N is taken from row N-1 of df.
# NOTE(review): this assumes df holds one row per contest, in order, with no
# contest filtered out -- confirm against the scraped data.
posicao = int(codigoConcurso) - 1

# A zero or negative contest number would make iloc count from the end of the
# dataframe and silently return the wrong contest.
if posicao < 0:
    raise SystemExit('Código do concurso inválido: ' + codigoConcurso)

# Double brackets keep the result as a one-row dataframe.
df_desejado = df.iloc[[posicao]]

print(df_desejado)

# Read individual cells by (row, column) position: column 1 is used as the
# date and columns 2..16 as the 15 balls.
data = str(df_desejado.iloc[0, 1])
bola1 = str(df_desejado.iloc[0, 2])
bola2 = str(df_desejado.iloc[0, 3])
# the remaining balls follow the same pattern
bola15 = str(df_desejado.iloc[0, 16])

print()
print('data: ' + data)
print('bola 1: ' + bola1)
print('bola 2: ' + bola2)
print('bola 15: ' + bola15)

enter image description here

  • Thanks Gilmar, I was having trouble adding the results to lists, because the '\n' kept coming along and it was bothering me. I'll take a look at the code to see what I can do... Thanks!

  • Now I have to figure out how to enter the contest number and have it show me only the drawn balls.

  • You’re welcome.

0

Thank you for helping, Gilmar Vaz. Next I will study graphical interfaces so I can integrate one into the code.

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Lotofácil results history page.
url = "http://loterias.caixa.gov.br/wps/portal/loterias/landing/lotofacil/!ut/p/a1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOLNDH0MPAzcDbz8vTxNDRy9_Y2NQ13CDA0sTIEKIoEKnN0dPUzMfQwMDEwsjAw8XZw8XMwtfQ0MPM2I02-AAzgaENIfrh-FqsQ9wBmoxN_FydLAGAgNTKEK8DkRrACPGwpyQyMMMj0VAcySpRM!/dl5/d5/L2dBISEvZ0FBIS9nQSEh/pw/Z7_HGK818G0K85260Q5OIRSC42046/res/id=historicoHTML/c=cacheLevelPage/=/"
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")

# Every <tr> element found in the document.
todas_linhas = soup.find_all("tr")

# Build one 17-field list per valid row.
lista = []
for linha in todas_linhas:
    # Turn the row text into a list, one piece per line of text.
    partes = linha.text.split('\n')

    # Pieces 1..17 are the fields that are kept; piece 0 is always skipped
    # (presumably the empty text before the first cell -- verify on the page).
    campos = partes[1:18]

    # A row is valid only when all 17 fields exist and the first one is not
    # empty; anything past piece 17 is ignored.
    if len(campos) == 17 and campos[0] != "":
        lista.append(campos)

# Without at least one valid row there is no header to promote below.
if not lista:
    raise SystemExit("Nenhuma linha válida encontrada na página.")

# Turn the list into a dataframe
df = pd.DataFrame(lista)

# Take the first row
headers = df.iloc[0]

# Promote that row to column labels. A flat list yields a plain Index;
# wrapping the Series in a list would create a one-level MultiIndex instead.
df.columns = headers.tolist()

# Drop the first row because it became the header
df = df.iloc[1:]

# To print the first 5 records instead, use: print(df.head().to_string())

# Print the last 5 records (tail() defaults to 5)
print(df.tail().to_string())

# To print the whole dataframe instead, use: print(df.to_string())

# Print the number of rows and columns
print()
print('Linhas e Colunas Total:')
print(df.shape)

# Ask the user which contest to look up.
print()
print('Digite o Código do Concurso: ')
codigoConcurso = input()

# The row is selected by position: contest N is taken from row N-1 of df.
# NOTE(review): this assumes df holds one row per contest, in order, with no
# contest filtered out -- confirm against the scraped data.
posicao = int(codigoConcurso) - 1

# A zero or negative contest number would make iloc count from the end of the
# dataframe and silently return the wrong contest.
if posicao < 0:
    raise SystemExit('Código do concurso inválido: ' + codigoConcurso)

# Double brackets keep the result as a one-row dataframe.
df_desejado = df.iloc[[posicao]]

print(df_desejado)

# Column 1 of the selected row is used as the date.
data = str(df_desejado.iloc[0, 1])

print('-')

# Columns 2..16 of the selected row are the drawn values, still as strings.
resultadoString = df_desejado.iloc[0, 2:17].tolist()

# Convert the string values to int
resultadoFinal = [int(valor) for valor in resultadoString]

print()
print('data: ' + data)

## Games played by the user.
jogos = ([2,3,4,5,6,7,8,12,13,15,17,20,21,22,23], [5,6,7,8,9,10,11,12,16,17,19,20,22,23,24])

# Drawn numbers as a set, so each membership test is a single lookup.
sorteados = set(resultadoFinal)

## Check every game against the drawn numbers.
# Iterating over the games directly visits each one exactly once; a manual
# counter is easy to leave un-incremented or to run one past the last game.
for jogo in jogos:
    # One point for every number of the game that was drawn.
    pontos = sum(1 for numero in jogo if numero in sorteados)

    print("O jogo ", jogo, " conseguiu ", pontos, " pontos!")

enter image description here

Browse other questions tagged

You are not signed in. Login or sign up in order to post.