Empty Dictionary Print - Web scraping/python/xpath

Viewed 101 times

Guys, I can't understand why the result of this scrape comes out as an empty dictionary. Could someone help me understand what my mistake is?

import requests 
from lxml import html

quimicos = []

resp = requests.get(url="https://www.chemicalbook.com/ProductCASList_12_0_EN.htm", headers ={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36' })

tree = html.fromstring(html=resp.content)

Linhas = tree.xpath("//table[@id='ContentPlaceHolder1_ProductClassDetail']/tbody/tr") 


for linha in Linhas:
    l = { 
    'Agente' : linha.xpath(".//td[2]/a/text()"), 
    'CAS' : linha.xpath(".//td[3]/a/text()") 
    }
    quimicos.append(l)

print(len(quimicos))

1 answer

Take a look at this code and see if you understand it. If you have any questions, just ask.

import requests
from lxml import html

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}


def get_data(url_total):
    resp = requests.get(url=url_total, headers=HEADERS)
    tree = html.fromstring(html=resp.content)
    tr_elements = tree.xpath('//tr')  # every table row on the page
    total = 0
    col = []
    for t in tr_elements:
        total += 1
        name = t.text_content().strip()
        print('%d:"%s"' % (total, name))
        col.append((name, []))
    return col


def main():
    url = "https://www.chemicalbook.com"
    resto_url = "/ProductCASList_12_0_EN.htm"
    resp = requests.get(url=url + resto_url, headers=HEADERS)
    tree = html.fromstring(html=resp.content)
    # links to the remaining pages of the listing (pagination block)
    proximos = tree.xpath('//*[@id="form1"]/div[2]/div[9]//a/@href')
    get_data(url + resto_url)
    for p in proximos:
        url_total = url + p
        get_data(url_total)


if __name__ == '__main__':
    main()

Using the Beautifulsoup library:

from bs4 import BeautifulSoup
import requests
import json


def get_data(cell, contador, resposta_parcial):
    # Column 0 (the row index) is skipped; columns 1-3 hold the
    # chemical name, the CAS number, and the molecular formula.
    if contador == 0:
        pass
    elif contador == 1:
        resposta_parcial.append(json.dumps("Chemical Name:" + cell.text.strip()))
        print("Chemical Name", str(cell.text))
    elif contador == 2:
        resposta_parcial.append(json.dumps("CAS:" + cell.text.strip()))
        print("CAS", str(cell.text))
    elif contador == 3:
        resposta_parcial.append(json.dumps("MF:" + cell.text.strip()))
        print("MF", str(cell.text))


def main():
    resposta_total = []
    page_url = 'https://www.chemicalbook.com/ProductCASList_12_0_EN.htm'
    req = requests.get(page_url)
    soup = BeautifulSoup(req.text, 'html.parser')

    tables = soup.find_all('table')

    for t in tables:
        rows = t.find_all('tr', recursive=False)
        for row in rows:
            cells = row.find_all(['td'], recursive=False)
            contador = 0
            resposta_parcial = []  # one record per table row
            for cell in cells:
                get_data(cell, contador, resposta_parcial)
                contador += 1
                if contador == 4:
                    contador = 0  # restart the column counter every four cells
            resposta_total.append(resposta_parcial)

    for r in resposta_total:
        print(r)


if __name__ == '__main__':
    main()
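The comments below report a UnicodeEncodeError ('charmap' codec) when running this on Windows. That error usually comes from print() sending a non-ASCII character (such as '\x8c', which is Œ in cp1252) to a console using a legacy code page. A minimal sketch of two common workarounds, assuming Python 3.7+ for reconfigure and a hypothetical output filename:

```python
import sys

# Workaround 1 (Python 3.7+): force UTF-8 on standard output, so print()
# no longer fails on characters outside the console's code page.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

nome = "\u0152sophage"  # starts with Œ, the character behind the reported error
print(nome)

# Workaround 2: write the results to a file opened explicitly as UTF-8
# instead of printing them, which sidesteps the console encoding entirely.
with open("saida.txt", "w", encoding="utf-8") as f:
    f.write(nome + "\n")
```

Either approach keeps the scraping logic unchanged; only the output path moves off the default console encoding.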
  • Good morning, Vinicius. Thank you very much for your attention. The code you reworked seems more logical and structured; I found it very good! But when running it I got some errors and tried to share them here, but they exceed the character limit: Traceback (most recent call last): (lines 33, 29, 15 and 19 - I shortened it to fit) return codecs.charmap_encode(input,self.errors,encoding_table)[0] UnicodeEncodeError: 'charmap' codec can't encode character '\x8c' in position 16: character maps to <undefined>

  • What version of Python are you using? This error is probably about special characters..

  • I'm using 3.7.4

  • I'm using 3.6... so I believe that's not it.. I tested this code on Linux; the error could be something Windows-specific.. I couldn't replicate it on my machine.. If I discover anything I'll let you know.. Take a look at this answer too: https://stackoverflow.com/questions/27092833/unicodeencorror-charmap-codec-cant-encode-characters

  • I will look. Thank you for your attention, Vinicius!

  • Run this code and see which encoding your request is using: print("Encoding: " + str(resp.encoding)). The result should be utf-8. See: https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8

  • I ran it and got exactly that: Encoding: utf-8

  • I found the mistake! It was just this part: it was like this ('//*[@id='Form1']/div[2]/div[9]//a/@href') and I changed it to look like this ("//*[@id='Form1']/div[2]/div[9]//a/@href"). After I changed it, it worked!!! Man, what happiness hahahahaha :DDD I am now trying to save the file in JSON format

  • The outer single quotes were conflicting with the inner ones.. You can use either of these two forms: ('//*[@id="form1"]/div[2]/div[9]//a/@href') or ("//*[@id='form1']/div[2]/div[9]//a/@href")

  • Got it!! Thank you very much, Vinicius!! I merged my old code with this new one to try to generate the JSON file, but I'm not managing it. I'll post more if I can

  • Please, Vinicius, could you help me generate the file in JSON or CSV as shown on the site? I intend to do data processing on these files in Power BI

  • I updated the answer, Marcelo. Take a look at the version with the BeautifulSoup library

  • Awesome!!! I'll take a look, yes!! Thank you very much!!!
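On the JSON/CSV question from the comments: once the scraped rows are collected as a list of dicts (like the question's quimicos list), the standard json and csv modules can write files that Power BI imports directly. A minimal sketch, with hypothetical sample data and filenames standing in for the real scraped rows:

```python
import csv
import json

# Hypothetical sample of scraped rows; in practice this would be the
# `quimicos` list built while iterating the table rows.
quimicos = [
    {'Agente': 'Acetone', 'CAS': '67-64-1'},
    {'Agente': 'Benzene', 'CAS': '71-43-2'},
]

# JSON: dump the whole list; ensure_ascii=False keeps accented names readable.
with open('quimicos.json', 'w', encoding='utf-8') as f:
    json.dump(quimicos, f, ensure_ascii=False, indent=2)

# CSV: header row from the dict keys, then one line per chemical.
with open('quimicos.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Agente', 'CAS'])
    writer.writeheader()
    writer.writerows(quimicos)
```

Both files load in Power BI via Get Data (JSON or Text/CSV); CSV is usually the simpler of the two for flat tables like this one.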
