How to automate PDF download with Selenium?

Question

How to automate PDF download with Selenium?

Asked 5 years, 9 months ago

Viewed 232 times

-2

I’m trying to automate a daily entry to the Official Gazette,

http://www.imprensaoficial.com.br

find the date and the desired section, and download each page of that section as a PDF. The page numbers change every day because each section has more pages on some days and fewer on others.

I initially tried urllib, but because it is an ASP (.asp) site, I could not scrape it.

I am now trying Selenium, but it does not find the 'Section Name' field, where I need to select what I want to download.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Asker's attempt: open the Official Gazette home page and try to reach the
# 'Section Name' drop-down (id "pg").

# Start Chrome through the chromedriver binary at this path.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument — confirm against the installed Selenium version.
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get("http://www.imprensaoficial.com.br")
# Send ESC to the "okBuscaEdicao" element.
# Presumably this dismisses a pop-up shown on page load — TODO confirm.
element = driver.find_element_by_id("okBuscaEdicao")
element.send_keys(Keys.ESCAPE)
# Look the element up again and send RETURN.
# Presumably this submits the edition search — TODO confirm.
element = driver.find_element_by_id("okBuscaEdicao")
element.send_keys(Keys.RETURN)
# This is the lookup the question reports as failing.
# NOTE(review): the answer below reads id "pg" from nav_v5/header.asp, so the
# field presumably lives in a separate frame/document — verify on the site.
# NOTE(review): find_element_by_id was removed in later Selenium 4 releases;
# the replacement is find_element(By.ID, ...) — confirm.
element = driver.find_element_by_id("pg")

Inspecting the site, I found that the id of the Section Name field is 'pg', but Selenium does not find it.

1 answer

Browse other questions tagged python selenium selenium-webdriver

You are not signed in. Login or sign up in order to post.

by Sinf0r0s0 • **111** points · Answer 1 · 2020-06-28T14:22:22+00:00

[EDIT]

Yes, I used the URL you indicated and modified the code "slightly":

import os

import lxml.html as parser
import requests


class DiarioOficial:
    """Download pages of the Official Gazette (imprensaoficial.com.br) as PDFs.

    The gazette is organised in "cadernos" (editions); each caderno is split
    into named sections, and each page of a section is served as its own PDF.
    Downloaded files are stored under
    ``<pasta_diario_oficial>/<ano>/<mes>/<dia>/<caderno>/<secao>/``.
    """

    # Browser-like User-Agent sent with every request.
    _header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.77 Safari/537.36',
    }
    # Navigation page that holds the caderno ("edicao") and section ("pg")
    # drop-downs; query parameters are appended to it.
    _url = 'http://diariooficial.imprensaoficial.com.br/nav_v5/header.asp?'
    # Seconds to wait for the server before a request is abandoned.
    _timeout = 30

    def __init__(self, pasta_diario_oficial='diario_oficial'):
        """Store the root folder under which the PDFs are saved.

        Args:
            pasta_diario_oficial (str): root output folder.
        """
        self.pasta_diario_oficial = pasta_diario_oficial

    def _get(self, url):
        """Fetch *url* and return the raw response body.

        Sends the class User-Agent header, applies a timeout so a stalled
        server cannot hang the run, and raises on HTTP error statuses so an
        error page is never parsed or saved as if it were real content.
        """
        resposta = requests.get(url, headers=self._header, timeout=self._timeout)
        resposta.raise_for_status()
        return resposta.content

    def get_cadernos(self, dia: str, mes: str, ano: str, caderno: str):
        """Print the cadernos available on a date and look one up by name.

        Args:
            dia (str): day, e.g. '18'
            mes (str): month, e.g. '06'
            ano (str): year, e.g. '2020'
            caderno (str): caderno name, e.g. 'Empresarial 2'. Pass an empty
                string to just list every caderno.

        Returns:
            The ID (str) of the first caderno whose name is contained in
            ``caderno``, or None when nothing matches.
        """
        html_text = self._get(f'{self._url}txtData={dia}/{mes}/{ano}&acao=1')
        tree = parser.fromstring(html_text)
        print('Cadernos:\n')
        # The first <option> is skipped, as in the original index loop that
        # started at 2; all remaining options come from a single query.
        for opcao in tree.xpath('//*[@id="edicao"]/option[position() > 1]'):
            caderno_nome = (opcao.text or '').strip()
            # Read the ID by attribute name instead of relying on it being
            # the first attribute of the element.
            caderno_numero = opcao.get('value')
            if caderno_numero is None:
                continue
            print(f'{caderno_numero}: {caderno_nome}')
            # An empty label would be "contained" in any string, so it must
            # never count as a match.
            if caderno_nome and caderno_nome in caderno:
                return caderno_numero
        return None

    def _get_secoes(self, dia: str, mes: str, ano: str, edicao: str):
        """Fetch the sections of a caderno and its total page count.

        Args:
            dia (str): day, e.g. '18'
            mes (str): month, e.g. '06'
            ano (str): year, e.g. '2020'
            edicao (str): caderno ID, e.g. '29257'

        Returns:
            list: ``[list_secoes, pag_total]`` where ``list_secoes`` holds one
            list per section (the option text split on ' .... ') and
            ``pag_total`` is the total number of pages as a string.

        Raises:
            IndexError: if the page-count element is missing or its text does
                not contain 'de '.
        """
        html_text = self._get(
            f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&acao=1')
        tree = parser.fromstring(html_text)
        # First <option> skipped, as in the original; options without text
        # cannot describe a section and are ignored.
        list_secoes = [
            opcao.text.split(' .... ')
            for opcao in tree.xpath('//*[@id="pg"]/option[position() > 1]')
            if opcao.text
        ]
        paginas = tree.xpath('//*[@id="form"]/div[2]/div[1]/div/span')
        # The total is whatever follows 'de ' in the span text.
        pag_total = paginas[0].text.split('de ')[1].strip()
        return [list_secoes, pag_total]

    def save_page(self, pdf_url, dia: str, mes: str, ano: str, caderno: str, secao: str, page_count: str):
        """Download one PDF page and write it to the output tree.

        The target folder is created here, so the file can always be written
        regardless of which section name the caller resolved.

        Args:
            pdf_url (str): URL of the PDF page.
            dia (str): day, used in the output path.
            mes (str): month, used in the output path.
            ano (str): year, used in the output path.
            caderno (str): caderno name, used in the output path.
            secao (str): section name, used as folder and file-name prefix.
            page_count (str): page number, used as file-name suffix.
        """
        pdf = self._get(pdf_url)
        destino = fr'{self.pasta_diario_oficial}/{ano}/{mes}/{dia}/{caderno}/{secao}'
        self._cria_dir(destino)
        with open(f'{destino}/{secao}_{page_count}.pdf', 'wb') as f:
            f.write(pdf)

    @staticmethod
    def _cria_dir(caminho):
        """Create *caminho* (and missing parents); do nothing if it exists."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(caminho, exist_ok=True)

    def salvar_pdf(self, dia: str, mes: str, ano: str, caderno: str, secao: str):
        """Main entry point: save every PDF page of the requested section.

        Args:
            dia (str): day, e.g. '18'
            mes (str): month, e.g. '06'
            ano (str): year, e.g. '2020'
            caderno (str): caderno name, e.g. "Empresarial 2".
                Pass an empty string to list the cadernos.
            secao (str): section name, e.g. "PROCURADORIA GERAL DO MUNICÍPIO".
                Pass an empty string to list the sections.
        """
        print('[!]Rodando...\n')
        edicao = self.get_cadernos(dia, mes, ano, caderno)
        if not edicao:
            return
        list_secoes, pag_total = self._get_secoes(dia, mes, ano, edicao)
        if not list_secoes:
            return
        print(f'\nSecoes:\n{list_secoes}\n')
        for idx, val in enumerate(list_secoes):
            nome_secao = val[0].replace('____', '')
            # Match before parsing page numbers so a malformed, unrelated
            # entry cannot abort the run; an empty name never matches.
            if not nome_secao or nome_secao not in secao:
                continue
            inicial = int(val[1])
            try:
                # The range runs through the first page of the next section
                # (hence the +1 on an exclusive upper bound).
                final = int(list_secoes[idx + 1][1]) + 1
            except IndexError:
                # Last section: it runs to the final page of the caderno.
                final = int(pag_total) + 1
            print('Salvando...')
            for item in range(inicial, final):
                html_text = self._get(
                    f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&pg={item}&acao=1')
                tree = parser.fromstring(html_text)
                # The PDF URL is the second single-quoted string in the first
                # <script> of <head>.
                pdf_url = tree.xpath('/html/head/script[1]/text()')[0].split("'")[3]
                print(pdf_url)
                # Files go under the site's section name; save_page creates
                # that folder, so it need not equal the caller's `secao`.
                self.save_page(pdf_url, dia, mes, ano, caderno, nome_secao, str(item))


if __name__ == '__main__':
    # Example run: save the 'GABINETE DO PRESIDENTE' section of the 'Cidade'
    # caderno published on 26/06/2020.
    diario = DiarioOficial()
    diario.salvar_pdf(
        dia='26',
        mes='06',
        ano='2020',
        caderno='Cidade',
        secao='GABINETE DO PRESIDENTE',
    )
    # To only list the cadernos available on a date, call get_cadernos with
    # an empty caderno name, e.g. diario.get_cadernos('26', '06', '2020', '').