How to automate PDF download with Selenium?


I’m trying to automate a daily visit to the Official Gazette,

http://www.imprensaoficial.com.br

find the date and the desired section, and download each page of that section as a PDF. The page numbering changes every day, because each section has more pages on some days and fewer on others.

I initially tried urllib, but since the site is built on .asp I could not scrape it that way.

I am now trying Selenium, but it does not identify the 'Section Name' field, where I would need to select what I want to download.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get("http://www.imprensaoficial.com.br")

# Find the edition-search button and send ESC, then ENTER, to it
element = driver.find_element_by_id("okBuscaEdicao")
element.send_keys(Keys.ESCAPE)
element = driver.find_element_by_id("okBuscaEdicao")
element.send_keys(Keys.RETURN)

# Fails here: the 'Section Name' <select> with id="pg" is never found
element = driver.find_element_by_id("pg")

Inspecting the site, I found that the id of the 'Section Name' field is 'pg', but Selenium does not find it.
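I suspect the field may live inside a frame/iframe, or only appear after the edition search runs, so the top-level document never contains id="pg". This is a minimal sketch of the frame-switching I would try, assuming an iframe is involved (I have not confirmed that this is actually the page structure):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get("http://www.imprensaoficial.com.br")
wait = WebDriverWait(driver, 10)

# Assumption: the search form (and the 'pg' select) is rendered inside a
# frame/iframe, so the default content never contains id="pg".
wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))

# Wait for the 'Section Name' select to show up inside the frame
section_select = wait.until(EC.presence_of_element_located((By.ID, "pg")))

# Switch back to the top-level document when done
driver.switch_to.default_content()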

1 answer



[EDIT]

Yes, I used the URL you indicated and modified it "slightly":

import os

import lxml.html as parser
import requests


class DiarioOficial():
    _header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.77 Safari/537.36',
    }
    _url = 'http://diariooficial.imprensaoficial.com.br/nav_v5/header.asp?'

    def __init__(self, pasta_diario_oficial='diario_oficial'):
        self.pasta_diario_oficial = pasta_diario_oficial


    def get_cadernos(self, dia: str, mes: str, ano: str, caderno: str):
        """Retrieves the cadernos and their respective IDs.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            caderno (str): caderno name, e.g. Empresarial 2
                (call with an empty string to list the available cadernos)

        Returns:
            (str): ID of the matching caderno
        """
        html_text = requests.get(
            f'{self._url}txtData={dia}/{mes}/{ano}&acao=1').content
        tree = parser.fromstring(html_text)
        count = 2
        print('Cadernos:\n')
        while True:
            try:
                list_element = tree.xpath(f'//*[@id="edicao"]/option[{count}]')
                caderno_nome = list_element[0].text.strip()
                caderno_numero = list_element[0].attrib.values()[0]
                print(f'{caderno_numero}: {caderno_nome}')
                if caderno_nome in caderno:
                    return caderno_numero
            except IndexError:
                break
            count += 1

    def _get_secoes(self, dia: str, mes: str, ano: str, edicao: str):
        """Retrieves the sections and the total number of pages of an edition.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            edicao (str): edition ID, e.g. '29257'

        Returns:
            (list[list, str]): a list of sections and the total page count as a string
        """
        list_secoes = []
        html_text2 = requests.get(
            f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&acao=1').content
        tree1 = parser.fromstring(html_text2)
        count = 2
        while True:
            try:
                list_element = tree1.xpath(f'//*[@id="pg"]/option[{count}]')
                numero = list_element[0].text
                list_secoes.append(numero.split(' .... '))
            except IndexError:
                break
            count += 1
        paginas = tree1.xpath('//*[@id="form"]/div[2]/div[1]/div/span')
        pag_total = paginas[0].text.split('de ')[1].strip()
        return [list_secoes, pag_total]

    def save_page(self, pdf_url, dia: str, mes: str, ano: str, caderno: str, secao: str, page_count: str):
        pdf = requests.get(pdf_url).content
        with open(fr'{self.pasta_diario_oficial}/{ano}/{mes}/{dia}/{caderno}/{secao}/{secao}_{page_count}.pdf',
                  'wb') as f:
            f.write(pdf)

    @staticmethod
    def _cria_dir(caminho):
        if not os.path.exists(caminho):
            os.makedirs(caminho)

    def salvar_pdf(self, dia: str, mes: str, ano: str, caderno: str, secao: str):
        """Main routine: downloads and saves the PDFs of a section.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            caderno (str): caderno name, e.g. "Empresarial 2".
                Call with an empty string to list the available cadernos.
            secao (str): section name, e.g. "PROCURADORIA GERAL DO MUNICÍPIO".
                Call with an empty string to list the available sections.
        """
        print('[!] Running...\n')
        self._cria_dir(fr'{self.pasta_diario_oficial}/{ano}/{mes}/{dia}/{caderno}/{secao}')
        edicao = self.get_cadernos(dia, mes, ano, caderno)
        if edicao:
            list_secoes, pag_total = self._get_secoes(dia, mes, ano, edicao)
            if list_secoes:
                print(f'\nSections:\n{list_secoes}\n')
                for idx, val in enumerate(list_secoes):
                    # print(val[0])
                    idx += 1
                    inicial = int(val[1])
                    try:
                        final = int(list_secoes[idx][1]) + 1
                    except IndexError:
                        final = int(pag_total) + 1
                    nome_secao = val[0].replace('____', '')
                    if nome_secao in secao:
                        print('Saving...')
                        for item in range(inicial, final):
                            # print(item)
                            html_text3 = requests.get(
                                f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&pg={item}&acao=1').content
                            tree2 = parser.fromstring(html_text3)
                            paginas1 = tree2.xpath('/html/head/script[1]/text()')[0].split("'")[3]
                            print(paginas1)
                            self.save_page(paginas1, dia, mes, ano, caderno, nome_secao, str(item))


if __name__ == '__main__':
    diario = DiarioOficial()
    diario.salvar_pdf('26', '06', '2020', 'Cidade', 'GABINETE DO PRESIDENTE')
    # print('mmm', diario.get_cadernos('26', '06', '2020', ''))
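
For reference, the same entry point can also be used just to list what is available on a given date, as the docstrings describe. A quick sketch, continuing with the diario object created above (the date and caderno here are only example values):

# List the cadernos available on a date (prints them, saves nothing)
diario.salvar_pdf('26', '06', '2020', '', '')

# List the sections of a caderno on that date (prints them, saves nothing)
diario.salvar_pdf('26', '06', '2020', 'Cidade', '')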
  • Your code works pretty well, but what you are calling the section is actually the caderno. The section lives inside the caderno and does not appear in the link. In my case I need to delimit a section inside the Executive caderno, and that selection seems to be done by JavaScript. For example, http://diariooficial.imprensaoficial.com.br/doflash/prototipo/2020/Junho/27/exec1/pdf/pg_0055.pdf is where that section starts on that day: page 55.

  • What you need is fairly complex, so let me check that I understand: do you want a program that takes parameters such as date, caderno and section name, e.g. ('27', '06', '2020', '____Secretary’s Office'), and saves all the PDFs of that section into a folder, or do you want to save all the sections of a given day into separate files?

  • Actually, the caderno and the section are always the same; only the day changes. The program would download the PDFs of that section. I managed to understand the site and I don’t think JavaScript is needed. The link http://diariooficial.imprensaoficial.com.br/nav_v5/header.asp?txtData=26/06/2020&cad=4&cedic=20200626&pg=1&acao=&edicao=&secao= changes every day, and it lists the sections with the page where each one starts on that day. The problem I’m having now is that '____Secretary’s Office' is a subsection whose name has to be combined with its parent section’s name, without the underscores (see the sketch after these comments).
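
Based on the per-page PDF link quoted in the first comment, here is a minimal sketch of downloading one page directly with requests. The path pieces (year, Portuguese month name, day, the exec1 folder and the zero-padded page number) are assumptions read off that single example URL, not a documented pattern, so verify them against the links header.asp actually produces:

import requests

# Assumption: the per-page PDF URL follows the pattern seen in the comment above
pdf_url = ('http://diariooficial.imprensaoficial.com.br/doflash/prototipo/'
           '2020/Junho/27/exec1/pdf/pg_0055.pdf')

resp = requests.get(pdf_url, timeout=30)
resp.raise_for_status()  # fail loudly if that page does not exist for the day

with open('exec1_pg_0055.pdf', 'wb') as f:
    f.write(resp.content)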
