[EDIT]
Yes, I used the URL you indicated and modified it "slightly".
import os
import lxml.html as parser
import requests
class DiarioOficial():
    """Download the PDF pages of one section of a Diario Oficial edition.

    The site is queried through ``header.asp`` (see ``_url``) and every page
    is stored as
    ``<pasta_diario_oficial>/<ano>/<mes>/<dia>/<caderno>/<secao>/<secao>_<page>.pdf``.
    """

    # NOTE(review): browser-like headers. No request issued by this class
    # sends them; confirm whether the site needs them before wiring them in.
    _header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/70.0.3538.77 Safari/537.36',
    }
    # Base endpoint; the query string is appended by each method.
    _url = 'http://diariooficial.imprensaoficial.com.br/nav_v5/header.asp?'

    def __init__(self, pasta_diario_oficial='diario_oficial'):
        """Remember the root folder under which the PDFs are saved."""
        self.pasta_diario_oficial = pasta_diario_oficial

    def get_cadernos(self, dia: str, mes: str, ano: str, caderno: str = ''):
        """List the cadernos (notebooks) of a date and look one up by name.

        Every caderno found is printed as ``<id>: <name>``. The search stops
        at the first caderno whose name is contained in ``caderno``.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            caderno (str): caderno name, e.g. Empresarial 2. Leave it empty
                to only list the available cadernos.

        Returns:
            The ID of the matching caderno, or None when none matches.
        """
        html_text = requests.get(
            f'{self._url}txtData={dia}/{mes}/{ano}&acao=1').content
        tree = parser.fromstring(html_text)
        print('Cadernos:\n')
        # The first <option> is skipped, exactly as the index-based loop did.
        for option in tree.xpath('//*[@id="edicao"]/option[position() >= 2]'):
            caderno_nome = option.text.strip()
            valores = option.attrib.values()
            if not valores:
                # An option without attributes carries no ID: stop listing.
                break
            caderno_numero = valores[0]
            print(f'{caderno_numero}: {caderno_nome}')
            if caderno_nome in caderno:
                return caderno_numero
        return None

    def _get_secoes(self, dia: str, mes: str, ano: str, edicao: str):
        """Fetch the sections of an edition and its total number of pages.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            edicao (str): edition ID, e.g. '29257'

        Returns:
            (list[list, str]): ``[list_secoes, pag_total]``. Each entry of
            ``list_secoes`` is the text of one ``pg`` option split on
            ``' .... '``; ``pag_total`` is the text that follows ``'de '`` in
            the page counter.
        """
        html_text = requests.get(
            f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&acao=1').content
        tree = parser.fromstring(html_text)
        # The first <option> is skipped, exactly as the index-based loop did.
        list_secoes = [
            option.text.split(' .... ')
            for option in tree.xpath('//*[@id="pg"]/option[position() >= 2]')
        ]
        paginas = tree.xpath('//*[@id="form"]/div[2]/div[1]/div/span')
        pag_total = paginas[0].text.split('de ')[1].strip()
        return [list_secoes, pag_total]

    def save_page(self, pdf_url, dia: str, mes: str, ano: str, caderno: str, secao: str, page_count: str):
        """Download ``pdf_url`` and write it as ``<secao>_<page_count>.pdf``.

        The destination folder is created here because ``secao`` may differ
        from the folder name prepared by the caller, in which case opening
        the file would fail.
        """
        pdf = requests.get(pdf_url).content
        destino = fr'{self.pasta_diario_oficial}/{ano}/{mes}/{dia}/{caderno}/{secao}'
        os.makedirs(destino, exist_ok=True)
        with open(fr'{destino}/{secao}_{page_count}.pdf', 'wb') as f:
            f.write(pdf)

    @staticmethod
    def _cria_dir(caminho):
        """Create ``caminho`` (and missing parents) unless it already exists."""
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(caminho, exist_ok=True)

    @staticmethod
    def _intervalos_secoes(list_secoes, pag_total):
        """Yield ``(nome_secao, inicial, final)`` for each section entry.

        ``inicial`` is the entry's own page number. ``final`` is the page
        number of the following entry plus one, or ``pag_total`` plus one for
        the last entry, so ``range(inicial, final)`` also covers the page on
        which the following entry starts. ``'____'`` markers are removed from
        the section name.
        """
        for proximo, val in enumerate(list_secoes, start=1):
            inicial = int(val[1])
            try:
                final = int(list_secoes[proximo][1]) + 1
            except IndexError:
                final = int(pag_total) + 1
            yield val[0].replace('____', ''), inicial, final

    def salvar_pdf(self, dia: str, mes: str, ano: str, caderno: str, secao: str):
        """Main entry point: save the PDFs of one section.

        Args:
            dia (str): day, e.g. 18
            mes (str): month, e.g. 06
            ano (str): year, e.g. 2020
            caderno (str): caderno name, e.g. "Empresarial 2".
                Pass an empty string to list the cadernos.
            secao (str): section name, e.g. "PROCURADORIA GERAL DO MUNICÍPIO".
                Pass an empty string to list the sections.
        """
        print('[!]Rodando...\n')
        self._cria_dir(fr'{self.pasta_diario_oficial}/{ano}/{mes}/{dia}/{caderno}/{secao}')
        edicao = self.get_cadernos(dia, mes, ano, caderno)
        if not edicao:
            return
        list_secoes, pag_total = self._get_secoes(dia, mes, ano, edicao)
        if not list_secoes:
            return
        print(f'\nSecoes:\n{list_secoes}\n')
        for nome_secao, inicial, final in self._intervalos_secoes(list_secoes, pag_total):
            # A section is selected when its name is contained in `secao`.
            if nome_secao not in secao:
                continue
            print('Salvando...')
            for item in range(inicial, final):
                html_text = requests.get(
                    f'{self._url}txtData={dia}/{mes}/{ano}&edicao={edicao}&pg={item}&acao=1').content
                tree = parser.fromstring(html_text)
                # The PDF address is the second quoted string of the first
                # <script> in the page head.
                pdf_url = tree.xpath('/html/head/script[1]/text()')[0].split("'")[3]
                print(pdf_url)
                self.save_page(pdf_url, dia, mes, ano, caderno, nome_secao, str(item))
if __name__ == '__main__':
    # Example run: save the pages of one section of the "Cidade" caderno.
    diario = DiarioOficial()
    diario.salvar_pdf(
        dia='26',
        mes='06',
        ano='2020',
        caderno='Cidade',
        secao='GABINETE DO PRESIDENTE',
    )
    # To only list the cadernos of that date:
    # print('mmm', diario.get_cadernos('26', '06', '2020', ''))
Your code works really well, but what you’re calling the section is actually the notebook. The section is inside the notebook and doesn’t appear in the link. In this case, I need to delimit a section within the Executive Notebook, and I believe this selection is done by JavaScript.
http://diariooficial.imprensaoficial.com.br/doflash/prototipo/2020/Junho/27/exec1/pdf/pg_0055.pdf
This section, on that day, begins on page 55. – A Neto
What you need is quite complex; let me see if I understand. Do you want a program that searches by parameters such as date, notebook and section name, e.g. ('27', '06', '2020', '____Secretary’s Office'), and saves all the PDFs for that section in a folder, or do you want to save all the "sections" of a specific day in separate files?
– Sinf0r0s0
In fact, the notebook and the section are always the same; I would only change the day. It would download the PDF for that section. I was able to understand the site, and I think it doesn’t need JavaScript. In this link:
http://diariooficial.imprensaoficial.com.br/nav_v5/header.asp?txtData=26/06/2020&cad=4&cedic=20200626&pg=1&acao=&edicao=&secao='
It changes every day, and you can find the list of sections and the respective page number for that day. Now I’m having trouble with the following: '____Secretary’s Office' is a subsection that has to come along with the first name, the one without the underscores. – A Neto