I want to remove all text that has % of a PDF. but it does not return me the result

Question

I want to remove all text that has % of a PDF. but it does not return me the result

Asked 4 years, 9 months ago

Viewed 30 times

0

I am trying to get back an administration rate of several PDF’s, however these PDF’s do not follow a standard, so I want to bring all the percentages that contain in this PDF, so the final file would have: FILE NAME; CNPJ; FEE

import json
from logging import PercentStyle
import os
import threading
import re
from pdfminer.high_level import extract_text

class Worker(threading.Thread):

    def __init__(self, queue, base_dir, logger):
        super(Worker, self).__init__()

        # Inicializa as variáveis locais
        self.my_queue = queue
        self.logger = logger
        self.base_dir = base_dir

        self.ptn_cnpj = re.compile(r'[0-9]{2}\.?[0-9]{3}\.?[0-9]{3}\/?[0-9]{4}\-?[0-9]{2}')

    def run(self):

        while True:
            # Imprime log para o usuário
            # self.logger.info(f"Aguardando arquivo para processamento.")

            # Busca os dados da fila de execução
            try:
                file_to_proccess = self.my_queue.get(timeout=5)
            except:
                file_to_proccess = None

            if file_to_proccess is None:
                continue

            # Transforma o retorno da fila em formato json
            file_to_proccess = json.loads(file_to_proccess)

            # Verifica se foi enviada ordem de parada do programa
            if 'end' in file_to_proccess:
                self.logger.warning(f"Fechamento solicitado.")
                break
            # ---

            # Verifica se os dados são válidos
            if len(file_to_proccess) > 0:
                # Chama a rotina de processamento do e-mail
                self.__proccess_file(file_to_proccess)
            # ---

            # Atualiza a situação da fila para "concluído"
            self.my_queue.task_done()

    def __proccess_file(self, file_to_proccess):
        # Verifica se existe a propriedade "name" no json
        if 'name' not in file_to_proccess:
            # self.logger.error('Processo sem arquivo informado, ignorado.')
            return None
        # ---

        # Busca o texto do arquivo
        text = self.__extract_text(file_to_proccess['name'])

        # Verifica se houve retorno da extração
        if text:
            fileoutput = os.path.join(self.base_dir, 'output/',
                                      f"processo_{str(threading.currentThread().getName())}.txt")

            with open(fileoutput, "a+") as fn:
                fn.write(f"{text}\n")

            self.logger.info(f'Arquivo "{file_to_proccess["name"]}" processado com sucesso.')
        # ---

        # Move o arquivo para outra pasta
        try:
            from_name = os.path.join(self.base_dir, 'pending/', file_to_proccess['name'])
            to_name = os.path.join(self.base_dir, 'done/', file_to_proccess['name'])
            os.replace(from_name, to_name)
        except Exception as e:
            self.logger.error(f'Erro ao mover o arquivo "{file_to_proccess["name"]}": {str(e)}')
        # ---

    def __extract_text(self, filename):
        # Verifica se o arquivo é um PDF
        if filename.lower().endswith(".pdf"):
            # Realiza a extração do texto do PDF
            try:
                full_filename = os.path.join(self.base_dir, 'pending/', filename)
                text = extract_text(full_filename).replace('\n', '')
            except Exception as e:
                self.logger.error(f'Erro na extração de dados do PDF "{filename}".')
                return None
            # ---

            # Verifica se existe a expressão "cédula"

            # print(text)
            
                   
               

            if text[0:6] == 'Cédula':
                try:
                    credora_cnpj = re.findall(r'[0-9]{2}\.?[0-9]{3}\.?[0-9]{3}\/?[0-9]{4}\-?[0-9]{2}', text)[0].strip()
                except IndexError:
                    credora_cnpj = ''
            else:
                try:
                    credora_cnpj = re.findall(self.ptn_cnpj, text)[0].strip()
                except IndexError:
                    credora_cnpj = ''
            
                texto = 'taxa de administração que equivalerá a %, ou taxa de gestão é %'
            

                lista = texto.split('%')

                porcents = []
            
                for item in lista[:-1]:
                  texto_da_taxa = item.split(" ")[-1] + '%'
                porcents.append(texto_da_taxa)
                if len(texto_da_taxa)==0:
                   porcents.append('Não foi encontrado nenhuma taxa de administração')
            
            
            # --- (Fim do verifica se existe a expressão "cédula")

            # final_text = "{};{};{};{};{};{};{};{};{};{};".format(filename, credora_nome2, credora_cnpj, emitente_nome,
            #                                                      emitente_doc, emiss, venc, principal, prazo, parcela)

            final_text = "{};{};{};".format(filename, credora_cnpj, texto_da_taxa)
            return final_text
        else:
            self.logger.error(f'O arquivo "{filename}", não é um PDF.')
            return None
        # --- (Fim do verifica se o arquivo é um PDF)

You have at least one sample of the pdf file that you intend to use as input in your black box ?

– Lacobus

2021/07/16 at 22:26
Yes, I will leave the link to a PDF: http://sistemas.cvm.gov.br/docsrecebidos/20201223143628UP8e71ac9ffdcf47bb878ef13c4bad9290.pdf

– Caio Pereira

2021/07/17 at 14:16

No answers

Browser other questions tagged python python-3.x

You are not signed in. Login or sign up in order to post.