Fix Encoding Problem while exporting to csv from a scrapy file

Asked

Viewed 400 times

0

How can I fix an encoding problem when saving a file as CSV? The problem only happens when the data is saved to CSV. [image: enter image description here]

from scrapy import *
from projeto_iruan.items import *
import csv

class imprensaNacional(scrapy.Spider):
    """Spider that collects "AVISO DE LICITA..." notices from imprensanacional.gov.br.

    Each notice page is parsed in `parseAviso`; every record is appended to
    the semicolon-delimited file named by the `imprensaNacional` attribute
    and is also yielded as a dict item.

    All text fields are emitted as UTF-8 encoded byte strings (this code
    targets Python 2, where `csv.writer` works on byte strings).
    """

    name = 'imprensaNacional'
    start_urls = ['http://www.imprensanacional.gov.br/leiturajornal?data=11-09-2018&secao=dou3']
    # Path of the CSV file written by parseAviso.
    imprensaNacional = "imprensaNacional.csv"
    custom_settings = {
        # Must be the format *name*; the original passed the `csv` module object.
        'FEED_FORMAT': 'csv'
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and truncate the output CSV file.

        Positional/keyword arguments are forwarded to `scrapy.Spider` so that
        spider arguments keep working.
        """
        super(imprensaNacional, self).__init__(*args, **kwargs)
        # empty outputfile
        open(self.imprensaNacional, "wb").close()

    @staticmethod
    def _texto(no, consulta):
        """Return the first match of XPath `consulta` under `no`, or u'' if none."""
        return no.xpath(consulta).extract_first() or u''

    @staticmethod
    def _parte(partes, indice):
        """Return `partes[indice]` UTF-8 encoded, or '' when the index is missing."""
        if len(partes) > indice:
            return partes[indice].encode('utf-8')
        return ''

    def parse(self, response):
        """Yield one request per notice link found on the listing page."""
        url_base = 'http://www.imprensanacional.gov.br/'
        script = response.xpath('//*[@class="span8 hierarchy-wrapper"]//*[contains(text(),"AVISO DE LICITA")]')
        for urls in script:
            href = urls.xpath('.//@href').extract_first()
            if href is None:
                # Matching element without a link: nothing to follow.
                continue
            link_completo = url_base + href.encode('utf-8')
            yield Request(url=link_completo, callback=self.parseAviso)

    def parseAviso(self, response):
        """Extract the notice fields, append them to the CSV file and yield them.

        Fields whose XPath matches nothing are emitted as empty strings
        instead of aborting the whole page.
        """
        # Python 2's csv module expects a file opened in binary mode; text
        # mode produces doubled line breaks on platforms that translate "\n".
        with open(self.imprensaNacional, "ab") as f:
            writer = csv.writer(f, delimiter=";")
            conteudo = response.xpath('//*[@class="journal-content-article"]')
            for info in conteudo:
                # The "Nº" heading is evaluated once; title, pregao and uasg
                # are all derived from it.
                identifica = self._texto(info, u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]')
                partes = identifica.split()
                titulo = identifica.encode('utf-8')
                pregao = self._parte(partes, 3)
                uasg = self._parte(partes, 6)

                tipo = self._texto(info, './/*[@class="identifica"]/text()[contains(.,"AVISO")]').encode('utf-8')
                pregoeiro = self._texto(info, './/*[@class="assina"]/text()').encode('utf-8')
                descricao = self._texto(info, './/*[@class="dou-paragraph"]/text()').encode('utf-8')
                dou = self._texto(info, './/*[@class="dou-paragraph"]/text()[contains(.,"(")]').encode('utf-8')
                orgao = self._texto(info, './/*[@class="orgao-dou-data"]/text()').encode('utf-8')
                data_publicacao = self._texto(info, './/*[@class="publicado-dou-data"]/text()').encode('utf-8')
                edicao_dou = self._texto(info, './/*[@class="edicao-dou-data"]/text()').encode('utf-8')
                secao = self._texto(info, './/*[@class="secao-dou"]/text()').encode('utf-8')
                pagina = self._texto(info, './/*[@class="secao-dou-data"]/text()').encode('utf-8')

                writer.writerow([titulo, tipo, pregao, uasg, dou, data_publicacao, edicao_dou,
                                 secao, pagina, orgao, pregoeiro, response.url, descricao])
                # 'Pregoeiro' previously received `pregao` by mistake.
                yield {'Titulo': titulo, 'Tipo': tipo, 'Pregao': pregao, 'UASG': uasg,
                       'DOU': dou, 'DataPublicacao': data_publicacao, 'Edicao': edicao_dou,
                       'Secao': secao, 'Pagina': pagina, 'Orgao': orgao, 'Pregoeiro': pregoeiro,
                       'Url': response.url, 'Descricao': descricao}

1 answer

1

You are encoding your strings in UTF-8, but it seems that the spreadsheet program you are using to read them is not recognizing this encoding.

If you are using Excel, use the import dialog (instead of simply opening the file) so that you can select the UTF-8 encoding option. Another alternative is to try the cp1252 encoding, which is usually Excel's default encoding on English-language systems. Replace every .encode('utf-8') with .encode('cp1252') and see if that solves it.

As for the doubled lines: you are opening the file in text mode, and since Python 2's csv module expects a file opened in binary mode, you end up with duplicated line breaks. Change the line

with open(self.imprensaNacional, "a") as f:

to

with open(self.imprensaNacional, "ab") as f:

to use binary mode.

Browse other questions tagged

You are not signed in. Login or sign up in order to post.