How to avoid the "Max retries exceeded" error when scraping in Python?

In Python 3 I wrote a program to scrape table rows from a public website with many pages (97,893). For each page I build a list of dictionaries with the values of each column, and I added a random sleep between requests to try to keep the scraping from being interrupted, but even using it several times it still stops.

The starting page is this one: http://www.portaltransparencia.gov.br/PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina=1

from bs4 import BeautifulSoup
import requests
import pandas as pd
import random
from time import sleep

def sopa(link):
    # download a page and return all <tr> rows of its second table
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    table = soup.select("table")[1]
    conjunto = table.findAll("tr")
    return conjunto

planilha = []

for i in range(1, 97893):
    link = "http://www.portaltransparencia.gov.br/PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina="
    link = link + str(i)
    print(link)
    conjunto = sopa(link)
    sleep(random.uniform(0.2, 10))  # random pause to avoid hammering the server
    conta = 0
    for linha in conjunto:
        if conta > 0:  # skip the header row
            documento = linha.find("td", {"class": "firstChild"}, {"style": "white-space: nowrap;"}).text.strip()
            nome = linha.find("a").text.strip()
            valor = linha.find("td", {"class": "colunaValor"}).text.strip()
            dicionario = {"documento": documento, "nome": nome, "valor": valor}
            planilha.append(dicionario)
        conta = conta + 1

It stopped at page 686 with these error messages:

gaierror                                  Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in _new_conn(self)
    140             conn = connection.create_connection(
--> 141                 (self.host, self.port), self.timeout, **extra_kw)
    142 

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     59 
---> 60     for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
     61         af, socktype, proto, canonname, sa = res

/usr/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    742     addrlist = []
--> 743     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    744         af, socktype, proto, canonname, sa = res

gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    356         else:
--> 357             conn.request(method, url, **httplib_request_kw)
    358 

/usr/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/usr/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/usr/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/usr/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in connect(self)
    165     def connect(self):
--> 166         conn = self._new_conn()
    167         self._prepare_conn(conn)

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in _new_conn(self)
    149             raise NewConnectionError(
--> 150                 self, "Failed to establish a new connection: %s" % e)
    151 

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    638             retries = retries.increment(method, url, error=e, _pool=self,
--> 639                                         _stacktrace=sys.exc_info()[2])
    640             retries.sleep()

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    387         if new_retry.is_exhausted():
--> 388             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    389 

MaxRetryError: HTTPConnectionPool(host='www.portaltransparencia.gov.br', port=80): Max retries exceeded with url: /PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina=686 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known',))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-4-590ac6d45255> in <module>()
      3     link = link + str(i)
      4     print(link)
----> 5     conjunto = sopa(link)
      6     sleep(random.uniform(0.2, 10))
      7     conta = 0

<ipython-input-2-7aefd26bf83b> in sopa(link)
      1 def sopa(link):
----> 2     res = requests.get(link)
      3     soup =  BeautifulSoup(res.text, "lxml")
      4     table = soup.select("table")[1]
      5     conjunto = table.findAll("tr")

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    506                 raise SSLError(e, request=request)
    507 
--> 508             raise ConnectionError(e, request=request)
    509 
    510         except ClosedPoolError as e:

ConnectionError: HTTPConnectionPool(host='www.portaltransparencia.gov.br', port=80): Max retries exceeded with url: /PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina=686 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known',))

Do I need much longer pauses between requests to do this scraping? Or could the problem be the quality of my connection?

1 answer

I ran the program below myself; it took almost 2 hours (even with 100 threads), but the data is here: https://we.tl/kAUuAeW9gR (the link won't last long)

I can't give a definitive answer to the question in the title, because the failure could have any number of causes, but I can help improve the code:

Since there are 97,893 pages, it is very impractical to make all the requests in a single thread: even at one request per second it would take more than 27 hours.
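
The 27-hour figure follows directly from the page count (a quick sanity check, assuming one request per second and ignoring parsing time):

pages = 97893               # total number of result pages
seconds_per_request = 1     # optimistic: one request per second, parsing time ignored
print(pages * seconds_per_request / 3600)  # ≈ 27.2 hours if done sequentially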

It's also very important to store the results right away in a file or database, so you don't have to run the program many more times (it's a heavy and time-consuming job); the next time you need the data you just open the file and it is already there.
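
A minimal sketch of that idea (this is not what the code below does, which keeps everything in memory and writes one JSON file at the end): append each page's rows to a CSV file as soon as they are parsed. The file name and the salvar_linhas helper are just illustrative:

import csv, os

CAMPOS = ["documento", "nome", "valor"]  # same keys as the dictionaries built above

def salvar_linhas(linhas, caminho="resultado.csv"):
    # append already-parsed rows to a CSV file, writing the header only once
    escrever_cabecalho = not os.path.exists(caminho)
    with open(caminho, "a", newline="", encoding="utf-8") as f:
        escritor = csv.DictWriter(f, fieldnames=CAMPOS)
        if escrever_cabecalho:
            escritor.writeheader()
        escritor.writerows(linhas)

# e.g. call salvar_linhas(linhas_da_pagina) right after parsing each page

If many threads write to the same file, you would also need a threading.Lock around the append, or a dedicated writer thread fed by a queue.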

I've written many web crawlers, and with such a large number of requests I always use threads, which in this case greatly reduced the execution time. Keeping the part you wrote to actually parse the HTML (BeautifulSoup), I changed a few things (the code below uses 100 threads):

PS: I didn't need to use random:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import random
import threading, queue, json
from time import sleep

def sopa(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    table = soup.select("table")[1]
    conjunto = table.findAll("tr")
    return conjunto

def p_manager(p_q): # function responsible for the prints; it consumes the p_q queue defined below
    while True:
        msg = p_q.get()
        print(msg)
        p_q.task_done()


def handle_reqs(work):
    total_w = len(work)
    while work:
        i = work.pop(0) # popping as we go frees memory
        link = "http://www.portaltransparencia.gov.br/PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina={}".format(i)
        p_q.put('[+] {}/{} - getting: {}'.format(len(work), total_w, link))
        conjunto = sopa(link)
        conta = 0
        for linha in conjunto:
            if conta > 0:
                documento = linha.find("td", {"class": "firstChild"}, {"style": "white-space: nowrap;"}).text.strip()
                nome = linha.find("a").text.strip()
                valor = linha.find("td", {"class": "colunaValor"}).text.strip()
                dicionario = {"documento": documento, "nome": nome, "valor": valor}
                planilha.append(dicionario)
            conta = conta + 1
    if(threading.active_count() <= 3): # if only 3 threads remain (this one, the main thread and the print daemon), the scraping is done
        data_q.put(True) # it finished, so send the signal to unblock the main thread and write the file

p_q = queue.Queue() # handles the prints, so the worker threads are not slowed down by print system calls
t = threading.Thread(target=p_manager, args=(p_q,))
t.daemon = True # daemon: the program can exit even if this thread still has pending work
t.start() # start it

data_q = queue.Queue() # tracks the end of the scraping
planilha = []
num_threads = 100 # we will use 100 threads
works = [list(range(1, 97893))[i::num_threads] for i in range(num_threads)] # prepare the work for each thread
for w in works: # divide the work among the threads
    threading.Thread(target=handle_reqs, args=(w,)).start() # start each one

data_q.get() # block until the signal arrives, then continue the program

with open('tests.txt', 'w') as f:
    json.dump(planilha, f, sort_keys=True, indent=4)
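
On the error in the title itself: the traceback shows the immediate cause is a failed DNS lookup (gaierror: Name or service not known), so pauses alone won't prevent it. A minimal sketch of one way to make each request more tolerant of this kind of transient failure is a requests.Session with urllib3's Retry mounted on an HTTPAdapter; the retry counts, backoff factor and timeout below are assumptions, not values tested against this site:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def criar_sessao():
    # session that retries failed connections with exponentially increasing waits
    retry = Retry(
        total=5,              # at most 5 retries per request (assumed value)
        connect=5,            # also counts connection/DNS failures
        backoff_factor=1,     # exponentially increasing wait between attempts
        status_forcelist=[429, 500, 502, 503, 504],  # retry these HTTP status codes too
    )
    sessao = requests.Session()
    sessao.mount("http://", HTTPAdapter(max_retries=retry))
    sessao.mount("https://", HTTPAdapter(max_retries=retry))
    return sessao

# usage inside sopa(), instead of requests.get(link):
# sessao = criar_sessao()
# res = sessao.get(link, timeout=30)

Even then it is worth wrapping the call in try/except requests.exceptions.ConnectionError, so that a page that keeps failing is logged and skipped instead of killing the whole thread.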
  • Thank you very much for your attention

  • You're welcome @Reinaldochaves

  • I ran the program on my machine. After two hours it stopped at "Pagina=97845" and has not moved on to the next link for about 30 minutes now. Could it be a problem with my connection?

  • I also opened the JSON file that was kindly made available. I couldn't find some names, such as KATIA DALILA LIMA PROTASIO

  • It could be, @Reinaldochaves... Not finding them is probably due to the HTML parsing; some page must have an unexpected tag. Restart the program and try again

  • Hello. All good? I'm trying to adapt this code for another big scraping job. Can I post a new question, @Miguel?

  • Hello @Reinaldochaves, you can ask another question and let me know here and I'll try to help

  • Thank you. I posted it here: https://answall.com/questions/291032/em-raspagens-grandes-comorevitar-connectionerror

  • And here's the attempt with threads: https://gist.github.com/reichaves/d0b3aee99d0310f73045b20150639985
