I tested this and changed the structure to use threads: there are too many pages to fetch with just one thread, unless you have plenty of time.
In my opinion, you should either choose which columns to store in the CSV or simply keep everything in a JSON file, because the data is nested and does not fit well into a flat table. You can adjust this later as you wish:
import json
import queue
import threading
from time import sleep

import pandas as pd
import requests
def get_req(i, timeout=30):
    """Fetch one page of RJ search results from the CNJ BNMP endpoint.

    The request is retried, with a one-second pause between attempts, until
    the server answers with a body that decodes as JSON.

    Args:
        i: Page number, sent as ``paginador.paginaAtual``.
        timeout: Seconds to wait on the server before the attempt is treated
            as failed and retried. Without it a stalled connection would
            block the calling thread forever.

    Returns:
        The decoded JSON response body.
    """
    url = 'http://www.cnj.jus.br/bnmp/rest/pesquisar'
    payload = {
        "criterio": {
            "orgaoJulgador": {"uf": "RJ", "municipio": "", "descricao": ""},
            "orgaoJTR": {},
            "parte": {"documentos": [{"identificacao": ""}]},
        },
        "paginador": {"paginaAtual": i},
        "fonetica": "true",
        "ordenacao": {"porNome": False, "porData": False},
    }
    # Retry in a loop rather than by recursion: during a long outage the
    # recursive form keeps growing the call stack until RecursionError.
    while True:
        try:
            return requests.get(url, json=payload, timeout=timeout).json()
        except Exception as err:
            # Any failure (connection reset, timeout, non-JSON body) is
            # reported and retried, as in the original best-effort design.
            print(err)
            sleep(1)  # avoid hammering the server (and our own CPU) even more
def p_manager(p_q):
    """Print every message placed on ``p_q``, forever.

    Meant to be the target of a dedicated thread, so that worker threads only
    enqueue their messages instead of calling ``print`` themselves.

    Args:
        p_q: Queue-like object providing ``get()`` and ``task_done()``.
    """
    while True:
        msg = p_q.get()
        try:
            print(msg)
        finally:
            # Acknowledge the item even if printing fails, so that a
            # ``p_q.join()`` elsewhere can never wait on it forever.
            p_q.task_done()
def handle_reqs(work):
    """Download every page listed in ``work`` and collect its warrants.

    For each page number the response is fetched with ``get_req``; every
    ``'key: value'`` string found under a warrant's ``detalhes`` is copied
    onto that warrant dict as a key of its own, and the page's warrants are
    appended to the module-level ``mandados`` list. Progress messages are
    sent to ``p_q``.

    When the list is exhausted and at most three threads are still alive,
    ``True`` is put on ``data_q`` as a completion signal.

    Args:
        work: List of page numbers. It is consumed in place and is empty
            when the function returns.
    """
    total_w = len(work)
    while work:
        page = work.pop(0)  # consuming the list as we go frees memory
        p_q.put('[+] {}/{} - obtendo pag: {}'.format(len(work), total_w, page))
        req = get_req(page)
        for mandado in req['mandados']:
            for detalhe in mandado['detalhes']:
                # Split on the FIRST colon only, so values that themselves
                # contain ':' are kept whole.
                key, sep, value = detalhe.partition(':')
                if not sep:
                    # No 'key: value' shape; skip it instead of crashing
                    # the whole worker with an IndexError.
                    continue
                mandado[key] = value.strip()
        mandados.extend(req['mandados'])
    # If only three threads remain (this one, the main thread and the printer
    # daemon), the scraping is over: send the signal that unblocks the writer.
    if threading.active_count() <= 3:
        data_q.put(True)
# Printing is delegated to one dedicated thread so the workers are not slowed
# down by print calls (system calls); they only enqueue messages on p_q.
p_q = queue.Queue()
t = threading.Thread(target=p_manager, args=(p_q,))
t.daemon = True  # daemon: the program may exit even if this thread still has pending work
t.start()

# handle_reqs posts its end-of-scraping signal here, so the queue must exist.
data_q = queue.Queue()
mandados = []
num_threads = 100

# Deal the page numbers 1..5298 round-robin into one list per thread.
# handle_reqs pops from its list, hence the list() around each slice.
pages = range(1, 5299)
works = [list(pages[i::num_threads]) for i in range(num_threads)]

workers = [threading.Thread(target=handle_reqs, args=(w,)) for w in works]
for worker in workers:
    worker.start()

# Wait for every worker explicitly. Blocking on data_q.get() instead depends
# on a thread-count check inside the workers that is racy: it can hang
# forever, or fire before every worker has finished.
for worker in workers:
    worker.join()

df = pd.DataFrame(
    mandados,
    columns=[
        'nomeParte',
        'orgao',
        'numeroMandado',
        'dataMandado',
        'situacao',
        'Nome do Genitor',
        'Nome da Genitora',
        'Data de nascimento',
        'Carteira de identidade',
    ],
)
df.to_csv('mandados_12_abr_2018_RJ.csv', index=False, sep=';')
print(df.head())
Output:
nomeParte orgao numeroMandado \
0 CLAUDIA FERREIRA VIEIRA TJRJ 358208-13.2011.8.19.0001.0002
1 JEFFERSON FARIAS DE SOUZA ASSUNÇÃO TJRJ 358208-13.2011.8.19.0001.0001
2 LEANDRO MARINHO DA SILVA TJRJ 7725-44.2011.8.19.0036.0001
3 NEWTON SERGIO SERPA FARACO TJRJ 935-15.2012.8.19.0002.0001
4 FABIANO FIGUEIREDO MARQUES TJRJ 4091-11.2012.8.19.0002.0001
dataMandado situacao Nome do Genitor \
0 2012-02-01 Aguardando Cumprimento José Salema Ferreira
1 2012-02-01 Aguardando Cumprimento Sergio Ricardo Souza Assunção
2 2012-02-01 Aguardando Cumprimento Gilberto Marinho Da Silva
3 2012-02-01 Aguardando Cumprimento Carlos Henrique Faraco
4 2012-02-01 Aguardando Cumprimento Carlos Henrique Da Silva Marques
Nome da Genitora Data de nascimento \
0 Izabel Teixeira Ferreira 22/08/1973
1 Elaine Farias 07/01/1992
2 Maria Das Dores Marinho Da Silva NaN
3 Marilda Serpa Faraco 19/07/1979
4 Isabel Cristina De Souza Figueiredo 17/01/1984
Carteira de identidade
0 111860326
1 246563449
2 122077373
3 126036201
4 204365324
Hello, thank you very much. Here it took about 10 minutes. On the pages that failed, this error appeared: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')). At the end it generated a CSV with the columns: sent, message, pager and success.
– Reinaldo Chaves
The warrants field (`mandados`) has 5,298 rows, which is the information that interests me. It is a list of dictionaries.
– Reinaldo Chaves
I guess that when the response (`req`) comes back I would have to select only the warrants information, right? Is that possible?
– Reinaldo Chaves
@Reinaldochaves but what is the information of warrants? Which columns interest you?
– Miguel
nomeParte, orgao, numeroMandado, dataMandado, situacao and detalhes
– Reinaldo Chaves
`detalhes` is a list, so, in order not to stray from the question that was asked, I suggest you ask another question about how to parse those columns to get the ones that interest you. I'm sorry, but I would have to change the answer a lot and it would end up out of context with the question.
– Miguel
I put here: https://answall.com/questions/291062/howto create dataframe-em-pandas-a-partir-de-series-com-dicion%C3%a1rios , @Miguel
– Reinaldo Chaves