2
Hello !
I am importing a database by Chunks and making filters.
I use this code that I took from someone here and adapted to my case:
arq_grande <- file("cnpj_dados_cadastrais_pj.csv", "r")
tam_chunk <- 5000
df1 <- read.csv(arq_grande, nrows = 10, header = T, sep = "#", dec = ".")
for(i in 1:ncol(df1)){df1[,i] <- df1[,i] %>% iconv(from = 'UTF-8', to = 'latin1')}
df_filtrado <- df1 %>% filter(codigo_natureza_juridica=="2143") %>% select(cnpj,everything())
write.table(df_filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", row.names = F, sep = "#", dec = ".")
nrow <- 1
repeat {
df <- read.csv(arq_grande, header=FALSE, sep="#", col.names = names(df1), nrows = tam_chunk)
for(i in 1:ncol(df)){df[,i] <- df[,i] %>% iconv(from = 'UTF-8', to = 'latin1')}
cat("Read", nrow(df), "rows\n")
if (nrow(df) == 0)
break
df_filtrado <- df %>% filter(codigo_natureza_juridica=="2143") %>% select(cnpj,everything())
write.table(df_filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", append = T, col.names = F, row.names = F, sep = "#", dec = ".")
}
close(arq_grande)
In this code, the function cat("Read", nrow(df), "rows\n")
displays the lines being analyzed. Example:
Read 5000 Rows
Read 5000 Rows
Read 5000 Rows
But since the base is too big, it doesn’t help much.
Would anyone know how to help me generate a visualization of progress? At least display the sum of lines that have already been analyzed ? And the processing time at the end ?
To make it something like:
Read 5000 Rows, total 5000
Read 5000 Rows, total 10000
Read 5000 Rows, total 15000
Total processing time: 20seconds
Thank you very much !
EDIT:
dput(head(df1, 20))
structure(list(tipo_de_registro = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), indicador = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE), tipo_atualizacao = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), cnpj = c(191L, 272L, 353L,
434L, 515L, 604L, 787L, 868L, 949L, 1082L), identificador_matriz_filial = c(1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), razao_social = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BANCO DO BRASIL SA", class = "factor"),
nome_fantasia = structure(c(2L, 3L, 6L, 9L, 1L, 8L, 7L, 10L,
5L, 4L), .Label = c("CAMPOS EST.UNIF.", "DIRECAO GERAL",
"MANAUS (AM)", "PORTO ALEGRE (RS)", "PRACA TIRADENTES - CURITIBA PR",
"PRESIDENTE VARGAS BELEM (PA)", "RECIFE (PE)", "SALVADOR (BA)",
"SANTOS - SANTOS (SP)", "SETOR PUBLICO - FORTALEZA (CE)"), class = "factor"),
situacao_cadastral = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), data_situacao_cadastral = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "2005-11-03", class = "factor"),
motivo_situacao_cadastral = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), nm_cidade_exterior = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), cod_pais = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), nm_pais = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), codigo_natureza_juridica = c(2038L, 2038L, 2038L,
2038L, 2038L, 2038L, 2038L, 2038L, 2038L, 2038L), data_inicio_atividade = structure(c(1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1966-08-01",
"1976-08-13"), class = "factor"), cnae_fiscal = c(6422100L,
6421200L, 6422100L, 6422100L, 6421200L, 6422100L, 6422100L,
6422100L, 6421200L, 6422100L), descricao_tipo_logradouro = structure(c(5L,
6L, 1L, 6L, 4L, 2L, 3L, 3L, 4L, 6L), .Label = c("", "4A AVENIDA",
"AVENIDA", "PRACA", "QUADRA", "RUA"), class = "factor"),
logradouro = structure(c(8L, 5L, 2L, 1L, 4L, 3L, 6L, 7L,
9L, 10L), .Label = c("15 DE NOVEMBRO", "AVEN PRESIDENTE VARGAS",
"CENTRO ADMINISTRATIVO DA BAHIA", "DAS 4 JORNADAS", "GUILHERME MOREIRA",
"RIO BRANCO", "SANTOS DUMONT", "SAUN QUADRA 5 LOTE B TORRES I, II E III",
"TIRADENTES", "URUGUAI"), class = "factor"), numero = structure(c(10L,
7L, 5L, 3L, 1L, 9L, 4L, 6L, 8L, 2L), .Label = c("11", "185",
"195", "240", "248", "2889", "315", "410", "600", "SN"), class = "factor"),
complemento = structure(c(4L, 1L, 1L, 1L, 1L, 1L, 3L, 2L,
1L, 1L), .Label = c("", "3 ANDAR", "ANDAR 1", "ANDAR 1 A 16 SALA 101 A 1601 ANDAR 1 A 16 SALA 101 A 1601 ANDAR 1 A 16 SALA 101 A 1601"
), class = "factor"), bairro = structure(c(2L, 3L, 5L, 3L,
3L, 4L, 6L, 1L, 3L, 3L), .Label = c("ALDEOTA", "ASA NORTE",
"CENTRO", "CENTRO ADMINISTRATIVO DA BAHIA", "COMERCIO", "RECIFE"
), class = "factor"), cep = c(70040912L, 69005300L, 66010900L,
11010908L, 28030002L, 41745002L, 50030310L, 60150165L, 80020100L,
90010901L), uf = structure(c(4L, 1L, 5L, 10L, 8L, 2L, 6L,
3L, 7L, 9L), .Label = c("AM", "BA", "CE", "DF", "PA", "PE",
"PR", "RJ", "RS", "SP"), class = "factor"), codigo_municipio = c(9701L,
255L, 427L, 7071L, 5819L, 3849L, 2531L, 1389L, 7535L, 8801L
), municipio = structure(c(2L, 6L, 1L, 10L, 3L, 9L, 8L, 5L,
4L, 7L), .Label = c("BELEM", "BRASILIA", "CAMPOS DOS GOYTACAZES",
"CURITIBA", "FORTALEZA", "MANAUS", "PORTO ALEGRE", "RECIFE",
"SALVADOR", "SANTOS"), class = "factor"), ddd_telefone_1 = structure(c(2L,
1L, 1L, 1L, 1L, 3L, 4L, 5L, 1L, 1L), .Label = c("", "61 34939002",
"71 33628803", "81 34257433", "85 32667805"), class = "factor"),
ddd_telefone_2 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L,
4L, 1L, 1L), .Label = c("", "71 33628952", "81 34257233",
"85 32667829"), class = "factor"), ddd_fax = structure(c(2L,
1L, 1L, 1L, 1L, 3L, 4L, 5L, 1L, 1L), .Label = c("", "61 34931040",
"71 32420844", "81 34257520", "85 33660808"), class = "factor"),
correio_eletronico = structure(c(5L, 1L, 1L, 1L, 1L, 2L,
3L, 4L, 1L, 1L), .Label = c("", "[email protected]", "[email protected]",
"[email protected]", "[email protected]"), class = "factor"),
qualificacao_responsavel = c(10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L), capital_social_empresa = c(6e+10, 0,
0, 0, 0, 0, 0, 0, 0, 0), porte_empresa = c(5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L), opcao_pelo_simples = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), data_opcao_pelo_simples = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), data_exclusao_simples = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), opcao_pelo_mei = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "N"), class = "factor"),
situacao_especial = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), data_situacao_especial = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), filler = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), fim_registro = c(FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE)), row.names = c(NA, 10L
), class = "data.frame")
Thank you very much ! But repeat() is not working. When I say run, there is a "+" sign on the console. I couldn’t find where it is. And you need to put a comma at the end of the df line() ?
– RxT
I just realized that my code is reading only half the lines of the original . csv. You know what it can be ?
– RxT
Progress with total nrow and time working? For the reading question of . csv, you will get more help if you ask a new question.
– Willian Vieira
The progress yes ! time no, I removed this function, because when I had repeat() run, it did not run, "+" appears on the console, as if something was missing.
– RxT