A - How to display repeat progress ? And processing time?

Asked

Viewed 92 times

2

Hello !

I am importing a database by Chunks and making filters.

I use this code that I took from someone here and adapted to my case:

   arq_grande <- file("cnpj_dados_cadastrais_pj.csv", "r")
    tam_chunk <- 5000
    df1 <- read.csv(arq_grande, nrows = 10, header = T, sep = "#", dec = ".")
    for(i in 1:ncol(df1)){df1[,i] <- df1[,i] %>% iconv(from = 'UTF-8', to = 'latin1')}
    df_filtrado <- df1 %>% filter(codigo_natureza_juridica=="2143") %>%  select(cnpj,everything())
    write.table(df_filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", row.names = F, sep = "#", dec = ".")

    nrow <- 1

    repeat {
      df <- read.csv(arq_grande, header=FALSE, sep="#", col.names = names(df1), nrows = tam_chunk)
      for(i in 1:ncol(df)){df[,i] <- df[,i] %>% iconv(from = 'UTF-8', to = 'latin1')}
      cat("Read", nrow(df), "rows\n")
      if (nrow(df) == 0)
        break
      df_filtrado <- df %>% filter(codigo_natureza_juridica=="2143") %>%  select(cnpj,everything())
      write.table(df_filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", append = T, col.names = F, row.names = F, sep = "#", dec = ".")
    }
    close(arq_grande)

In this code, the function cat("Read", nrow(df), "rows\n") displays the lines being analyzed. Example:

Read 5000 Rows

Read 5000 Rows

Read 5000 Rows

But since the base is too big, it doesn’t help much.


Would anyone know how to help me generate a visualization of progress? At least display the sum of lines that have already been analyzed ? And the processing time at the end ?

To make it something like:

Read 5000 Rows, total 5000

Read 5000 Rows, total 10000

Read 5000 Rows, total 15000

Total processing time: 20seconds

Thank you very much !


EDIT:

dput(head(df1, 20))
structure(list(tipo_de_registro = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), indicador = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE), tipo_atualizacao = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA), cnpj = c(191L, 272L, 353L, 
434L, 515L, 604L, 787L, 868L, 949L, 1082L), identificador_matriz_filial = c(1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), razao_social = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "BANCO DO BRASIL SA", class = "factor"), 
    nome_fantasia = structure(c(2L, 3L, 6L, 9L, 1L, 8L, 7L, 10L, 
    5L, 4L), .Label = c("CAMPOS EST.UNIF.", "DIRECAO GERAL", 
    "MANAUS (AM)", "PORTO ALEGRE (RS)", "PRACA TIRADENTES - CURITIBA PR", 
    "PRESIDENTE VARGAS BELEM (PA)", "RECIFE (PE)", "SALVADOR (BA)", 
    "SANTOS - SANTOS (SP)", "SETOR PUBLICO - FORTALEZA (CE)"), class = "factor"), 
    situacao_cadastral = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L), data_situacao_cadastral = structure(c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L), .Label = "2005-11-03", class = "factor"), 
    motivo_situacao_cadastral = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), nm_cidade_exterior = c(NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), cod_pais = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA), nm_pais = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), codigo_natureza_juridica = c(2038L, 2038L, 2038L, 
    2038L, 2038L, 2038L, 2038L, 2038L, 2038L, 2038L), data_inicio_atividade = structure(c(1L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1966-08-01", 
    "1976-08-13"), class = "factor"), cnae_fiscal = c(6422100L, 
    6421200L, 6422100L, 6422100L, 6421200L, 6422100L, 6422100L, 
    6422100L, 6421200L, 6422100L), descricao_tipo_logradouro = structure(c(5L, 
    6L, 1L, 6L, 4L, 2L, 3L, 3L, 4L, 6L), .Label = c("", "4A AVENIDA", 
    "AVENIDA", "PRACA", "QUADRA", "RUA"), class = "factor"), 
    logradouro = structure(c(8L, 5L, 2L, 1L, 4L, 3L, 6L, 7L, 
    9L, 10L), .Label = c("15 DE NOVEMBRO", "AVEN PRESIDENTE VARGAS", 
    "CENTRO ADMINISTRATIVO DA BAHIA", "DAS 4 JORNADAS", "GUILHERME MOREIRA", 
    "RIO BRANCO", "SANTOS DUMONT", "SAUN QUADRA 5 LOTE B TORRES I, II E III", 
    "TIRADENTES", "URUGUAI"), class = "factor"), numero = structure(c(10L, 
    7L, 5L, 3L, 1L, 9L, 4L, 6L, 8L, 2L), .Label = c("11", "185", 
    "195", "240", "248", "2889", "315", "410", "600", "SN"), class = "factor"), 
    complemento = structure(c(4L, 1L, 1L, 1L, 1L, 1L, 3L, 2L, 
    1L, 1L), .Label = c("", "3 ANDAR", "ANDAR 1", "ANDAR 1 A 16              SALA  101 A 1601          ANDAR 1 A 16              SALA  101 A 1601          ANDAR 1 A 16              SALA  101 A 1601"
    ), class = "factor"), bairro = structure(c(2L, 3L, 5L, 3L, 
    3L, 4L, 6L, 1L, 3L, 3L), .Label = c("ALDEOTA", "ASA NORTE", 
    "CENTRO", "CENTRO ADMINISTRATIVO DA BAHIA", "COMERCIO", "RECIFE"
    ), class = "factor"), cep = c(70040912L, 69005300L, 66010900L, 
    11010908L, 28030002L, 41745002L, 50030310L, 60150165L, 80020100L, 
    90010901L), uf = structure(c(4L, 1L, 5L, 10L, 8L, 2L, 6L, 
    3L, 7L, 9L), .Label = c("AM", "BA", "CE", "DF", "PA", "PE", 
    "PR", "RJ", "RS", "SP"), class = "factor"), codigo_municipio = c(9701L, 
    255L, 427L, 7071L, 5819L, 3849L, 2531L, 1389L, 7535L, 8801L
    ), municipio = structure(c(2L, 6L, 1L, 10L, 3L, 9L, 8L, 5L, 
    4L, 7L), .Label = c("BELEM", "BRASILIA", "CAMPOS DOS GOYTACAZES", 
    "CURITIBA", "FORTALEZA", "MANAUS", "PORTO ALEGRE", "RECIFE", 
    "SALVADOR", "SANTOS"), class = "factor"), ddd_telefone_1 = structure(c(2L, 
    1L, 1L, 1L, 1L, 3L, 4L, 5L, 1L, 1L), .Label = c("", "61  34939002", 
    "71  33628803", "81  34257433", "85  32667805"), class = "factor"), 
    ddd_telefone_2 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 
    4L, 1L, 1L), .Label = c("", "71  33628952", "81  34257233", 
    "85  32667829"), class = "factor"), ddd_fax = structure(c(2L, 
    1L, 1L, 1L, 1L, 3L, 4L, 5L, 1L, 1L), .Label = c("", "61  34931040", 
    "71  32420844", "81  34257520", "85  33660808"), class = "factor"), 
    correio_eletronico = structure(c(5L, 1L, 1L, 1L, 1L, 2L, 
    3L, 4L, 1L, 1L), .Label = c("", "[email protected]", "[email protected]", 
    "[email protected]", "[email protected]"), class = "factor"), 
    qualificacao_responsavel = c(10L, 10L, 10L, 10L, 10L, 10L, 
    10L, 10L, 10L, 10L), capital_social_empresa = c(6e+10, 0, 
    0, 0, 0, 0, 0, 0, 0, 0), porte_empresa = c(5L, 5L, 5L, 5L, 
    5L, 5L, 5L, 5L, 5L, 5L), opcao_pelo_simples = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), data_opcao_pelo_simples = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA), data_exclusao_simples = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA), opcao_pelo_mei = structure(c(2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "N"), class = "factor"), 
    situacao_especial = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), data_situacao_especial = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA), filler = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), fim_registro = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE, FALSE, FALSE, FALSE)), row.names = c(NA, 10L
), class = "data.frame")

2 answers

1


For the first question, just add nrow(df) at each stage of the repeat. For the second question, I followed the same logic as the first question using the function system.time():

totalRows <- 0
totalTempo <- 0
repeat
{
 tempo <- system.time( # medir tempo
   df <- read.csv(arq_grande, header=FALSE, sep="#", col.names = names(df1), nrows = tam_chunk)

   for(i in 1:ncol(df))
   {
     df[, i] <- df[, i] %>% iconv(from = 'UTF-8', to = 'latin1')
   }
 )

 nRow = nrow(df)
 # atualizar rows e tempo
 totalRows <- totalRows + nRow
 totalTempo <- totalTempo + tempo[3]

 cat("Read", nRow, "rows, total", totalRows, "\n")
 cat("Tempo total de processamento:", totalTempo, "secs\r")

 if (nRow == 0)
   break
 df_filtrado <- df %>% filter(codigo_natureza_juridica=="2143") %>%  select(cnpj,everything())
 write.table(df_filtrado, "cnpj_dados_cadastrais_pj_filtrado_coop.csv", append = T, col.names = F, row.names = F, sep = "#", dec = ".")
}

close(arq_grande)

PS: assign a function as a variable (such as nrow <- 1) is not recommending.

  • Thank you very much ! But repeat() is not working. When I say run, there is a "+" sign on the console. I couldn’t find where it is. And you need to put a comma at the end of the df line() ?

  • I just realized that my code is reading only half the lines of the original . csv. You know what it can be ?

  • Progress with total nrow and time working? For the reading question of . csv, you will get more help if you ask a new question.

  • The progress yes ! time no, I removed this function, because when I had repeat() run, it did not run, "+" appears on the console, as if something was missing.

1

The next function reads the file arq_grande, filter and write the file arq_out in pieces of size chunk_size.

library(dplyr)

readCSV <- function(CNJ, arq_grande, arq_out, chunk_size = 5000){
  f <- function(CNJ, arq, out, chunk_size){
    Skip <- 0L
    TotalWritten <- 0L
    #
    nlines <- R.utils::countLines(arq_grande)
    df1 <- read.csv(arq_grande, nrow = chunk_size)
    df1[] <- lapply(df1, iconv, from = 'UTF-8', to = 'latin1')
    col_names <- names(df1)
    df1 <- df1 %>% 
      filter(codigo_natureza_juridica %in% CNJ)
    write.csv(df1, out, row.names = FALSE)
    TotalWritten <- TotalWritten + nrow(df1)
    Skip <- Skip + chunk_size
    #
    passes <- (nlines - chunk_size) %/% chunk_size
    remaining <- (nlines - chunk_size) %% chunk_size
    #
    for(i in seq_len(passes)){
      # ler um bocado do ficheiro e converter para 'latin1'
      df1 <- read.table(arq_grande, skip = Skip, nrow = chunk_size, sep = ",")
      df1[] <- lapply(df1, iconv, from = 'UTF-8', to = 'latin1')
      names(df1) <- col_names
      # filtrar por 'codigo_natureza_juridica'
      df1 <- df1 %>% 
        filter(codigo_natureza_juridica %in% CNJ)
      # escrever as linhas filtradas
      write.table(df1, out, sep = ",", append = TRUE, 
                  row.names = FALSE, col.names = FALSE)
      # atualizar os totais e printá-los
      TotalWritten <- TotalWritten + nrow(df1)
      Skip <- Skip + chunk_size
      msg <- sprintf("Read %d rows, total %d", chunk_size, Skip)
      cat(msg, "\n")
    }
    # se sobrar algum bocado, fazer o mesmo
    if(remaining > 0){
      df1 <- read.table(arq_grande, skip = Skip, nrow = remaining, sep = ",")
      df1[] <- lapply(df1, iconv, from = 'UTF-8', to = 'latin1')
      names(df1) <- col_names
      df1 <- df1 %>% 
        filter(codigo_natureza_juridica %in% CNJ)
      write.table(df1, out, sep = ",", append = TRUE, 
                  row.names = FALSE, col.names = FALSE)
      TotalWritten <- TotalWritten + nrow(df1)
      Skip <- Skip + remaining
      msg <- sprintf("Read %d rows, total %d", remaining, Skip)
      cat(msg, "\n")
    }
    c(Skip, TotalWritten)
  }
  #
  arq <- file(arq_grande, 'r')
  on.exit(close(arq))
  out <- file(arq_out, 'w')
  on.exit(close(out), add = TRUE)
  tempo <- system.time(res <- f(CNJ, arq, out, chunk_size))
  cat("Tempo:", tempo, "\n\n")
  list(Read = res[1], Written = res[2], Time = tempo)
}

res <- readCSV("2143", "teste.csv", "cnpj_dados_cadastrais_pj_filtrado_coop.csv", chunk_size = 7000)

res
#$Read
#[1] 1000001
#
#$Written
#[1] 19616
#
#$Time
#  usuário   sistema decorrido 
#   97.527     1.882    99.388 

R.utils::countLines("cnpj_dados_cadastrais_pj_filtrado_coop.csv")
#[1] 19617
#attr(,"lastLineHasNewline")
#[1] TRUE

The discrepancy between the total written lines given by the function and by R.utils::countLines is due to the latter function also count the newline end of a line with zero characters.

  • Thank you very much for your reply !

  • You’re making the mistake: Error in codigo_natureza_juridica %in% CNJ : &#xA; object 'codigo_natureza_juridica' not found . And I also need to look for two values in this field.

  • @Ricardotheodoro Are you sure the database has a column with that name? To me, with test data gives no error.

  • have yes ! in my sample code worked. It’s a Character column.

  • @Ricardotheodoro Can you please, edit the question with the departure of dput(head(df1, 20)) as it is after the first read.csv?

  • Done ! I edited the question.

Show 1 more comment

Browser other questions tagged

You are not signed in. Login or sign up in order to post.