Performance of conditionals in R: how to accelerate this process?

Asked

Viewed 49 times

1

I am working with a data.frame of more than 29 million rows, and I need to run some validations, replacing values where necessary. I am currently using ifelse, which is extremely slow. Is there a better way to do this?

Example of the validations:

# Three sequential passes over the column: each ifelse() call replaces one
# label and takes every other row from `tabela$coluna` via the `no` argument.
tabela$coluna <- ifelse(tabela$coluna == 'Teste', "X", tabela$coluna)
tabela$coluna <- ifelse(tabela$coluna == 'Teste2', "Y", tabela$coluna)
tabela$coluna <- ifelse(tabela$coluna == 'Teste3', "Z", tabela$coluna)
  • You can do it all in a single function call: base R has switch and dplyr has case_when. I don’t know whether it will be faster, but the code gets much more organized.

1 answer

2


I’ll propose two alternatives, the fastest being case_when. I’ll also show you another way to write the code using the tidyverse, and fix a bug in your code.

library(microbenchmark)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Benchmark fixture: N rows drawn with replacement from the three labels the
# question wants to recode. The seed is fixed so the draw is repeatable.
N <- 10000
set.seed(1)
# `stringsAsFactors = TRUE` is spelled out because the recorded output of
# initial_approach() (integer codes such as "2"/"3") only shows up when
# `coluna` is a factor, and data.frame() stopped creating factors by default
# in R 4.0. `TRUE` replaces `T`, which is an ordinary, reassignable variable.
df <- data.frame(
  coluna = sample(x = c("Teste", "Teste2", "Teste3"), size = N, replace = TRUE),
  stringsAsFactors = TRUE
)


# Baseline: the question's three ifelse() statements wrapped in a function so
# they can be compared and benchmarked against the alternatives.
#
# tabela: data.frame with a `coluna` column.
# Returns `tabela` with `coluna` rewritten by the three passes.
#
# NOTE(review): the recorded output for this function shows "2"/"3" where
# "Y"/"Z" would be expected. That is consistent with `coluna` being a factor
# whose integer codes leak through the `no` argument of ifelse() -- confirm
# against the column type of the input. The code is intentionally left as the
# question wrote it.
initial_approach <- function(tabela){
  # Each pass rebuilds the whole column; non-matching rows are taken from the
  # current `tabela$coluna`.
  tabela$coluna <- ifelse(tabela$coluna == 'Teste', "X", tabela$coluna)
  tabela$coluna <- ifelse(tabela$coluna == 'Teste2', "Y", tabela$coluna)
  tabela$coluna <- ifelse(tabela$coluna == 'Teste3', "Z", tabela$coluna)
  return(tabela)
}
# Preview the first rows of the baseline result.
initial_approach(df) %>% head()
#>   coluna
#> 1      X
#> 2      3
#> 3      X
#> 4      2
#> 5      X
#> 6      3

# Recode `coluna` with nested base-R ifelse() calls in a single assignment.
#
# tabela: data.frame with a `coluna` column (character or factor).
# Returns `tabela` with "Teste", "Teste2" and "Teste3" in `coluna` replaced by
# "X", "Y" and "Z". Any other value, including NA, is passed through as its
# character label instead of being forced to "Z", which matches what the
# question's original replacements do for unmatched rows.
ifelse_approach <- function(tabela) {
  # Work on the labels rather than on the column itself: ifelse() drops
  # attributes, so a factor used as `no` would come back as integer codes.
  valores <- as.character(tabela$coluna)
  tabela$coluna <- ifelse(
    valores == "Teste", "X",
    ifelse(
      valores == "Teste2", "Y",
      ifelse(valores == "Teste3", "Z", valores)
    )
  )
  tabela
}


# Recode `coluna` with nested dplyr::if_else() calls.
#
# tabela: data.frame with a `coluna` column (character or factor).
# Returns `tabela` with "Teste", "Teste2" and "Teste3" in `coluna` replaced by
# "X", "Y" and "Z". Any other value is passed through as its character label
# instead of being forced to "Z"; rows whose comparison is NA come back as NA.
if_else_approach <- function(tabela) {
  # if_else() requires `true` and `false` to share a type, so the pass-through
  # branch uses the character labels rather than a possible factor.
  valores <- as.character(tabela$coluna)
  tabela$coluna <- if_else(
    valores == "Teste", "X",
    if_else(
      valores == "Teste2", "Y",
      if_else(valores == "Teste3", "Z", valores)
    )
  )
  tabela
}


# Recode `coluna` with a single dplyr::case_when() call.
#
# tabela: data.frame with a `coluna` column (character or factor).
# Returns `tabela` with "Teste", "Teste2" and "Teste3" in `coluna` replaced by
# "X", "Y" and "Z". Any other value, including NA, is passed through as its
# character label instead of being forced to "Z".
case_when_approach <- function(tabela) {
  # Every right-hand side must be character, so the fallback uses the labels.
  valores <- as.character(tabela$coluna)
  tabela$coluna <- case_when(
    valores == "Teste" ~ "X",
    valores == "Teste2" ~ "Y",
    valores == "Teste3" ~ "Z",
    # case_when() treats an NA condition as not matched, so NA rows reach this
    # branch and keep their NA instead of becoming "Z".
    TRUE ~ valores
  )
  tabela
}

# Recode `coluna` with dplyr::case_when() inside a mutate() pipeline.
#
# tabela: data.frame with a `coluna` column (character or factor).
# Returns `tabela` with "Teste", "Teste2" and "Teste3" in `coluna` replaced by
# "X", "Y" and "Z". Any other value, including NA, is passed through as its
# character label instead of being forced to "Z".
case_when_approach_tidy <- function(tabela) {
  tabela %>%
    mutate(
      coluna = case_when(
        coluna == "Teste" ~ "X",
        coluna == "Teste2" ~ "Y",
        coluna == "Teste3" ~ "Z",
        # as.character() keeps the fallback the same type as the other
        # branches when `coluna` is a factor. NA rows land here and stay NA.
        TRUE ~ as.character(coluna)
      )
    )
}


# Check that the ifelse and case_when variants return exactly the same
# data.frame for the fixture.
identical(ifelse_approach(df), case_when_approach(df))
#> [1] TRUE

# Run each variant once on the same fixture and keep the results so they can
# be printed and compared below.
x <- case_when_approach(df)
y <- ifelse_approach(df)
z <- initial_approach(df)
a <- case_when_approach_tidy(df)
b <- if_else_approach(df)


# Print the first rows of every stored result for a visual comparison.
head(x)
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
head(y)
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
head(z)
#>   coluna
#> 1      X
#> 2      3
#> 3      X
#> 4      2
#> 5      X
#> 6      3
head(a)
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
head(b)
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z

# Compare every stored result against the case_when result `x`. In the
# recorded run only the baseline `z` differs: it carries "2"/"3" where the
# other variants produce "Y"/"Z".
identical(x,y)
#> [1] TRUE
identical(x,z)
#> [1] FALSE
identical(x,a)
#> [1] TRUE
identical(x,b)
#> [1] TRUE


# Time every variant on the same fixture, 100 evaluations each.
# `unit = "relative"` is what yields the "Unit: relative" table recorded below.
microbenchmark(
  initial_approach(df),
  ifelse_approach(df),
  case_when_approach(df),
  case_when_approach_tidy(df),
  if_else_approach(df),
  unit = "relative",
  times = 100L
)
#> Unit: relative
#>                         expr      min       lq     mean   median       uq
#>         initial_approach(df) 8.390192 8.536914 8.112638 8.885369 8.312491
#>          ifelse_approach(df) 5.858890 5.764680 5.489827 5.811131 5.587000
#>       case_when_approach(df) 1.339375 1.286289 1.225888 1.248153 1.257498
#>  case_when_approach_tidy(df) 3.593573 3.522934 3.330928 3.523780 3.164132
#>         if_else_approach(df) 1.000000 1.000000 1.000000 1.000000 1.000000
#>        max neval
#>  3.2119030   100
#>  2.3503550   100
#>  0.7355599   100
#>  1.9000484   100
#>  1.0000000   100

Created on 2020-06-08 by the reprex package (v0.3.0)

  • Thank you very much! I will do some tests focused on performance.

  • I think if_else should be one of the fastest options; if that is still not enough, maybe try something from the data.table package.

Browse other questions tagged

You are not signed in. Login or sign up in order to post.