Performance of conditionals in R: how to accelerate this process?

Question

Performance of conditionals in R: how to accelerate this process?

Asked 5 years, 1 month ago

Viewed 49 times

1

I am working with a data.frame with more than 29 million rows, and I need to run some validations on it, replacing values where necessary. I am currently using ifelse, which is extremely slow. Is there a better way to do this?

Example of the validations:

tabela$coluna <- ifelse(tabela$coluna == 'Teste', "X", tabela$coluna)
tabela$coluna <- ifelse(tabela$coluna == 'Teste2', "Y", tabela$coluna)
tabela$coluna <- ifelse(tabela$coluna == 'Teste3', "Z", tabela$coluna)

You can do it with a single function call: there is switch in base R and case_when in dplyr. I don’t know whether it will be faster, but the code becomes much more organized.

– Bruno

2020/06/08 at 16:50

1 answer

Browse other questions tagged r benchmark

You are not signed in. Login or sign up in order to post.

by Bruno • **248** points · Answer 1 · 2020-06-08T18:55:43+00:00

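I’ll propose two alternatives, if_else and case_when, both much faster than nested ifelse (in the benchmark below if_else comes out slightly ahead of case_when). I’ll also show you another way to write the code with the tidyverse, and fix a bug in your code.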

library(microbenchmark)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Benchmark fixture: N rows drawn with replacement from the three labels.
N <- 10000
set.seed(1)  # fixed seed so repeated runs draw the same sample
# stringsAsFactors = TRUE is spelled out because the results printed further
# down (digits such as "3" and "2" coming out of initial_approach) only occur
# when `coluna` is a factor, and since R 4.0.0 data.frame() no longer converts
# strings to factors by default.
df <- data.frame(
  coluna = sample(x = c("Teste", "Teste2", "Teste3"), size = N, replace = TRUE),
  stringsAsFactors = TRUE
)


# Baseline taken from the question: three consecutive ifelse() passes over
# `coluna`, each pass using the previous pass's result as its fallback value.
# Takes a data.frame with a `coluna` column and returns it with that column
# replaced.
initial_approach <- function(tabela) {
  valores <- tabela$coluna
  valores <- ifelse(valores == "Teste", "X", valores)
  valores <- ifelse(valores == "Teste2", "Y", valores)
  valores <- ifelse(valores == "Teste3", "Z", valores)
  tabela$coluna <- valores
  tabela
}
# Preview the baseline output; in the rows printed below, the entries that did
# not become "X" show up as the digits "3" and "2".
initial_approach(df) %>% head()
#>   coluna
#> 1      X
#> 2      3
#> 3      X
#> 4      2
#> 5      X
#> 6      3

# Single pass built from two ifelse() calls: "Teste" -> "X", "Teste2" -> "Y",
# any other non-missing value -> "Z". Returns `tabela` with `coluna` replaced.
ifelse_approach <- function(tabela) {
  coluna_atual <- tabela$coluna
  # Fallback used wherever the value is not "Teste".
  y_ou_z <- ifelse(coluna_atual == "Teste2", yes = "Y", no = "Z")
  tabela$coluna <- ifelse(coluna_atual == "Teste", yes = "X", no = y_ou_z)
  tabela
}


# Two-level if_else() recode of `coluna`: "Teste" -> "X", "Teste2" -> "Y",
# otherwise "Z". Relies on if_else() being available via library(dplyr).
if_else_approach <- function(tabela) {
  coluna_atual <- tabela$coluna
  # Value used wherever the entry is not "Teste".
  y_ou_z <- if_else(coluna_atual == "Teste2", "Y", "Z")
  tabela$coluna <- if_else(coluna_atual == "Teste", "X", y_ou_z)
  tabela
}


# Recode `coluna` with one case_when() call: "Teste" -> "X", "Teste2" -> "Y",
# and the catch-all TRUE branch -> "Z". Returns `tabela` with the column
# replaced.
case_when_approach <- function(tabela) {
  coluna_atual <- tabela$coluna
  tabela$coluna <- case_when(
    coluna_atual == "Teste" ~ "X",
    coluna_atual == "Teste2" ~ "Y",
    TRUE ~ "Z"
  )
  tabela
}

# Same case_when() recode, applied through mutate() so `coluna` is referenced
# by bare name inside the data frame.
case_when_approach_tidy <- function(tabela) {
  mutate(
    tabela,
    coluna = case_when(
      coluna == "Teste" ~ "X",
      coluna == "Teste2" ~ "Y",
      TRUE ~ "Z"
    )
  )
}


# Sanity check: the nested ifelse() and case_when() versions agree on df.
identical(ifelse_approach(df), case_when_approach(df))
#> [1] TRUE

# Run every approach once on df and keep the results for the comparisons below.
x <- df %>% case_when_approach()
y <- df %>% ifelse_approach()
z <- df %>% initial_approach()
a <- df %>% case_when_approach_tidy()
b <- df %>% if_else_approach()


# Print the first rows of each result. x, y, a and b show the same X/Y/Z
# values; z (from initial_approach) shows "3" and "2" where the others show
# "Z" and "Y".
x %>% head()
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
y %>% head()
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
z %>% head()
#>   coluna
#> 1      X
#> 2      3
#> 3      X
#> 4      2
#> 5      X
#> 6      3
a %>% head()
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z
b %>% head()
#>   coluna
#> 1      X
#> 2      Z
#> 3      X
#> 4      Y
#> 5      X
#> 6      Z

# Compare each result against x (the case_when() output); only z, produced by
# initial_approach, differs.
identical(x,y)
#> [1] TRUE
identical(x,z)
#> [1] FALSE
identical(x,a)
#> [1] TRUE
identical(x,b)
#> [1] TRUE


# Time all five approaches on df with 100 evaluations each; unit = "relative"
# is requested for the reported timings.
microbenchmark(initial_approach(df), ifelse_approach(df), case_when_approach(df),case_when_approach_tidy(df),if_else_approach(df),
               unit="relative", times=100L)
#> Unit: relative
#>                         expr      min       lq     mean   median       uq
#>         initial_approach(df) 8.390192 8.536914 8.112638 8.885369 8.312491
#>          ifelse_approach(df) 5.858890 5.764680 5.489827 5.811131 5.587000
#>       case_when_approach(df) 1.339375 1.286289 1.225888 1.248153 1.257498
#>  case_when_approach_tidy(df) 3.593573 3.522934 3.330928 3.523780 3.164132
#>         if_else_approach(df) 1.000000 1.000000 1.000000 1.000000 1.000000
#>        max neval
#>  3.2119030   100
#>  2.3503550   100
#>  0.7355599   100
#>  1.9000484   100
#>  1.0000000   100

Created on 2020-06-08 by the reprex package (v0.3.0)