I'll propose a few alternatives, the fastest being `if_else()` and `case_when()`. I'll also show you another way to write the code in the tidyverse, and point out a bug in your code.
library(microbenchmark)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Reproducible benchmark input: N rows drawn at random from three labels.
# Note: before R 4.0, data.frame() stores a string column as a factor by
# default, which matters for how ifelse() treats `coluna` further down.
N <- 10000
set.seed(1)
df <- data.frame(
  coluna = sample(x = c("Teste", "Teste2", "Teste3"), size = N, replace = TRUE)
)
# Original approach from the question: one ifelse() pass per label, each pass
# overwriting `coluna` with the replacement where it matches and with the
# current value otherwise.
#
# tabela: a data frame with a column named `coluna`.
# Returns `tabela` with `coluna` recoded.
#
# Known pitfall (kept on purpose, for comparison): if `coluna` is a factor, the
# first pass copies the factor's integer codes into the unmatched rows, so the
# later passes no longer find "Teste2"/"Teste3" and those rows end up as "2"/"3".
initial_approach <- function(tabela) {
  replacements <- c(Teste = "X", Teste2 = "Y", Teste3 = "Z")
  for (label in names(replacements)) {
    tabela$coluna <- ifelse(
      tabela$coluna == label,
      replacements[[label]],
      tabela$coluna
    )
  }
  tabela
}
# Preview of the original approach. The "2" and "3" entries below are the bug:
# with `coluna` stored as a factor, ifelse() leaks the integer level codes
# instead of producing "Y" and "Z".
head(initial_approach(df))
#> coluna
#> 1 X
#> 2 3
#> 3 X
#> 4 2
#> 5 X
#> 6 3
# Recode `coluna` with nested base ifelse() calls in a single assignment:
# "Teste" -> "X", "Teste2" -> "Y", every other non-missing value -> "Z".
#
# tabela: a data frame with a column named `coluna`.
# Returns `tabela` with `coluna` replaced by the recoded character vector.
ifelse_approach <- function(tabela) {
  values <- tabela$coluna
  # Resolve the inner decision first, then use it as the fallback of the outer.
  fallback <- ifelse(values == "Teste2", yes = "Y", no = "Z")
  tabela$coluna <- ifelse(values == "Teste", yes = "X", no = fallback)
  tabela
}
# Same recoding as the nested ifelse() version, but using dplyr::if_else():
# "Teste" -> "X", "Teste2" -> "Y", anything else -> "Z".
#
# tabela: a data frame with a column named `coluna`.
# Returns `tabela` with `coluna` recoded.
if_else_approach <- function(tabela) {
  values <- tabela$coluna
  tabela$coluna <- if_else(
    values == "Teste",
    "X",
    if_else(values == "Teste2", "Y", "Z")
  )
  tabela
}
# Recode `coluna` with a single dplyr::case_when() call, assigning the result
# back with `$<-`: "Teste" -> "X", "Teste2" -> "Y", and the catch-all
# `TRUE ~ "Z"` for every remaining row.
#
# tabela: a data frame with a column named `coluna`.
# Returns `tabela` with `coluna` recoded.
case_when_approach <- function(tabela) {
  values <- tabela$coluna
  tabela$coluna <- case_when(
    values == "Teste" ~ "X",
    values == "Teste2" ~ "Y",
    TRUE ~ "Z"
  )
  tabela
}
# Tidyverse flavour of the case_when() recoding: the column is rewritten inside
# dplyr::mutate() rather than via `$<-`.
#
# tabela: a data frame with a column named `coluna`.
# Returns the data frame produced by mutate(), with `coluna` recoded
# ("Teste" -> "X", "Teste2" -> "Y", everything else -> "Z").
case_when_approach_tidy <- function(tabela) {
  mutate(
    tabela,
    coluna = case_when(
      coluna == "Teste" ~ "X",
      coluna == "Teste2" ~ "Y",
      TRUE ~ "Z"
    )
  )
}
# Sanity check: the nested ifelse() and case_when() versions agree on `df`.
identical(ifelse_approach(df), case_when_approach(df))
#> [1] TRUE
# Keep the result of every approach for the side-by-side comparison below.
x <- case_when_approach(df)
y <- ifelse_approach(df)
z <- initial_approach(df)
a <- case_when_approach_tidy(df)
b <- if_else_approach(df)
# First rows of each result. x = case_when, y = nested ifelse,
# z = initial approach, a = case_when inside mutate, b = if_else.
head(x)
#> coluna
#> 1 X
#> 2 Z
#> 3 X
#> 4 Y
#> 5 X
#> 6 Z
head(y)
#> coluna
#> 1 X
#> 2 Z
#> 3 X
#> 4 Y
#> 5 X
#> 6 Z
# z is the odd one out: "3"/"2" appear where the others show "Z"/"Y".
head(z)
#> coluna
#> 1 X
#> 2 3
#> 3 X
#> 4 2
#> 5 X
#> 6 3
head(a)
#> coluna
#> 1 X
#> 2 Z
#> 3 X
#> 4 Y
#> 5 X
#> 6 Z
head(b)
#> coluna
#> 1 X
#> 2 Z
#> 3 X
#> 4 Y
#> 5 X
#> 6 Z
# Compare every result against x (the case_when() version).
# Nested ifelse() matches.
identical(x,y)
#> [1] TRUE
# The initial approach does NOT match: it is the one that yields "2"/"3".
identical(x,z)
#> [1] FALSE
# case_when() inside mutate() matches.
identical(x,a)
#> [1] TRUE
# if_else() matches.
identical(x,b)
#> [1] TRUE
# Time all five approaches on the same data frame, 100 runs each.
# unit = "relative" reports every timing as a multiple of the fastest
# expression (the row of 1.0 values in the output below).
microbenchmark(
  initial_approach(df),
  ifelse_approach(df),
  case_when_approach(df),
  case_when_approach_tidy(df),
  if_else_approach(df),
  unit = "relative",
  times = 100L
)
#> Unit: relative
#> expr min lq mean median uq
#> initial_approach(df) 8.390192 8.536914 8.112638 8.885369 8.312491
#> ifelse_approach(df) 5.858890 5.764680 5.489827 5.811131 5.587000
#> case_when_approach(df) 1.339375 1.286289 1.225888 1.248153 1.257498
#> case_when_approach_tidy(df) 3.593573 3.522934 3.330928 3.523780 3.164132
#> if_else_approach(df) 1.000000 1.000000 1.000000 1.000000 1.000000
#> max neval
#> 3.2119030 100
#> 2.3503550 100
#> 0.7355599 100
#> 1.9000484 100
#> 1.0000000 100
Created on 2020-06-08 by the reprex package (v0.3.0)
You can do this with a single function call: there is base R's `switch()` and dplyr's `case_when()`. I don't know whether it is any faster, but the code becomes much more organized.
– Bruno