A: How to count and sum the amount of a certain "factor" in the observations (lines) of a data.frame?

Asked

Viewed 4,047 times

4

Dear all, I would like to count the number of "Sim" ("yes") values, which are factor levels, in each row of a data.frame such as the one below. Does anyone know which arguments I would need to use to do this with "mutate"? I tried several approaches without success. I tried with:

Base = Base %>% mutate(Total_yes = )

If anyone can help, I’d be grateful!

    > dput(Base)
structure(list(ID = structure(1:100, .Label = c("110001", "110002", 
"110003", "110004", "110005", "110006", "110007", "110008", "110009", 
"110010", "110011", "110012", "110013", "110014", "110015", "110018", 
"110020", "110025", "110026", "110028", "110029", "110030", "110032", 
"110033", "110034", "110037", "110040", "110045", "110050", "110060", 
"110070", "110080", "110090", "110092", "110094", "110100", "110110", 
"110120", "110130", "110140", "110143", "110145", "110146", "110147", 
"110148", "110149", "110150", "110155", "110160", "110170", "110175", 
"110180", "120001", "120005", "120010", "120013", "120017", "120020", 
"120025", "120030", "120032", "120033", "120034", "120035", "120038", 
"120039", "120040", "120042", "120043", "120045", "120050", "120060", 
"120070", "120080", "130002", "130006", "130008", "130010", "130014", 
"130020", "130030", "130040", "130050", "130060", "130063", "130068", 
"130070", "130080", "130083", "130090", "130100", "130110", "130115", 
"130120", "130130", "130140", "130150", "130160", "130165", "130170"
), class = "factor"), Col_1 = structure(c(1L, 4L, 4L, 3L, 2L, 
1L, 4L, 4L, 3L, 2L, 1L, 2L, 4L, 3L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 
4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 1L, 
4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 
4L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 2L, 4L, 4L, 2L, 4L, 3L, 4L, 4L, 
4L, 4L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 2L, 4L), .Label = c("NA", 
"Não", "Não disponível", "Sim"), class = "factor"), Col_2 = structure(c(4L, 
4L, 4L, 3L, 4L, 2L, 2L, 2L, 3L, 4L, 4L, 2L, 2L, 3L, 2L, 2L, 4L, 
4L, 2L, 4L, 2L, 4L, 2L, 4L, 3L, 3L, 2L, 4L, 4L, 4L, 3L, 4L, 4L, 
3L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 2L, 2L, 4L, 4L, 2L, 4L, 2L, 4L, 
4L, 2L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 2L, 4L, 4L, 4L, 
4L, 4L, 3L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 2L, 2L, 4L, 4L, 4L, 1L, 
4L, 3L, 2L, 4L, 1L, 4L, 2L, 2L, 1L, 4L, 4L, 2L, 1L, 2L, 2L, 1L, 
2L, 4L, 4L), .Label = c("NA", "Não", "Não disponível", "Sim"), class = "factor"), 
    Col_3 = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L
    ), .Label = c("NA", "Não", "Sim"), class = "factor"), Col_4 = structure(c(3L, 
    3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 
    3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L), .Label = c("Não", "Não disponível", 
    "Sim"), class = "factor"), Col_5 = structure(c(4L, 4L, 4L, 
    2L, 4L, 2L, 2L, 4L, 2L, 4L, 1L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 
    2L, 4L, 2L, 2L, 2L, 4L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 
    3L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 
    4L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 2L, 4L, 
    4L, 4L, 4L, 4L, 2L, 3L, 2L, 4L, 4L, 4L, 4L, 2L, 4L, 2L, 2L, 
    2L, 4L, 4L, 4L, 2L, 4L, 4L, 2L, 4L, 2L, 4L, 2L, 4L, 4L, 2L, 
    4L, 2L, 4L, 4L, 2L, 4L, 4L), .Label = c("NA", "Não", "Não disponível", 
    "Sim"), class = "factor"), Col_6 = structure(c(2L, 2L, 2L, 
    2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 
    2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Não", "Sim"), class = "factor"), 
    Col_7 = structure(c(4L, 4L, 2L, 4L, 2L, 4L, 3L, 4L, 4L, 4L, 
    4L, 2L, 4L, 3L, 2L, 4L, 4L, 4L, 2L, 4L, 4L, 4L, 4L, 1L, 1L, 
    1L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 
    3L, 4L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L, 3L, 4L, 4L, 
    4L, 4L, 4L, 3L, 4L, 3L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 3L, 3L, 2L, 2L, 4L, 4L, 4L, 
    4L, 4L, 2L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L
    ), .Label = c("NA", "Não", "Não disponível", "Sim"), class = "factor"), 
    Col_8 = structure(c(3L, 3L, 1L, 1L, 3L, 3L, 2L, 3L, 1L, 3L, 
    3L, 3L, 1L, 2L, 1L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 
    3L, 1L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 1L, 3L, 1L, 3L, 1L, 3L, 
    2L, 3L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 2L, 1L, 1L, 
    3L, 1L, 3L, 2L, 3L, 2L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 
    1L, 3L, 3L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 
    1L, 3L, 1L, 3L, 2L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L
    ), .Label = c("Não", "Não disponível", "Sim"), class = "factor"), 
    Col_9 = structure(c(2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 
    2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 
    2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 
    3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 
    3L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 
    2L, 2L, 3L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 
    2L, 2L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L
    ), .Label = c("Ignorado", "Não", "Sim"), class = "factor"), 
    Col_10 = structure(c(3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L, 2L, 
    3L, 3L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 
    2L, 2L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 3L, 
    2L, 2L, 1L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 1L, 
    3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 
    3L, 3L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 
    2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 1L, 
    3L), .Label = c("Não", "Não disponível", "Sim"), class = "factor"), 
    Total_Sim = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("ID", 
"Col_1", "Col_2", "Col_3", "Col_4", "Col_5", "Col_6", "Col_7", 
"Col_8", "Col_9", "Col_10", "Total_Sim"), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -100L))

3 answers

8


Using the package dplyr:

# Add a Total_Sim column holding, for each row, the number of cells equal
# to "Sim".
library(dplyr)
Base <- Base %>%
  # `.` stands for the data frame coming through the pipe; since no columns
  # were selected, that is every column of Base. The comparison gives
  # TRUE/FALSE per cell and rowSums() adds up the TRUEs in each row.
  mutate(Total_Sim = rowSums(. == "Sim"))
  • It worked perfectly, Rafael. Grateful! Could you just clarify one thing for me: what does the dot (.) mean inside the rowSums function?

  • 3

    The . refers to the columns to which the function rowSums will be applied. In this case, since no selection was made, it refers to all columns of Base.

5

I think the simplest way is with rowSums. Since comparisons with == produce logical values FALSE/TRUE, which R encodes as 0/1, you just add up the values in each row.

# Flag every cell in columns 2 to 11 that equals "Sim", then total the flags
# row by row.
rowSums(Base[2:11] == "Sim")
 #[1]  8  9  7  4  6  6  3  7  4  8  7  4  6  2  4  7  9  9  2 10  5  7  5  8  4
 #[26]  5  5  7  9 10  5  8  9  0  9  9  4  9  7  8  5  7  3  8  9  6  9  6  8  9
 #[51]  4  7  6  5  8  8  8  9  4 10  5  5  9  8 10  6 10  5  4  6  8  9  9  6  4
 #[76]  7  6  3  5  6  5  7  4  7  8  6  9  3  9  1  9  9  6  8  6  6  8  4  3  9

In addition to simplicity, it has the advantage of being very fast. The function rowSums is implemented in C and avoids R-level loops.

  • very grateful for the explanation and help, Rui!

4

Using base R:

# Count how many cells in columns 2 to 11 (Col_1 to Col_10) of row `x` of
# `Base` hold the value "Sim".
#
# x: index of the row of `Base` to inspect.
# Returns the integer number of "Sim" cells in that row.
func <- function(x) {
  # Summing the logical comparison counts the TRUEs; na.rm = TRUE leaves
  # missing cells out of the count, as length(which(...)) would.
  sum(Base[x, 2:11] == "Sim", na.rm = TRUE)
}

# Apply to every row. seq_len() gives an empty sequence for a zero-row data
# frame (1:0 would give c(1, 0)), and vapply() guarantees an integer vector.
Base$Total_Sim <- vapply(seq_len(nrow(Base)), func, integer(1))
  • Thank you, Willian, I’m still learning about functions. I’ll use your example to train here, rs!

Browse other questions tagged

You are not signed in. Login or sign up in order to post.