As @Marcusnunes commented, this extraction can be done with the rvest package:
library(rvest)

# Address of the page that holds the standings tables.
URL <- "http://globoesporte.globo.com/futebol/brasileirao-serie-a/"

# Fetch the page and convert every <table> element into a data frame.
tabelas <- html_table(read_html(URL))

# First table: discard its third column, then label the two that remain.
tabela1 <- tabelas[[1]]
tabela1[[3L]] <- NULL
tabela1 <- setNames(tabela1, c("Posição","Time"))

# Second table: discard the "ÚLT. JOGOS" column.
tabela2 <- tabelas[[2]]
tabela2[["ÚLT. JOGOS"]] <- NULL

# Put both tables side by side in a single data frame.
df.data <- dplyr::bind_cols(tabela1, tabela2)

# Strip the three upper-case letters that trail each value in `Time`.
df.data[["Time"]] <- gsub("[A-Z]{3}$", "", df.data[["Time"]])
If you are restricted to base R only, you can write your own html_table2 function:
# Read the raw HTML of the page, one element per line, declared as UTF-8.
pagina <- readLines(con = URL, encoding = "UTF-8")
#' Extract the tables of an HTML page using only base R string functions.
#'
#' The page source is split on opening `<table>` tags, each table on `<tr>`
#' tags and each row on `<td>` tags. Remaining markup and whitespace runs are
#' stripped from the cells.
#'
#' @param pagina Character vector holding the lines of the HTML source.
#' @return A list with one data frame of character columns per `<table>` tag
#'   found in `pagina`. The first two rows and the first and last columns of
#'   each split table are dropped. A table without rows yields an empty
#'   data frame.
html_table2 <- function(pagina) {
  tabelas <- strsplit(paste(pagina, collapse = "\n"), '<table.*?>')[[1]]
  # Drop the junk that precedes the first table.
  tabelas <- tabelas[-1]

  # Drop whatever follows the closing '</table>' tag.
  tabelas_limpas <- lapply(tabelas, function(x) sub('</table>.*', '', x))

  linhas <- lapply(tabelas_limpas, function(x) {
    linhas <- strsplit(x, '<tr.*?>')[[1]]
    sub('</tr.*?>', '', linhas)
  })

  lapply(linhas, function(x) {
    colunas <- lapply(x, function(y) strsplit(y, '<td.*?>')[[1]])
    matriz <- do.call(rbind, colunas)
    if (is.null(matriz)) {
      # Table without any row: nothing to clean or subset.
      return(data.frame())
    }

    limpas <- gsub('</?.+?>', '', matriz) # Remove HTML tags
    limpas <- gsub('\\n', '', limpas)
    # POSIX class instead of '[\\s]': inside a bracket expression the default
    # regex engine reads the backslash literally, so '[\\s]' matched the
    # characters '\' and 's' rather than whitespace.
    limpas <- gsub('[[:space:]]{2,}', '', limpas)

    # Remove the first two rows and the first and last columns.
    # drop = FALSE keeps the matrix shape when a single row or column is left.
    limpas <- limpas[-(1:2), -c(1, ncol(limpas)), drop = FALSE]
    as.data.frame(limpas, stringsAsFactors = FALSE)
  })
}
# Parse every table of the downloaded page with the base R helper.
tabelas <- html_table2(pagina)

# First table: label its two columns.
tabela1 <- tabelas[[1]]
names(tabela1) <- c("Posição","Time")

# Second table: label its nine columns.
tabela2 <- tabelas[[2]]
names(tabela2) <- c('P', 'J', 'V', 'E', 'D', 'GP', 'GC', 'SG', '%')

# cbind() joins the two data frames side by side without requiring dplyr,
# which keeps this alternative free of non-base packages.
df.data <- cbind(tabela1, tabela2)

# Strip the three upper-case letters that trail each value in `Time`.
df.data$Time <- gsub("[A-Z]{3}$","",df.data$Time)
The problem is not clear to me. I understand that the XML package cannot be used, but does this mean that no other package can be used either? That is, should the problem be solved with base R commands only? Or is it possible to install other web-scraping packages, such as rvest, for example?
– Marcus Nunes
Briefly: I need to extract a table from a URL in RStudio. I can only use packages already available in R, and I cannot use XML.
– Juny