3
I would like to know how to ignore the links that do not fit the conditions set in title, data_hora and text; thus managing to continue scraping the site.
Error that occurs when a link does not have or does not follow the conditions:"Error in data.frame(title, time_time, text): Arguments imply differing number of Rows: 1, 0"
Below is the script:
# iniciar bibliotecas
library(XML)
library(xlsx)
#url_base <- "http://www.saocarlosagora.com.br/busca/?q=PDT&page=2"
url_base <- "http://www.saocarlosagora.com.br/busca/?q=bolt&page=koxa"
url_base <- gsub("bolt", "PDT", url_base)
links_saocarlos <- c()
for (i in 1:4){
url1 <- gsub("koxa", i, url_base)
pag<- readLines(url1)
pag<- htmlParse(pag)
pag<- xmlRoot(pag)
links <- xpathSApply(pag, "//div[@class='item']/a", xmlGetAttr, name="href")
links <- paste("http://www.saocarlosagora.com.br/", links, sep ="")
links_saocarlos<- c(links_saocarlos, links)
}
dados <- data.frame()
for(links in links_saocarlos){
pag1<- readLines(links)
pag1<- htmlParse(pag1)
pag1<- xmlRoot(pag1)
titulo <- xpathSApply(pag1, "//div[@class='row-fluid row-margin']/h2", xmlValue)
data_hora <- xpathSApply (pag1, "//div[@class='horarios']", xmlValue)
texto <- xpathSApply(pag1, "//div[@id='HOTWordsTxt']/p", xmlValue)
dados <- rbind(dados, data.frame(titulo, data_hora, texto))
}
agregar <- aggregate(dados$texto,list(dados$titulo,dados$data_hora),paste,collapse=' ')
Mto thanks for the help Daniel.
– Gabriel F