I can't properly web scrape a comic strip site with Python

I was writing code that checks the day of each strip/GIF on the page and downloads it if that day matches the current day (in the code I hard-coded 14 only because the site doesn't update on weekends and I needed some way to test). However, two problems occur: the code doesn't download all the comics/GIFs (I noticed a pattern of at most 5 strips downloaded, nothing more), and sometimes it downloads a strip even when its date is earlier than the current date.

from bs4 import BeautifulSoup
import requests
import datetime
import os

os.chdir('C:\\Users\\Rafael\\Desktop\\Scraping\\leninja_imgs')

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]
    #actualday = datetime.datetime.now().day
    actualday = 14
    n = 0

    for day in daysPost:
        if int(day.getText()) == actualday:
            req = requests.get(imgLinks[n])
            img = open(os.path.basename(imgLinks[n]), "wb")

            for chunk in req.iter_content(100000):
                img.write(chunk)    

        else:
            print("Não foi possível baixar a imagem!")
            return False
        n += 1
    return True

get_img()

1 answer

The image links are already in a list, so you can iterate over it and download all of them. Note that your loop returns False as soon as it reaches a post whose day doesn't match, which is why only the first few strips get downloaded:

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import os

os.chdir('./')

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]

    for link in imgLinks:
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            print("Não foi possível baixar a imagem!")  # "Could not download the image!"
            return False

        # Save the image under the file name taken from the end of the URL,
        # closing the file automatically when the block ends
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)
    return True

get_img()
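
If you still want to keep the day filter from your question, the sketch below (my own variant, not part of the answer above) pairs each .day-post element with its image link using zip. It assumes the two selectors return the posts in the same order and in equal numbers, which you should verify on the page, and it skips non-matching posts with continue instead of returning, so the loop no longer stops at the first mismatch.

# Sketch: keep the day check, but skip mismatched posts instead of aborting
from bs4 import BeautifulSoup
import requests
import datetime
import os

def get_imgs_for_day(day=None):
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]

    if day is None:
        day = datetime.datetime.now().day  # default to today's day of the month

    # Assumes daysPost and imgLinks are parallel lists (same order and length)
    for day_tag, link in zip(daysPost, imgLinks):
        if int(day_tag.getText()) != day:
            continue  # skip this post instead of aborting the whole loop
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)

get_imgs_for_day(14)  # 14 stands in for the test value used in the question

With zip there is no manual index counter to keep in sync, and a post whose date doesn't match no longer prevents the later posts from being checked.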
