I can't properly web scrape a comic strip site with Python

I was writing code that checks the day of each strip/GIF on the page and downloads it if that day matches the current day (in the code I hard-coded 14 only because the site doesn't update on weekends and I needed some way to test). However, two problems occur: the code doesn't download all the comics/GIFs (I noticed a pattern of at most 5 strips downloaded, nothing more), and sometimes it downloads a strip even when its date is earlier than the current date.

from bs4 import BeautifulSoup
import requests
import datetime
import os

os.chdir('C:\\Users\\Rafael\\Desktop\\Scraping\\leninja_imgs')

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]
    #actualday = datetime.datetime.now().day
    actualday = 14
    n = 0

    for day in daysPost:
        if int(day.getText()) == actualday:
            req = requests.get(imgLinks[n])
            img = open(os.path.basename(imgLinks[n]), "wb")

            for chunk in req.iter_content(100000):
                img.write(chunk)    

        else:
            print("Não foi possível baixar a imagem!")
            return False
        n += 1
    return True

get_img()

1 answer

The image links are already in a list, so you can iterate over it and download all of them. Note that your loop returns False as soon as it reaches a post whose day doesn't match, which is why only the first few strips get downloaded:

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import os

os.chdir('./')

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]

    for link in imgLinks:
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            print("Não foi possível baixar a imagem!")  # "Could not download the image!"
            return False

        # Save the image under the file name taken from the end of the URL,
        # closing the file automatically when the block ends
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)
    return True

get_img()
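
If you still want to keep the day filter from your question, the sketch below (my own variant, not part of the answer above) pairs each .day-post element with its image link using zip. It assumes the two selectors return the posts in the same order and in equal numbers, which you should verify on the page, and it skips non-matching posts with continue instead of returning, so the loop no longer stops at the first mismatch.

# Sketch: keep the day check, but skip mismatched posts instead of aborting
from bs4 import BeautifulSoup
import requests
import datetime
import os

def get_imgs_for_day(day=None):
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]

    if day is None:
        day = datetime.datetime.now().day  # default to today's day of the month

    # Assumes daysPost and imgLinks are parallel lists (same order and length)
    for day_tag, link in zip(daysPost, imgLinks):
        if int(day_tag.getText()) != day:
            continue  # skip this post instead of aborting the whole loop
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)

get_imgs_for_day(14)  # 14 stands in for the test value used in the question

With zip there is no manual index counter to keep in sync, and a post whose date doesn't match no longer prevents the later posts from being checked.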
