I can't properly scrape a comic strip site with Python

Question

I can't properly scrape a comic strip site with Python

I was writing a script that checks the day of each strip/GIF on the page and, if that day is the same as the current day (in the code I hard-coded 14 only because the site is not updated on weekends and I needed some way to test), downloads the strip/GIF. However, two errors occur: the code does not download all the comics/GIFs (I noticed a pattern of at most 5 downloaded comics, nothing more), and sometimes, even when a strip's date is earlier than the current date, the code downloads it anyway.

from bs4 import BeautifulSoup
import requests
import datetime
import os

# Switch the working directory so that files opened by relative name below
# are written into this folder.
os.chdir('C:\\Users\\Rafael\\Desktop\\Scraping\\leninja_imgs')

def get_img():
    """Download images from the listing page whose post day equals ``actualday``.

    Fetches https://leninja.com.br/page/2/, collects the ".day-post" elements
    and the ``src`` of every ".le-inner-content img" element, then walks the
    day elements in order. The n-th day element is paired with the n-th image
    link purely by position.

    Returns:
        True if the loop reaches the end of ``daysPost``; False as soon as a
        day element does not match ``actualday`` (the remaining elements are
        not examined).
    """
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]
    # Real "today" lookup, disabled in favour of a fixed day for testing.
    #actualday = datetime.datetime.now().day
    actualday = 14
    # Index into imgLinks; advanced once per processed day element.
    n = 0

    for day in daysPost:
        # The element text is parsed as an integer day-of-month.
        if int(day.getText()) == actualday:
            req = requests.get(imgLinks[n])
            # File is named after the last path component of the image URL.
            # NOTE(review): the handle is never closed explicitly.
            img = open(os.path.basename(imgLinks[n]), "wb")

            for chunk in req.iter_content(100000):
                img.write(chunk)    

        else:
            # The first non-matching day ends the whole function here, so
            # later day elements are never checked.
            print("Não foi possível baixar a imagem!")
            return False
        # NOTE(review): assumes one image per day element, in the same
        # order -- confirm against the page markup.
        n += 1
    return True

# Run the download once when the script is executed; the boolean result is ignored.
get_img()

0

python python-3.x web-scraping beautifulsoup

Author: Matheus Andrade, 2018-09-15

Source

1 answer

score 0 · Answer 1

The image links are already collected in a list, so you can iterate over that list and download them all:

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import datetime
import os

# './' is the current directory, so this leaves the working directory
# unchanged; files opened by relative name are saved where the script is run.
os.chdir('./')

def get_img():
    """Download every image found in the post content of the listing page.

    Fetches https://leninja.com.br/page/2/, collects the ``src`` of each
    ".le-inner-content img" element and saves each image in the current
    working directory, named after the last path component of its URL.

    Returns:
        True if every image was downloaded and written; False as soon as
        one image request fails (the remaining images are not attempted).
    """
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    # <img> tags without a src attribute yield None; skip them instead of
    # letting a single one abort the whole run.
    imgLinks = [
        src
        for src in (i.get("src") for i in soup.select(".le-inner-content img"))
        if src
    ]

    for link in imgLinks:
        try:
            req = requests.get(link)
            # Treat HTTP error statuses (404, 500, ...) as failures so an
            # error page is never written to disk as if it were an image.
            req.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            print("Não foi possível baixar a imagem!")
            return False

        # The context manager guarantees the file is flushed and closed,
        # even if writing a chunk raises.
        with open(os.path.basename(link), "wb") as img_file:
            for chunk in req.iter_content(100000):
                img_file.write(chunk)
    return True

# Run the download once when the script is executed; the boolean result is ignored.
get_img()