Parsing an HTML page in Python with BeautifulSoup

Question

Parsing an HTML page in Python with BeautifulSoup

Hello everyone. I need to parse data from this web page, which lists works put up for auction. For each lot on the page, I need to extract the fields underlined with a red marker. I tried to do this using BeautifulSoup together with string search using regular expressions. I managed to collect the lot number, the name of the painting, the country, the name of the auction and its date (the leftmost and rightmost blocks). Collecting data from the central blocks is harder: I couldn't find a way to access the strings enclosed in those tags. Here is the current script:

import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

# Listing page fetched by parse(); also the default value of its `url` parameter.
# NOTE(review): `ipp=100` presumably asks the site for 100 lots per page — confirm.
URL_TEMPLATE = "https://www.artprice.com/artist/15079/wassily-kandinsky/lots/pasts?ipp=100"
# Not referenced anywhere in the visible script: the output path at the bottom
# is the hard-coded literal "test.xlsx".
FILE_NAME = "test"

def parse(url=URL_TEMPLATE):
    """Scrape the lot listing at ``url`` into a dict of column lists.

    Args:
        url: Address of the listing page to download. Defaults to
            ``URL_TEMPLATE``.

    Returns:
        A dict mapping column name to a list of values in page order.
        'lot', 'date', 'name', 'auction', 'country' and 'auction_date' are
        filled from the page; 'type1', 'type2', 'width', 'height',
        'estimate' and 'hummerprice' are returned as empty lists because
        nothing in this function extracts them.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
        AttributeError: If an even-indexed ``<p class="visible-xs">`` has
            no ``<strong>`` child.
    """
    result_list = {
        'lot': [], 'name': [], 'date': [], 'type1': [], 'type2': [],
        'width': [], 'height': [], 'estimate': [], 'hummerprice': [],
        'auction_date': [], 'auction': [], 'country': [],
    }

    # Honour the caller's ``url`` (it used to be ignored in favour of the
    # module constant) and bound the wait so a stalled server cannot hang us.
    r = requests.get(url, timeout=30)
    soup = bs(r.text, "html.parser")

    lot_info = soup.find_all('p', class_='hidden-xs')
    date_info = soup.find_all('date')
    names_info = soup.find_all('a', class_='sln_lot_show')
    auction_info = soup.find_all('p', class_='visible-xs')
    # Raw string: "\d", "\s" and "\w" are regex classes, not string escapes.
    # The pattern matches "two digits, space, three word chars, space, four
    # digits"; only every second match (starting with the second) is kept.
    # NOTE(review): presumably each date is rendered twice on the page — confirm.
    auction_date_info = soup.find_all(
        string=re.compile(r'\d\d\s\w\w\w\s\d\d\d\d')
    )[1::2]

    result_list['lot'].extend(tag.text for tag in lot_info)
    result_list['date'].extend(tag.text for tag in date_info)
    result_list['name'].extend(tag.text for tag in names_info)
    # Reuse the list found above instead of re-querying the whole document on
    # every iteration. Even-indexed paragraphs supply the auction name (from
    # their <strong> child); odd-indexed paragraphs supply the country.
    result_list['auction'].extend(tag.strong.string for tag in auction_info[0::2])
    result_list['country'].extend(tag.string for tag in auction_info[1::2])
    result_list['auction_date'].extend(auction_date_info)
    return result_list
# Build the table column by column. Passing the raw dict of lists to
# pd.DataFrame raises ValueError as soon as two lists differ in length, and
# parse() returns several columns (e.g. 'type1', 'estimate') as empty lists.
# Wrapping each list in a Series lets pandas pad the shorter columns with NaN.
df = pd.DataFrame(
    {column: pd.Series(values, dtype=object) for column, values in parse().items()}
)
df.to_excel("test.xlsx")

The arrays type1 (which should correspond to "Print-Multiple" in the screenshot), type2 ("Print in colors"), width (75), height (80), and hummerprice ("not communicated") are left blank.

1

python парсер python-3.x beautiful-soup html

Author: impoz3v, 2020-05-19

Source

1 answer

score 0 · Accepted Answer

Here:

import requests
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

# Listing page fetched by parse(); also the default value of its `url` parameter.
# NOTE(review): `ipp=100` presumably asks the site for 100 lots per page — confirm.
URL_TEMPLATE = "https://www.artprice.com/artist/15079/wassily-kandinsky/lots/pasts?ipp=100"
# Not referenced anywhere in the visible script: the output path at the bottom
# is the hard-coded literal "test.xlsx".
FILE_NAME = "test"

def parse(url=URL_TEMPLATE):
    """Scrape the lot listing at ``url`` into a dict of column lists.

    Args:
        url: Address of the listing page to download. Defaults to
            ``URL_TEMPLATE``.

    Returns:
        A dict mapping each of 'lot', 'name', 'date', 'type1', 'type2',
        'width', 'height', 'auction_date', 'auction' and 'country' to a
        list of values in page order. For the per-lot detail columns
        ('type1', 'type2', 'width', 'height') a value that cannot be found
        is stored as '' so that those four columns stay aligned.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
        AttributeError: If an even-indexed ``<p class="visible-xs">`` has
            no ``<strong>`` child.
    """
    result_list = {
        'lot': [], 'name': [], 'date': [], 'type1': [], 'type2': [],
        'width': [], 'height': [], 'auction_date': [], 'auction': [],
        'country': [],
    }

    # Honour the caller's ``url`` (it used to be ignored in favour of the
    # module constant) and bound the wait so a stalled server cannot hang us.
    r = requests.get(url, timeout=30)
    soup = bs(r.text, "html.parser")

    lot_info = soup.find_all('p', class_='hidden-xs')
    date_info = soup.find_all('date')
    names_info = soup.find_all('a', class_='sln_lot_show')
    auction_info = soup.find_all('p', class_='visible-xs')
    # Raw string: "\d", "\s" and "\w" are regex classes, not string escapes.
    # Only every second match (starting with the second) is kept.
    # NOTE(review): presumably each date is rendered twice on the page — confirm.
    auction_date_info = soup.find_all(
        string=re.compile(r'\d\d\s\w\w\w\s\d\d\d\d')
    )[1::2]
    my = soup.find_all('div', class_='col-xs-8 col-sm-6')

    result_list['lot'].extend(tag.text for tag in lot_info)
    result_list['date'].extend(tag.text for tag in date_info)
    result_list['name'].extend(tag.text for tag in names_info)
    # Reuse the list found above instead of re-querying the whole document on
    # every iteration. Even-indexed paragraphs supply the auction name (from
    # their <strong> child); odd-indexed paragraphs supply the country.
    result_list['auction'].extend(tag.strong.string for tag in auction_info[0::2])
    result_list['country'].extend(tag.string for tag in auction_info[1::2])
    result_list['auction_date'].extend(auction_date_info)

    for block in my:
        all_p = block.find_all('p')
        if len(all_p) < 2:
            # No second paragraph to read from: record blanks rather than
            # raising IndexError, so the four detail columns stay in step.
            for key in ('type1', 'type2', 'width', 'height'):
                result_list[key].append('')
            continue

        details = all_p[1]
        # Text before the first comma goes to 'type1', the next comma-separated
        # piece to 'type2'; a missing second piece becomes ''.
        lest = details.get_text().split(',')
        result_list['type1'].append(lest[0])
        result_list['type2'].append(lest[1] if len(lest) > 1 else '')

        # The last <span> is stripped of every "x" and "cm" and split on
        # whitespace; the first token goes to 'width', the second to 'height'.
        # Explicit length checks replace the former bare ``except:`` blocks.
        spans = details.find_all('span')
        if spans:
            dims = spans[-1].get_text().replace('x', '').replace('cm', '').split()
        else:
            dims = []
        result_list['width'].append(dims[0] if len(dims) > 0 else '')
        result_list['height'].append(dims[1] if len(dims) > 1 else '')
    return result_list
# Build the table column by column. Passing the raw dict of lists to
# pd.DataFrame raises ValueError as soon as two lists differ in length, which
# happens whenever the page yields a different number of matches per selector.
# Wrapping each list in a Series lets pandas pad the shorter columns with NaN.
# (The stray bare `df` expression that used to sit here had no effect.)
df = pd.DataFrame(
    {column: pd.Series(values, dtype=object) for column, values in parse().items()}
)
df.to_excel("test.xlsx")

Sometimes the string is empty, so I wrapped those lookups in try/except.