[Python] 아마존 상품검색 페이지의 스폰서 광고 상품 ASIN 수집하기

아마존 상품 검색 페이지 중 오른쪽 스폰서 광고 상품의 ASIN 정보를 수집한다.

# _*_ coding: utf-8 _*_

import http.client
import re
import sys
import time
import urllib
import urllib.parse
import urllib.request

import html5lib
from bs4 import BeautifulSoup

# Search phrase submitted to Amazon.
keywords = 'fishing rod'
# Browser-like User-Agent sent with the search request.
headers = {'User-Agent': ' Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'}
# Filled by main() with the unique ASINs, in first-seen order.
asins = []

SEARCH_URL = 'https://www.amazon.com/s/ref=sr_ex_n_0'
# Review links inside the right-hand sponsored carousel of the result page.
SPONSORED_REVIEW_LINKS = "#desktop-rhs-carousels_click_within_right a[href*='product-reviews']"
# The ASIN is the path segment that follows "/product-reviews/".
ASIN_PATTERN = re.compile(r'/product-reviews/(.+?)/.+')


def extract_asin(href):
    """Return the ASIN embedded in a product-reviews link.

    Args:
        href: The link target to inspect; may be None or empty.

    Returns:
        The path segment following ``/product-reviews/``, or None when
        *href* is empty or does not match the expected layout.
    """
    if not href:
        return None
    match = ASIN_PATTERN.search(href)
    return match.group(1) if match else None


def unique_asins(hrefs):
    """Return the distinct ASINs found in *hrefs*, in first-seen order.

    Links that carry no ASIN are skipped so that one unexpected link does
    not discard the ASINs found in the others.
    """
    found = (extract_asin(href) for href in hrefs)
    # dict keys keep insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(asin for asin in found if asin is not None))


def fetch_search_page(search_keywords, page=1, timeout=30):
    """Request one Amazon search result page and return its raw bytes.

    Args:
        search_keywords: The search phrase.
        page: Result page number to request.
        timeout: Timeout in seconds passed to ``urlopen``.

    Raises:
        OSError: On connection failures, HTTP error statuses and timeouts
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        http.client.HTTPException: On a malformed or truncated response.
    """
    values = {
        'keywords': search_keywords,
        'page': page,
        'ie': 'UTF8',
        'qid': int(time.time()),
    }
    # Supplying a request body makes urllib send the request as a POST.
    data = urllib.parse.urlencode(values).encode('ascii')
    request = urllib.request.Request(SEARCH_URL, data, headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


def collect_sponsored_asins(html):
    """Parse *html* and return the ASINs of the sponsored products."""
    soup = BeautifulSoup(html, 'html5lib')
    links = soup.select(SPONSORED_REVIEW_LINKS)
    return unique_asins(tag.get('href') for tag in links)


def main():
    """Fetch the search page for ``keywords`` and print each sponsored ASIN."""
    try:
        the_page = fetch_search_page(keywords)
    except (OSError, http.client.HTTPException) as error:
        # Stay best-effort (no traceback), but say why nothing was printed.
        print('Failed to fetch the search page: {}'.format(error), file=sys.stderr)
        return

    asins.extend(collect_sponsored_asins(the_page))
    for asin in asins:
        print(asin)


if __name__ == '__main__':
    main()

BeautifulSoup.select()와 정규식을 사용해 ASIN 정보를 수집하고 출력한다.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.