상세 컨텐츠

본문 제목

파이썬 웹 스크래핑

Programming

by 신농해태 2021. 9. 18. 17:23

본문

반응형

1. 필요한 프로그램

1) visual studio code에서 open in browser를 검색하여 설치

2) pip install requests

3) pip install beautifulsoup4

4) pip install lxml

5) pip install selenium

6) 구글에서 chromedriver.exe 검색하여 다운로드

 

2. 규칙

1) xpath

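/   루트 노드 또는 바로 아래 자식 노드 선택  ex) /html/body/div
//  문서 전체에서 위치에 상관없이 일치하는 노드 선택  ex) //div
*   모든 요소 선택  ex) //*[@id="login"]
@   속성 선택  ex) //input[@type="text"]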

 

2) 정규식 python regex (regular expression)
import re
.   하나의 문자  ex) ca.e  : care, cafe, case
^  문자열의 시작 ex) ^de : desk, destination
$  문자열의 끝 ex) se$ : case, base

 

 

3. 예제

import csv

import requests
from bs4 import BeautifulSoup

# Market-cap ranking listing; the page number is appended to this prefix.
url = "https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page="

filename = "시가총액 1-200.csv"

# Header row: a tab-separated string split into a list of column names.
title = "N\t종목명\t현재가\t전일비\t등락률\t액면가\t시가총액\t상장주식수\t외국인비율\t거래량\tPER\tROE".split("\t")
print(type(title))

# "utf-8-sig" is used because plain "utf8" produced garbled text in Excel.
# newline="" is what the csv module expects so it can manage line endings.
# The with-block guarantees the file is flushed and closed, even on error.
with open(filename, "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(title)

    # Pages 1 through 4 -- presumably 50 rows per page, matching the
    # "1-200" range in the file name (TODO confirm against the live site).
    for page in range(1, 5):
        res = requests.get(url + str(page))
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")

        table = soup.find("table", attrs={"class": "type_2"})
        data_rows = table.find("tbody").find_all("tr")

        for row in data_rows:
            columns = row.find_all("td")
            # Rows with at most one cell carry no record; skip them.
            if len(columns) <= 1:
                continue
            data = [column.get_text().strip() for column in columns]
            writer.writerow(data)

 

4. 예제 

import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

browser = webdriver.Chrome()

# try/finally ensures the browser is shut down even if scraping fails midway.
try:
    browser.maximize_window()

    url = "https://play.google.com/store/movies/top"
    browser.get(url)

    # Seconds to wait after each scroll before measuring the page again.
    interval = 2

    # Scroll to the bottom repeatedly until the document height stops
    # changing between two consecutive measurements.
    prev_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(interval)

        curr_height = browser.execute_script("return document.body.scrollHeight")
        if curr_height == prev_height:
            break

        prev_height = curr_height

    print("스크롤 완료")

    # Parse the page as rendered by the browser after scrolling.
    soup = BeautifulSoup(browser.page_source, "lxml")

    movies = soup.find_all("div", attrs={"class": "Vpfmgd"})
    print(len(movies))

    for movie in movies:
        title = movie.find("div", attrs={"class": "WsMG1c nnK0zc"}).get_text()

        # attrs must be a dict keyed by attribute name; only entries that
        # have this span (the pre-discount price) are reported.
        original_price = movie.find("span", attrs={"class": "SUZt4c djCuy"})
        if not original_price:
            # Not discounted: skip this movie.
            continue
        original_price = original_price.get_text()

        price = movie.find("span", attrs={"class": "VfPpfd ZdBevf i5DZme"}).get_text()

        # The href is joined onto the site origin when printed below.
        link = movie.find("a", attrs={"class": "JC71ub"})["href"]

        print(f"제목:{title}")
        print(f"할인 전 금액 : {original_price}")
        print(f"할인 후 금액:{price}")
        print("링크:", "https://play.google.com" + link)
        print("-" * 100)
finally:
    browser.quit()

728x90
LIST

관련글 더보기

댓글 영역