파이썬 웹 스크래핑
1. 필요한 프로그램
1) visual studio code의 확장(Extensions) 탭에서 open in browser를 검색하여 설치
2) pip install requests
3) pip install beautifulsoup4
4) pip install lxml
5) pip install selenium
6) 구글에서 chromedriver.exe 검색하여 다운로드 (설치된 크롬 브라우저와 같은 버전으로 받을 것)
2. 규칙
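1) xpath
/ : 루트(최상위)에서 시작하거나, 바로 아래 자식 요소를 선택 ex) /html/body/div
// : 문서 전체에서 위치에 상관없이 요소를 검색 ex) //div
* : 태그 이름에 상관없이 모든 요소 ex) //*[@id="login"]
@ : 속성 선택 ex) //input[@type="text"]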
2) 정규식 python regex (regular expression)
import re
. 하나의 문자 ex) ca.e : care, cafe, case
^ 문자열의 시작 ex) ^de : desk, destination
$ 문자열의 끝 ex) se$ : case, base
3. 예제
import csv
import requests
from bs4 import BeautifulSoup

# Example: download pages 1-4 of the Naver Finance market-cap table and
# save every data row to a CSV file.
url = "https://finance.naver.com/sise/sise_market_sum.nhn?sosok=0&page="
filename = "시가총액 1-200.csv"

# Header row. split() with no argument splits on any run of whitespace, so the
# header becomes one cell per column name whether the names are separated by
# tabs or by spaces (splitting on "\t" alone leaves a space-separated string
# as a single cell).
title = "N 종목명 현재가 전일비 등락률 액면가 시가총액 상장주식수 외국인비율 거래량 PER ROE".split()
print(type(title))

# encoding="utf-8-sig" instead of "utf8": with plain utf8 the Korean text
# showed up garbled when the file was opened in Excel.
# newline="" is what the csv module expects for files it writes to.
# The with-block guarantees the file is flushed and closed even if a request
# or a parsing step raises part-way through.
with open(filename, "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(title)

    for page in range(1, 5):
        # timeout keeps the script from hanging forever on a stalled connection.
        res = requests.get(url + str(page), timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")

        data_rows = (
            soup.find("table", attrs={"class": "type_2"})
            .find("tbody")
            .find_all("tr")
        )
        for row in data_rows:
            columns = row.find_all("td")
            # Rows with at most one cell carry no stock data
            # (presumably spacer/divider rows) - skip them.
            if len(columns) <= 1:
                continue
            data = [column.get_text().strip() for column in columns]
            writer.writerow(data)
4. 예제
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Example: open the Google Play top-movies page in Chrome, scroll until no
# more content loads, then print title, prices and link of discounted movies.
browser = webdriver.Chrome()
try:
    browser.maximize_window()
    url = "https://play.google.com/store/movies/top"
    browser.get(url)

    # A single window.scrollTo(0, document.body.scrollHeight) only reaches the
    # bottom of what is loaded so far, so keep scrolling until the page height
    # stops changing between two consecutive scrolls.
    interval = 2  # seconds to wait after each scroll before re-measuring
    prev_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(interval)
        curr_height = browser.execute_script("return document.body.scrollHeight")
        if curr_height == prev_height:
            break
        prev_height = curr_height
    print("스크롤 완료")

    # Parse the fully scrolled page with BeautifulSoup.
    soup = BeautifulSoup(browser.page_source, "lxml")
    movies = soup.find_all("div", attrs={"class": "Vpfmgd"})
    print(len(movies))

    for movie in movies:
        title = movie.find("div", attrs={"class": "WsMG1c nnK0zc"}).get_text()

        # The original-price element is looked up by class like every other
        # lookup here; movies without it are treated as not discounted and
        # are skipped.
        original_price = movie.find("span", attrs={"class": "SUZt4c djCuy"})
        if not original_price:
            continue
        original_price = original_price.get_text()

        price = movie.find("span", attrs={"class": "VfPpfd ZdBevf i5DZme"}).get_text()
        # href is concatenated onto the site origin when printed below.
        link = movie.find("a", attrs={"class": "JC71ub"})["href"]

        print(f"제목:{title}")
        print(f"할인 전 금액 : {original_price}")
        print(f"할인 후 금액:{price}")
        print("링크:", "https://play.google.com" + link)
        print("-" * 100)
finally:
    # Always shut the browser down, even when a lookup above raises
    # (e.g. find() returned None because the page markup changed).
    browser.quit()