Web Crawling
Web crawling refers to the task of collecting information that exists on the web.
Crawling Naver Movie
import re
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://movie.naver.com/movie"

def get_page(page_url):
    # Fetch a page and return both the parsed soup and the raw response
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup, page
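A minimal usage sketch, assuming the Naver Movie pages are reachable; the URL here is just the base page used throughout this post.

# Sketch: fetch and parse the Naver Movie main page
soup, page = get_page(f"{BASE_URL}/")
print(page.status_code)   # HTTP status of the response
print(soup.title)         # <title> tag of the parsed document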
Averaging the movie star ratings
def get_avg_stars(reviews):
    # Average the 'review_star' values over a list of review dicts
    score = 0
    for review in reviews:
        score += review['review_star']
    avg = score / len(reviews)
    return avg
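A quick sanity check with hand-made review dicts (the values below are invented purely for illustration):

# Invented sample reviews, only to illustrate the averaging
sample_reviews = [
    {'review_text': 'great', 'review_star': 10},
    {'review_text': 'so-so', 'review_star': 6},
    {'review_text': 'bad', 'review_star': 2},
]
print(get_avg_stars(sample_reviews))  # 6.0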
Getting the movie code from a movie title
def get_movie_code(movie_title):
    # Search Naver Movie by title and extract the numeric movie code
    # from the href of the first result (...?code=XXXXXX)
    search_url = f"{BASE_URL}/search/result.nhn?query={movie_title}&section=all&ie=utf8"
    resp = requests.get(search_url)
    soup = BeautifulSoup(resp.content, 'html.parser')
    href = soup.find('dt').find('a')['href']
    movie_code = int(href.split("code=")[1])
    print(movie_code)
    return movie_code
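Assuming the search result markup still puts the first hit inside a dt tag, a call looks like this (the title is only an example; the printed code depends on whatever Naver returns first):

# Example call; prints and returns the numeric code of the first search result
movie_code = get_movie_code('기생충')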
def get_reviews(movie_code, page_num=1):
    # Fetch one page of short reviews for the given movie code
    review_url = f"{BASE_URL}/point/af/list.nhn?st=mcode&sword={movie_code}&target=after&page={page_num}"
    resp = requests.get(review_url)
    soup = BeautifulSoup(resp.content, 'html.parser')
    table_rows = soup.find_all('td', class_='title')
    review_list = []
    for r in table_rows:
        review = dict()
        # The review text sits right after the <br> tag; the star rating is inside <em>
        review['review_text'] = str(r.select_one('br').next_sibling.strip())
        review['review_star'] = int(r.select_one('em').get_text())
        review_list.append(review)
    return review_list
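Each element of the returned list is a dict with review_text and review_star keys, so a single page can be inspected like this (a sketch, reusing the movie_code from the previous example):

# Sketch: print the star rating and text of the first few reviews on page 1
reviews = get_reviews(movie_code, page_num=1)
for rv in reviews[:3]:
    print(rv['review_star'], rv['review_text'])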
Crawling as many movie reviews as requested
def scrape_by_review_num(movie_title, review_num):
    # Keep fetching pages until at least review_num reviews are collected
    movie_code = get_movie_code(movie_title)
    i = 1
    reviews = []
    while len(reviews) < review_num:
        page_reviews = get_reviews(movie_code, i)
        if not page_reviews:
            # No more reviews available; stop instead of looping forever
            break
        reviews.extend(page_reviews)
        i += 1
    # Trim any surplus from the last fetched page
    reviews = reviews[:review_num]
    return reviews
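For example, collecting exactly 25 reviews (the title and count are arbitrary) and averaging their ratings:

# Collect exactly 25 reviews and average their star ratings
reviews = scrape_by_review_num('기생충', 25)
print(len(reviews))            # 25 (or fewer if the movie has fewer reviews)
print(get_avg_stars(reviews))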
Crawling movie reviews for as many pages as requested
def scrape_by_page_num(movie_title, page_num=10):
    # Collect reviews from pages 1..page_num (Naver review pages start at 1)
    reviews = []
    movie_code = get_movie_code(movie_title)
    for i in range(1, page_num + 1):
        reviews.extend(get_reviews(movie_code, i))
    return reviews
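Usage is the same but bounded by pages rather than review count; each review page holds a fixed number of reviews (typically 10), so the default of 10 pages yields up to roughly 100 reviews:

# Collect reviews from the first 10 pages
reviews = scrape_by_page_num('기생충', page_num=10)
print(len(reviews))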
Saving to the DB
import sqlite3
import pandas as pd

# The database file name is just an example; any SQLite file (or an in-memory DB) works
conn = sqlite3.connect('movie_reviews.db')

def store_by_page_num(movie_title, page_num=10, conn=conn):
    # Scrape the reviews and append them to the Review table
    reviews = scrape_by_page_num(movie_title, page_num)
    pd_reviews = pd.DataFrame(reviews)
    pd_reviews['movie_title'] = movie_title
    pd_reviews.columns = ['review_text', 'review_star', 'movie_title']
    # index=False lets SQLite assign the id primary key, so repeated appends do not collide
    pd_reviews.to_sql('Review', conn, if_exists='append', index=False)
    conn.commit()
def init_db(conn=conn):
    # Drop any existing Review table and recreate it from scratch
    create_table = """CREATE TABLE Review (
        id INTEGER,
        review_text TEXT,
        review_star FLOAT,
        movie_title VARCHAR(128),
        PRIMARY KEY (id)
    );"""
    drop_table_if_exists = "DROP TABLE IF EXISTS Review;"
    cur = conn.cursor()
    cur.execute(drop_table_if_exists)
    cur.execute(create_table)
    cur.close()
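Putting the DB part together, a sketch of the full workflow: initialize the table, store one movie's reviews, and read them back with pandas (the movie title and page count are only examples):

# Sketch: initialize the table, store reviews, and read them back
init_db(conn)
store_by_page_num('기생충', page_num=5, conn=conn)
stored = pd.read_sql("SELECT * FROM Review;", conn)
print(stored.head())
print(stored['review_star'].mean())  # should match get_avg_stars on the same reviews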