본문 바로가기

AI/Data Engineer

Web crawling

웹 크롤링(Web crawling)

웹 크롤링은 웹상에 존재하는 정보를 수집하는 작업을 말한다.

 

네이버 영화 크롤링

import re
import requests
from bs4 import BeautifulSoup

# Root URL of the Naver Movie site; the search and review-list URLs used by
# the scraping functions are built by appending a path and query to it.
BASE_URL = "https://movie.naver.com/movie"


def get_page(page_url):
    """Download a page and parse it as HTML.

    Args:
        page_url: URL fetched with an HTTP GET request.

    Returns:
        A ``(soup, page)`` tuple: the parsed ``BeautifulSoup`` document and
        the ``requests`` response it was built from.
    """
    page = requests.get(page_url)
    # Name the parser explicitly. Without it BeautifulSoup uses whichever
    # parser happens to be installed (and emits a warning), so the parsed
    # tree could differ from one machine to another.
    soup = BeautifulSoup(page.text, 'html.parser')

    return soup, page

 

영화 별점 평균 내기

def get_avg_stars(reviews):
    """Return the mean star rating of a collection of reviews.

    Args:
        reviews: Sized collection of mappings, each holding a numeric
            ``'review_star'`` entry.

    Returns:
        The arithmetic mean of the ``'review_star'`` values, or ``0.0`` when
        ``reviews`` is empty.
    """
    if not reviews:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    return sum(review['review_star'] for review in reviews) / len(reviews)

 

영화 제목으로 영화 코드 받기

def get_movie_code(movie_title):
    """Look up the numeric movie code of the first search hit for a title.

    Args:
        movie_title: Title to search for.

    Returns:
        The integer that follows ``code=`` in the link of the first ``<dt>``
        element of the search result page.

    Raises:
        ValueError: If the result page has no such link, or the link carries
            no numeric ``code=`` value.
    """
    search_url = f"{BASE_URL}/search/result.nhn"
    # Hand the query to requests as ``params`` so the title is
    # percent-encoded; characters such as '&' or '#' in a title would
    # otherwise corrupt the query string.
    query = {'query': movie_title, 'section': 'all', 'ie': 'utf8'}

    resp = requests.get(search_url, params=query)
    soup = BeautifulSoup(resp.content, 'html.parser')

    first_hit = soup.find('dt')
    link = first_hit.find('a') if first_hit is not None else None
    href = link.get('href', '') if link is not None else ''

    # Take only the digits after "code=" so any query parameters that
    # follow it in the link cannot break the integer conversion.
    match = re.search(r"code=(\d+)", href)
    if match is None:
        raise ValueError(f"no movie code found for title {movie_title!r}")

    return int(match.group(1))

 

 

 

def get_reviews(movie_code, page_num=1):
    """Fetch one page of the review list for a movie.

    Args:
        movie_code: Movie code placed in the ``sword`` query parameter.
        page_num: Page number placed in the ``page`` query parameter.

    Returns:
        A list with one dict per ``<td class="title">`` cell on the page.
        Each dict has ``'review_text'`` (the stripped text node that follows
        the cell's first ``<br>``) and ``'review_star'`` (the text of the
        cell's first ``<em>`` converted to ``int``).
    """
    response = requests.get(
        f"{BASE_URL}/point/af/list.nhn?st=mcode&sword={movie_code}&target=after&page={page_num}"
    )
    document = BeautifulSoup(response.content, 'html.parser')

    return [
        {
            'review_text': str(cell.select_one('br').next_sibling.strip()),
            'review_star': int(cell.select_one('em').get_text()),
        }
        for cell in document.find_all('td', class_='title')
    ]

 

영화 리뷰를 원하는 리뷰 수 만큼 크롤링

def scrape_by_review_num(movie_title, review_num):
    """Collect up to ``review_num`` reviews for a movie.

    Review pages are fetched one after another, starting at page 1, until
    enough reviews have been gathered or a page comes back empty.

    Args:
        movie_title: Title passed to ``get_movie_code`` to resolve the movie.
        review_num: Maximum number of reviews to return.

    Returns:
        A list of at most ``review_num`` review dicts as produced by
        ``get_reviews``; shorter when an empty page is reached first.
    """
    movie_code = get_movie_code(movie_title)
    reviews = []
    page = 1
    while len(reviews) < review_num:
        page_reviews = get_reviews(movie_code, page)
        if not page_reviews:
            # An empty page adds nothing, so the length check above could
            # never be satisfied; stop instead of requesting pages forever.
            break
        reviews.extend(page_reviews)
        page += 1

    # The last page may have pushed the total past the requested count.
    return reviews[:review_num]

 

영화 리뷰를 원하는 페이지 수 만큼 크롤링

def scrape_by_page_num(movie_title, page_num=10):
    """Collect the reviews found on the first ``page_num`` review pages.

    Args:
        movie_title: Title passed to ``get_movie_code`` to resolve the movie.
        page_num: Number of review pages to fetch.

    Returns:
        A list of review dicts as produced by ``get_reviews``, in page order.
    """
    reviews = []
    movie_code = get_movie_code(movie_title)

    # Page numbers are 1-based (``get_reviews`` defaults to page 1), so
    # iterate 1..page_num rather than 0..page_num-1.
    for page in range(1, page_num + 1):
        reviews.extend(get_reviews(movie_code, page))

    return reviews

 

DB에 저장

def store_by_page_num(movie_title, page_num=10, conn=conn):
    """Scrape reviews for a movie and append them to the ``Review`` table.

    Args:
        movie_title: Title to scrape; also stored in the ``movie_title``
            column of every inserted row.
        page_num: Number of review pages, forwarded to
            ``Part_1.scrape_by_page_num``.
        conn: Database connection handed to ``DataFrame.to_sql`` and
            committed afterwards. Defaults to the module-level ``conn``.
    """
    reviews = Part_1.scrape_by_page_num(movie_title, page_num)

    # Fix the column set up front so that an empty scrape still produces a
    # well-formed (empty) frame instead of failing on a column mismatch.
    pd_reviews = pd.DataFrame(reviews, columns=['review_text', 'review_star'])
    pd_reviews['movie_title'] = movie_title

    # Do not write the DataFrame index as ``id``: it restarts at 0 for every
    # call, so a second append would collide with the table's primary key.
    # NOTE(review): this relies on the database assigning ``id`` itself, as
    # SQLite does for the INTEGER primary key created by init_db -- confirm
    # for the connection type actually in use.
    pd_reviews.to_sql('Review', conn, if_exists='append', index=False)

    conn.commit()


def init_db(conn=conn):
    """Drop any existing ``Review`` table and create a fresh, empty one.

    Args:
        conn: Database connection used to run the statements. Defaults to
            the module-level ``conn``.
    """
    create_table = """CREATE TABLE Review (
                        id INTEGER,
                        review_text TEXT,
                        review_star FLOAT,
                        movie_title VARCHAR(128),
                        PRIMARY KEY (id)
                        );"""

    drop_table_if_exists = "DROP TABLE IF EXISTS Review;"

    cur = conn.cursor()
    try:
        cur.execute(drop_table_if_exists)
        cur.execute(create_table)
    finally:
        # Release the cursor even when one of the statements fails.
        cur.close()

    # Make the schema change durable regardless of the driver's
    # transaction handling for DDL statements.
    conn.commit()

'AI > Data Engineer' 카테고리의 다른 글

database : SQL  (0) 2021.03.14
docker  (0) 2021.03.14