본문 바로가기

Data Science/Python

Python으로 하는 EDA(Exploratory Data Analysis)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
import matplotlib as mpl
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
from google.colab import drive
# Mount Google Drive under /content/drive using the google.colab helper imported above.
# NOTE(review): the CSV loaded later is read from /content/sample_data, not from
# the mounted drive — confirm whether this mount is still required.
drive.mount('/content/drive')

 

1. Check data (missing values, descriptive statistics)

# Read vgsales.csv into the working frame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='/content/sample_data/vgsales.csv')

data.head(n=5)

 

import missingno as msno

# Draw missingno's nullity matrix for the whole frame.
# The trailing ';' suppresses the notebook's echo of the returned object.
msno.matrix(data, figsize=(8, 8), color=(0.5, 0.6, 0.8));

 

 

 

 

 

# Number of missing values in each column.
data.isna().sum()

# Drop every row that contains at least one missing value.
data = data.dropna(axis='index', how='any')

# Frame dimensions after the rows were removed.
data.shape

# Descriptive statistics of the numeric columns.
data.describe()

 

# Bucket each release year into its decade.
# right=False makes every bin closed on the left: [1980, 1990), [1990, 2000),
# [2000, 2010), [2010, 2021). With the default right-closed bins, 1980 fell
# outside every bin (NaN) and 1990 / 2000 / 2010 were labelled with the
# previous decade. The final edge is 2021 so that a 2020 release still lands
# inside the last half-open bin.
data['cate_year'] = pd.cut(
    data.Year,
    bins=[1980, 1990, 2000, 2010, 2021],
    right=False,
    labels=['80년대', '90년대', '2000년대', '2010년 이후'],
)

 

2. Game Revenue Trend

%matplotlib inline

x = np.arange(1980, 2019, 1)

#year를 기준으로 count pivot table 생성
datayearpivot = data.pivot_table(index=['Year'], aggfunc='count')
y = datayearpivot['Rank']

data_year_sum = data.groupby('Year').sum()
y2 = data_year_sum['Global_Sales']

fig = plt.figure(figsize=(15, 7))
ax11 = fig.add_subplot(111)
ax11.plot(x, y, label = 'Count')

#twinx는 같은 x축을 쓰는 plot
ax12 = ax11.twinx()
ax12.plot(x, y2, '--', label = 'Sales')

ax11.legend(loc=2, fontsize=15)
ax12.legend(loc=0, fontsize=15)
ax11.grid()
ax11.set_xlabel('Year', fontsize=15)
ax11.set_ylabel('Count', fontsize=15)
ax12.set_ylabel('Sales', fontsize=15)
ax12.set_title('Games Count & Sales per Year', fontsize=20)

ax12.annotate('Sales Maximum', 
            xy=(2008, 678.90), xycoords='data', xytext=(-100, -10), textcoords='offset points',
            arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.5"))

 

# Yearly sales totals for each regional sales column.
# aggfunc='sum' replaces the np.sum callable, which newer pandas versions
# deprecate in favour of the string name; 'Year' is the pivot index, so the
# chronological ordering is an index sort.
national_sales = pd.pivot_table(
    data,
    values=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'],
    index=['Year'],
    aggfunc='sum',
).sort_index(ascending=True)

# One line per regional column over the years.
ax = national_sales.plot(kind = 'line', figsize = (15, 8), fontsize = 18)
ax.set_title('Annual Sales Trends by Country', fontsize=20)
ax.grid()
ax.set_xlabel('Year', fontsize=16)
ax.set_ylabel('Sales', fontsize=16)

 

# NOTE(review): game_group is never assigned in the code shown, so this cell
# raised NameError. A later cell calls game_group[...].reset_index('Genre'),
# which implies a frame indexed by Genre; it is rebuilt here as per-genre
# totals of the numeric columns — confirm against the original notebook.
game_group = data.groupby('Genre').sum(numeric_only=True)

# Keep only the sales columns for the heatmap.
game_group2 = game_group[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
game_group2

plt.figure(figsize=(20,13))

# Annotated heatmap of the selected sales columns, one row per index entry.
sns.heatmap(game_group2, cmap=sns.light_palette(
    "gray", as_cmap=True), annot=True, fmt='.1f')
plt.title("Heatmap")
plt.show()

 

 

 

3. Top Platform & Genre

 

plt.figure(figsize=(15, 5))

# Number of titles per platform, bars ordered from most to least frequent.
# The column must be passed as x=...: in current seaborn the first positional
# parameter of countplot is `data`, so a positional column name collides with
# the data= keyword and raises TypeError.
ax21 = sns.countplot(x='Platform', data=data, order=data['Platform'].value_counts().index, palette='bone')
ax21.set_title('Preferred Platform', fontsize=20)

 

 

plt.figure(figsize=(15, 5))

# Number of titles per genre, bars ordered from most to least frequent.
# The column must be passed as x=...: in current seaborn the first positional
# parameter of countplot is `data`, so a positional column name collides with
# the data= keyword and raises TypeError.
ax21 = sns.countplot(x='Genre', data=data, order=data['Genre'].value_counts().index, palette='Purples')
ax21.set_title('Preferred Genre', fontsize=20)

 

# Five genres with the highest total global sales, as a two-column frame
# (Genre, Global_Sales).
# NOTE(review): the original read this from game_group, which is never
# assigned in the code shown; the totals are computed straight from `data`
# on the assumption that game_group held per-genre sums — confirm.
genre_top5 = (
    data.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .reset_index()
)

# Share of each of the five genres. The title previously said "Publishers",
# which did not match the genre data being plotted.
fig = px.pie(genre_top5, values='Global_Sales', names='Genre', title='Top 5 Genres Global Market Share (%)')
fig.update_traces(textposition='inside', textinfo='percent+label', textfont_size=20)
fig.update_layout(
    autosize=False,
    width=500,
    height=500)

 

4. Top 10 Preferred Games by Genre

 

def _top_games_by_sales(frame, n=10):
    """Return the n titles in frame with the highest summed Global_Sales.

    Rows are grouped by Name, so a title appearing in several rows is
    counted once with its sales added together. numeric_only=True keeps the
    sum away from text / categorical columns, which cannot be summed.
    """
    totals = frame.groupby('Name').sum(numeric_only=True)
    ranked = totals.sort_values(by=['Global_Sales'], ascending=False)
    return ranked.head(n).reset_index()


def _plot_top_games(top_games, title, palette):
    """Draw a Name vs. Global_Sales bar chart on a new 20x7 figure; return its axes."""
    # A fresh figure per chart keeps the three genre charts from being drawn
    # on top of each other when the cells run back to back.
    plt.figure(figsize=(20, 7))
    axes = sns.barplot(x=top_games['Name'], y=top_games['Global_Sales'], palette=palette)
    plt.xticks(fontsize=13, rotation=45)
    plt.title(label=title, fontsize=20)
    axes.set_xlabel('Name', fontsize=16)
    axes.set_ylabel('Sales', fontsize=16)
    return axes


data['Genre'].unique()

# Rows belonging to each of the three genres charted below.
action = data[data['Genre']=='Action']
sports = data[data['Genre']=='Sports']
shooter = data[data['Genre']=='Shooter']

action_group = _top_games_by_sales(action)
ax = _plot_top_games(action_group, 'Top 10 games in action genre', 'magma')

sports_group = _top_games_by_sales(sports)
ax = _plot_top_games(sports_group, 'Top 10 games in sports genre', 'hls')

shooter_group = _top_games_by_sales(shooter)
ax = _plot_top_games(shooter_group, 'Top 10 games in shooter genre', 'PuBu')

'Data Science > Python' 카테고리의 다른 글

정부 API로 JSON 파싱하기  (0) 2021.03.02
지수 표기 숫자로 변환하여 표현하기  (0) 2021.02.23
Python 결측치 처리  (0) 2021.01.14
python으로 카이제곱 검정  (0) 2021.01.06
Python으로 T-test  (0) 2021.01.05