In [1]:
# Widen the notebook container to 90% so wide outputs fit the Tistory blog window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
In [2]:
import scipy as sp
import numpy as np
import pandas as pd
from scipy import stats
In [3]:
# Load the consumer price index CSV; the file is decoded as cp949.
cpi_korea = pd.read_csv(
    "소비자물가지수_2020100__20230107180655.csv",
    encoding="cp949",
)
In [4]:
# Print the column names, non-null counts and dtypes of the loaded CPI frame.
cpi_korea.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 시점 19 non-null int64
1 전국 19 non-null float64
dtypes: float64(1), int64(1)
memory usage: 432.0 bytes
In [5]:
# The year column is loaded as int64; cast it to str so it can be parsed as a date.
cpi_korea['시점'] = cpi_korea['시점'].astype(str)
In [6]:
# Parse the four-digit year strings into timestamps (each becomes January 1st of that year).
cpi_korea['시점'] = pd.to_datetime(cpi_korea['시점'], format='%Y')
In [7]:
# Inspect the converted timestamp column.
cpi_korea['시점']
Out[7]:
0 2004-01-01
1 2005-01-01
2 2006-01-01
3 2007-01-01
4 2008-01-01
5 2009-01-01
6 2010-01-01
7 2011-01-01
8 2012-01-01
9 2013-01-01
10 2014-01-01
11 2015-01-01
12 2016-01-01
13 2017-01-01
14 2018-01-01
15 2019-01-01
16 2020-01-01
17 2021-01-01
18 2022-01-01
Name: 시점, dtype: datetime64[ns]
In [8]:
# Preview the first rows of the CPI frame after the date conversion.
cpi_korea.head()
Out[8]:
시점 | 전국 | |
---|---|---|
0 | 2004-01-01 | 72.418 |
1 | 2005-01-01 | 74.413 |
2 | 2006-01-01 | 76.081 |
3 | 2007-01-01 | 78.010 |
4 | 2008-01-01 | 81.656 |
In [9]:
import datetime as dt
In [10]:
# Yearly timestamps (January 1st) from 2003 through 2022.
# 'YS' (year start) is used instead of the alias 'AS', which pandas has deprecated.
index = pd.date_range(start='2003-01-01', end='2022-01-01', freq='YS')
index
Out[10]:
DatetimeIndex(['2003-01-01', '2004-01-01', '2005-01-01', '2006-01-01',
'2007-01-01', '2008-01-01', '2009-01-01', '2010-01-01',
'2011-01-01', '2012-01-01', '2013-01-01', '2014-01-01',
'2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01',
'2019-01-01', '2020-01-01', '2021-01-01', '2022-01-01'],
dtype='datetime64[ns]', freq='AS-JAN')
In [11]:
# Homerun Ball price (KRW) and package weight (g), one value per yearly row.
data = {'가격':[440, 700, 700, 1000, 1000, 1200, 1200, 1200, 1200, 1400, 1400, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1700, 1700],
        '중량':[55, 51, 51, 51, 51, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 41, 41, 41]}
# Yearly (January 1st) index from 2004 through 2023; 'YS' replaces the
# deprecated 'AS' alias and produces the same timestamps.
index = pd.date_range(start='2004-01-01', end='2023-01-01', freq='YS')
homerun_ball = pd.DataFrame(data, index=index, columns=['가격','중량'])
# Change log (author's notes):
# 2004: 440 KRW / 55 g  =>  2023: 1700 KRW / 41 g
# 2004: price +260 KRW, weight -4 g
# 2006: price +300 KRW
# 2008: price +200 KRW, weight -5 g
# 2011: price +200 KRW
# 2013: price +100 KRW
# 2019: weight -5 g
# 2021: price +200 KRW
# NOTE(review): the years in this change log do not line up with the rows as
# indexed (e.g. the +260 KRW step sits between the 2004 and 2005 rows, and the
# 1200 -> 1400 step first appears in the 2013 row) - confirm against the source.
In [12]:
# Price per gram: price divided by weight, rounded to 0 decimal places.
homerun_ball['1g당_가격'] = (homerun_ball['가격'] / homerun_ball['중량']).round(0)
homerun_ball
Out[12]:
가격 | 중량 | 1g당_가격 | |
---|---|---|---|
2004-01-01 | 440 | 55 | 8.0 |
2005-01-01 | 700 | 51 | 14.0 |
2006-01-01 | 700 | 51 | 14.0 |
2007-01-01 | 1000 | 51 | 20.0 |
2008-01-01 | 1000 | 51 | 20.0 |
2009-01-01 | 1200 | 46 | 26.0 |
2010-01-01 | 1200 | 46 | 26.0 |
2011-01-01 | 1200 | 46 | 26.0 |
2012-01-01 | 1200 | 46 | 26.0 |
2013-01-01 | 1400 | 46 | 30.0 |
2014-01-01 | 1400 | 46 | 30.0 |
2015-01-01 | 1500 | 46 | 33.0 |
2016-01-01 | 1500 | 46 | 33.0 |
2017-01-01 | 1500 | 46 | 33.0 |
2018-01-01 | 1500 | 46 | 33.0 |
2019-01-01 | 1500 | 46 | 33.0 |
2020-01-01 | 1500 | 46 | 33.0 |
2021-01-01 | 1500 | 41 | 37.0 |
2022-01-01 | 1700 | 41 | 41.0 |
2023-01-01 | 1700 | 41 | 41.0 |
In [13]:
# Copy the DatetimeIndex into a regular '시점' column (the same column name the CPI frame uses for its dates).
homerun_ball['시점'] = homerun_ball.index
In [14]:
# Print the column names, non-null counts and dtypes of the Homerun Ball frame.
homerun_ball.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 2004-01-01 to 2023-01-01
Freq: AS-JAN
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 가격 20 non-null int64
1 중량 20 non-null int64
2 1g당_가격 20 non-null float64
3 시점 20 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 800.0 bytes
In [15]:
# Inspect the yearly price series.
homerun_ball['가격']
Out[15]:
2004-01-01 440
2005-01-01 700
2006-01-01 700
2007-01-01 1000
2008-01-01 1000
2009-01-01 1200
2010-01-01 1200
2011-01-01 1200
2012-01-01 1200
2013-01-01 1400
2014-01-01 1400
2015-01-01 1500
2016-01-01 1500
2017-01-01 1500
2018-01-01 1500
2019-01-01 1500
2020-01-01 1500
2021-01-01 1500
2022-01-01 1700
2023-01-01 1700
Freq: AS-JAN, Name: 가격, dtype: int64
In [16]:
# As-of join of the CPI rows with the Homerun Ball rows on the shared timestamp
# column. The key is named explicitly instead of being inferred from whichever
# columns the two frames happen to have in common.
df_merged = pd.merge_asof(cpi_korea, homerun_ball, on='시점')
In [17]:
# Rename the CPI column by name rather than overwriting all labels positionally,
# so a change in column order can never silently mislabel the data.
df_merged = df_merged.rename(columns={'전국': 'CPI'})
In [18]:
# Two-panel figure: (left) package weight as bars with price as a line on a twin
# y-axis; (right) per-gram price and CPI on separate y-axes, before normalisation.
# The pyplot calls act on the "current" axes, so statement order matters here.
import matplotlib.pyplot as plt
plt.rc('font', family='NanumGothic')
plt.figure(figsize=(20, 8))
# Data series
y1_value = df_merged['중량']
y2_value = df_merged['가격']
y3_value = df_merged['1g당_가격']
y4_value = df_merged['CPI']
# --- Left panel ---
plt.subplot(1, 2, 1)
plt.title('\n 2004~2022 홈런볼 중량 및 가격 변화 \n', fontsize=20, font="NanumGothic")
# Bar chart of the weight column, one bar per row
bar = plt.bar(df_merged['시점'], y1_value, align='center',hatch="//",
edgecolor = 'yellow', label='중량', width=200, color='yellowgreen')
plt.xlabel('DATE')
plt.ylabel('중량')
plt.grid()
plt.ylim(0,60)
plt.legend(loc=2)
# Twin y-axis sharing the same x-axis; the price line is drawn on it
plt.twinx()
plt.plot(df_merged['시점'], y2_value, color='green', label='가격',marker='v')
plt.ylabel('가격')
plt.legend(loc=1)
plt.ylim(0,1800)
# --- Right panel ---
ax1 = plt.subplot(1, 2, 2)
plt.title('\n 2004~2022 홈런볼 1g 당 가격변화(정규화 전) \n', fontsize=20, font="NanumGothic")
ax1.plot(df_merged['시점'], y3_value, color='midnightblue', marker='*', label='1g당 가격')
ax1.set_ylabel('홈런볼 1g당 가격')
ax1.legend(loc=2)
ax2 = ax1.twinx()
ax2.plot(df_merged['시점'], y4_value, color='red', marker='*', label='소비자물가지수')
ax2.set_ylabel('한국 소비자물가지수')
ax2.legend(loc=1)
plt.grid()
plt.show()
In [19]:
# Display the merged CPI / Homerun Ball table.
df_merged
Out[19]:
시점 | CPI | 가격 | 중량 | 1g당_가격 | |
---|---|---|---|---|---|
0 | 2004-01-01 | 72.418 | 440 | 55 | 8.0 |
1 | 2005-01-01 | 74.413 | 700 | 51 | 14.0 |
2 | 2006-01-01 | 76.081 | 700 | 51 | 14.0 |
3 | 2007-01-01 | 78.010 | 1000 | 51 | 20.0 |
4 | 2008-01-01 | 81.656 | 1000 | 51 | 20.0 |
5 | 2009-01-01 | 83.906 | 1200 | 46 | 26.0 |
6 | 2010-01-01 | 86.373 | 1200 | 46 | 26.0 |
7 | 2011-01-01 | 89.850 | 1200 | 46 | 26.0 |
8 | 2012-01-01 | 91.815 | 1200 | 46 | 26.0 |
9 | 2013-01-01 | 93.010 | 1400 | 46 | 30.0 |
10 | 2014-01-01 | 94.196 | 1400 | 46 | 30.0 |
11 | 2015-01-01 | 94.861 | 1500 | 46 | 33.0 |
12 | 2016-01-01 | 95.783 | 1500 | 46 | 33.0 |
13 | 2017-01-01 | 97.645 | 1500 | 46 | 33.0 |
14 | 2018-01-01 | 99.086 | 1500 | 46 | 33.0 |
15 | 2019-01-01 | 99.466 | 1500 | 46 | 33.0 |
16 | 2020-01-01 | 100.000 | 1500 | 46 | 33.0 |
17 | 2021-01-01 | 102.500 | 1500 | 41 | 37.0 |
18 | 2022-01-01 | 107.710 | 1700 | 41 | 41.0 |
In [20]:
# Relative CPI increase from 2004 to 2022, using the table values rounded to one decimal (72.418 -> 72.4, 107.710 -> 107.7).
(107.7 - 72.4)/72.4
Out[20]:
0.4875690607734806
In [21]:
# Ratio of the 2022 CPI to the 2004 CPI (same rounded values).
107.7/72.4
Out[21]:
1.4875690607734806
In [22]:
# Relative increase of the rounded per-gram price from 2004 (8) to 2022 (41).
(41-8)/8
Out[22]:
4.125
In [23]:
# Ratio of the 2022 per-gram price to the 2004 per-gram price (rounded values).
41/8
Out[23]:
5.125
서로 다른 스케일의 두 데이터를 정규화
In [24]:
# Rebase both series so that their 2004 value equals 100: divide each series by
# its 2004-01-01 observation and multiply by 100. This puts the CPI and the
# per-gram price on a common scale.
base_mask = df_merged['시점'] == '2004-01-01'
# Take the scalar explicitly with .iloc[0]; calling float() on a one-element
# Series is deprecated in pandas.
base_point_CPI = float(df_merged.loc[base_mask, 'CPI'].iloc[0])
base_point_HRB = float(df_merged.loc[base_mask, '1g당_가격'].iloc[0])
df_merged_new = df_merged[['시점']].copy()
df_merged_new['CPI(2004=100)'] = df_merged['CPI'] / base_point_CPI * 100
df_merged_new['HRB(2004=100)'] = df_merged['1g당_가격'] / base_point_HRB * 100
In [25]:
# Display the two series rebased to 2004 = 100.
df_merged_new
Out[25]:
시점 | CPI(2004=100) | HRB(2004=100) | |
---|---|---|---|
0 | 2004-01-01 | 100.000000 | 100.0 |
1 | 2005-01-01 | 102.754840 | 175.0 |
2 | 2006-01-01 | 105.058135 | 175.0 |
3 | 2007-01-01 | 107.721837 | 250.0 |
4 | 2008-01-01 | 112.756497 | 250.0 |
5 | 2009-01-01 | 115.863459 | 325.0 |
6 | 2010-01-01 | 119.270071 | 325.0 |
7 | 2011-01-01 | 124.071363 | 325.0 |
8 | 2012-01-01 | 126.784777 | 325.0 |
9 | 2013-01-01 | 128.434919 | 375.0 |
10 | 2014-01-01 | 130.072634 | 375.0 |
11 | 2015-01-01 | 130.990914 | 412.5 |
12 | 2016-01-01 | 132.264078 | 412.5 |
13 | 2017-01-01 | 134.835262 | 412.5 |
14 | 2018-01-01 | 136.825099 | 412.5 |
15 | 2019-01-01 | 137.349830 | 412.5 |
16 | 2020-01-01 | 138.087216 | 412.5 |
17 | 2021-01-01 | 141.539396 | 462.5 |
18 | 2022-01-01 | 148.733740 | 512.5 |
In [26]:
# Two-panel figure: (left) package weight as bars with price as a line on a twin
# y-axis; (right) the Homerun Ball per-gram price index against the CPI index,
# both rebased to 2004 = 100 and drawn on identical 0-600 y-ranges.
import matplotlib.pyplot as plt
plt.rc('font', family='NanumGothic')
plt.figure(figsize=(20, 8))
# Data series
y1_value = df_merged['중량']
y2_value = df_merged['가격']
# y3 is the Homerun Ball index and y4 the CPI index, so each series is plotted
# under its own legend label and axis label below.
y3_value = df_merged_new['HRB(2004=100)']
y4_value = df_merged_new['CPI(2004=100)']
# --- Left panel ---
plt.subplot(1, 2, 1)
plt.title('\n 2004~2022 홈런볼 중량 및 가격 변화 \n', fontsize=20, font="NanumGothic")
# Bar chart of the weight column, one bar per row
bar = plt.bar(df_merged['시점'], y1_value, align='center',hatch="//",
              edgecolor = 'yellow', label='중량', width=200, color='yellowgreen')
plt.xlabel('DATE')
plt.ylabel('중량')
plt.grid()
plt.ylim(0,60)
plt.legend(loc=2)
# Twin y-axis sharing the same x-axis; the price line is drawn on it
plt.twinx()
plt.plot(df_merged['시점'], y2_value, color='green', label='가격',marker='v')
plt.ylabel('가격')
plt.legend(loc=1)
plt.ylim(0,1800)
# --- Right panel ---
ax1 = plt.subplot(1, 2, 2)
plt.title('\n 2004~2022 홈런볼 가격(1g) vs. 소비자물가지수 \n', fontsize=20, font="NanumGothic")
ax1.plot(df_merged['시점'], y3_value, color='midnightblue', marker='*', label='홈런볼 가격지수')
ax1.set_ylabel('홈런볼 1g당 가격')
ax1.legend(loc=2)
ax1.set_ylim(0,600)
ax2 = ax1.twinx()
ax2.plot(df_merged['시점'], y4_value, color='red', marker='*', label='소비자물가지수')
ax2.set_ylabel('한국 소비자물가지수')
# Same y-range as ax1 so the two indices are directly comparable
ax2.set_ylim(0,600)
ax2.legend(loc=1)
plt.grid()
plt.show()
In [27]:
# Single-axes comparison of the CPI index and the Homerun Ball per-gram price
# index, both rebased to 2004 = 100.
# 1. Base style
plt.rc('font', family='NanumGothic')
plt.rcParams['figure.figsize'] = (20, 8)
# 2. Data
x = df_merged_new['시점']
y1 = df_merged_new['CPI(2004=100)']
y2 = df_merged_new['HRB(2004=100)']
# 3. Draw both series on the same axes
fig, ax1 = plt.subplots()
# y1: CPI index
ax1.plot(x, y1, '-s', color='green', markersize=3, linewidth=3, alpha=0.7, label='대한민국 소비자 물가지수')
# y2: Homerun Ball per-gram price index
ax1.plot(x, y2, '-s', color='blue', markersize=3, linewidth=3, alpha=0.7, label='1g당 홈런볼 가격')
ax1.set_xlabel('DATE')
ax1.set_ylabel('INDEX')
ax1.tick_params(axis='both', direction='in')
# No z-order adjustment is made: this figure has a single axes and no twin axis,
# so nothing here depends on axes objects created by other cells.
ax1.patch.set_visible(False)
ax1.legend(loc='upper left')
plt.show()
'빅데이터(Big Data) 이론과 코드 > 6. 통계지식' 카테고리의 다른 글
최대 우도 추정법(Maximum Likelihood Estimation, MLE) (0) | 2025.01.06 |
---|---|
예산과 전문가 없이 데이터로 인포그래픽을 만들기 (1) | 2024.06.25 |
마트 홈런볼과 편의점 홈런볼의 독립표본 t검정 (1) | 2023.01.18 |
[홈런볼로 배우는 데이터 경제] t검정 (1) | 2022.12.26 |
왜도(skewness)와 첨도(kurtosis) (0) | 2022.02.28 |