Python 기본문법 정리
and Pandas
# DateTime Index를 생성한다
import pandas as pd
one_year = pd.date_range('2018-01-01',periods=365)
data = [i for i in range(len(one_year))]
data = pd.Series(data, index=one_year)
businessday = data.resample('B')
(365 - len(businessday)) / 2
52.0
# Tips
data.resample('M').mean()
2018-01-31 15.0 2018-02-28 44.5 2018-03-31 74.0 2018-04-30 104.5 2018-05-31 135.0 2018-06-30 165.5 2018-07-31 196.0 2018-08-31 227.0 2018-09-30 257.5 2018-10-31 288.0 2018-11-30 318.5 2018-12-31 349.0 Freq: M, dtype: float64
# Tips
data.resample('M').ohlc()
open | high | low | close | |
---|---|---|---|---|
2018-01-31 | 0 | 30 | 0 | 30 |
2018-02-28 | 31 | 58 | 31 | 58 |
2018-03-31 | 59 | 89 | 59 | 89 |
2018-04-30 | 90 | 119 | 90 | 119 |
2018-05-31 | 120 | 150 | 120 | 150 |
2018-06-30 | 151 | 180 | 151 | 180 |
2018-07-31 | 181 | 211 | 181 | 211 |
2018-08-31 | 212 | 242 | 212 | 242 |
2018-09-30 | 243 | 272 | 243 | 272 |
2018-10-31 | 273 | 303 | 273 | 303 |
2018-11-30 | 304 | 333 | 304 | 333 |
2018-12-31 | 334 | 364 | 334 | 364 |
pd.date_range('1/1/2011', periods=5, freq='H')
DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 01:00:00', '2011-01-01 02:00:00', '2011-01-01 03:00:00', '2011-01-01 04:00:00'], dtype='datetime64[ns]', freq='H')
pd.date_range('1/1/2011', periods=5, freq='min')
DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 00:01:00', '2011-01-01 00:02:00', '2011-01-01 00:03:00', '2011-01-01 00:04:00'], dtype='datetime64[ns]', freq='T')
pd.date_range('1/1/2011', periods=5, freq='s')
DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 00:00:01', '2011-01-01 00:00:02', '2011-01-01 00:00:03', '2011-01-01 00:00:04'], dtype='datetime64[ns]', freq='S')
pd.date_range('1/1/2011', periods=5, freq='ms')
DatetimeIndex([ '2011-01-01 00:00:00', '2011-01-01 00:00:00.001000', '2011-01-01 00:00:00.002000', '2011-01-01 00:00:00.003000', '2011-01-01 00:00:00.004000'], dtype='datetime64[ns]', freq='L')
pd.date_range('1/1/2011', periods=5, freq='1ms')
DatetimeIndex([ '2011-01-01 00:00:00', '2011-01-01 00:00:00.001000', '2011-01-01 00:00:00.002000', '2011-01-01 00:00:00.003000', '2011-01-01 00:00:00.004000'], dtype='datetime64[ns]', freq='L')
import numpy as np
data = np.random.rand(110)
date = pd.date_range('2018-06-06', periods=110, freq='ms')
pd.Series(data, index=date)[:20]
2018-06-06 00:00:00.000 0.724359 2018-06-06 00:00:00.001 0.426986 2018-06-06 00:00:00.002 0.908999 2018-06-06 00:00:00.003 0.247796 2018-06-06 00:00:00.004 0.065260 2018-06-06 00:00:00.005 0.959609 2018-06-06 00:00:00.006 0.354474 2018-06-06 00:00:00.007 0.635283 2018-06-06 00:00:00.008 0.254580 2018-06-06 00:00:00.009 0.426275 2018-06-06 00:00:00.010 0.169618 2018-06-06 00:00:00.011 0.088882 2018-06-06 00:00:00.012 0.258088 2018-06-06 00:00:00.013 0.659739 2018-06-06 00:00:00.014 0.474797 2018-06-06 00:00:00.015 0.909689 2018-06-06 00:00:00.016 0.980873 2018-06-06 00:00:00.017 0.167155 2018-06-06 00:00:00.018 0.374242 2018-06-06 00:00:00.019 0.057846 Freq: L, dtype: float64
import pandas as pd
date_nan = pd.date_range('2018-06-01', periods=5, freq='h')
date_nan
DatetimeIndex(['2018-06-01 00:00:00', '2018-06-01 01:00:00', '2018-06-01 02:00:00', '2018-06-01 03:00:00', '2018-06-01 04:00:00'], dtype='datetime64[ns]', freq='H')
date_utc = pd.date_range('2018-06-01', periods=5, freq='h', tz='UTC')
date_utc
DatetimeIndex(['2018-06-01 00:00:00+00:00', '2018-06-01 01:00:00+00:00', '2018-06-01 02:00:00+00:00', '2018-06-01 03:00:00+00:00', '2018-06-01 04:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='H')
date_kor = pd.date_range(start='2018-01-01', periods=5, tz='Asia/Tokyo')
date_kor
DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', '2018-01-05 00:00:00+09:00'], dtype='datetime64[ns, Asia/Tokyo]', freq='D')
date_kor.tz_convert('UTC')
DatetimeIndex(['2017-12-31 15:00:00+00:00', '2018-01-01 15:00:00+00:00', '2018-01-02 15:00:00+00:00', '2018-01-03 15:00:00+00:00', '2018-01-04 15:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='D')
date_utc.tz_convert('Asia/Tokyo')
DatetimeIndex(['2018-06-01 09:00:00+09:00', '2018-06-01 10:00:00+09:00', '2018-06-01 11:00:00+09:00', '2018-06-01 12:00:00+09:00', '2018-06-01 13:00:00+09:00'], dtype='datetime64[ns, Asia/Tokyo]', freq='H')
# 처음 생성할때 UTC 정보가 없으면 수정이 어렵다
# date_nan.tz_convert(9)
# date_nan.tz_convert('UTC')
data_nine = date_utc.tz_convert(9)
print(data_nine.tzinfo)
data_nine[:10]
pytz.FixedOffset(0)
DatetimeIndex(['2018-06-01 00:00:09+00:00:09', '2018-06-01 01:00:09+00:00:09', '2018-06-01 02:00:09+00:00:09', '2018-06-01 03:00:09+00:00:09', '2018-06-01 04:00:09+00:00:09'], dtype='datetime64[ns, pytz.FixedOffset(0)]', freq='H')
# 목록보기
# List of tz database time zones 을 위키에서 찾기
from pandas_datareader import get_data_yahoo
kosdaq = get_data_yahoo('031510.KQ', '2018-01-01').Close
%matplotlib inline
kosdaq.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f44ee46c6d8>
import pandas as pd
pd.DataFrame(kosdaq)
kosdaq.index = pd.DatetimeIndex(kosdaq.index)
kosdaq.tail(10)
Date 2018-01-19 6400 2018-01-22 6350 2018-01-23 6260 2018-01-24 6090 2018-01-25 6270 2018-01-26 6230 2018-01-29 6090 2018-01-30 5870 2018-01-31 5640 2018-06-08 5050 Name: Close, dtype: int64
# 현재 Yahoo finance API가 중간 자료가 빈 구간이 존재한다
# 2018-02-01 부터 2018-06-03 4달 사이의 자료가 존재하지 않음
# 이를 고치려면 별도 Naver 크롤링 등 별도 경로를 활용하여 보완해야 한다
Q. voice, Signal 분석에서 중요한 데이터는?
A. Raw 데이터 로 해당 데이터를 직접 표준화/ 정규화 한다
Q. 금융분석에서 중요한 데이터는?
A. Raw 데이터 (X)
A. 이익/ 손실여부의 판단
때문에 해당 데이터를 변화율로 전환한 뒤 이를 Log 수익률 을 사용하면 기간별 분석에 용이하다
import pandas as pd
prices = pd.read_html("https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20180103&end=20180530")[0]
prices = prices[::-1]
prices.reset_index(inplace=True, drop=True)
prices.tail()
Date | Open* | High | Low | Close** | Volume | Market Cap | |
---|---|---|---|---|---|---|---|
143 | May 26, 2018 | 7486.48 | 7595.16 | 7349.12 | 7355.88 | 4051540000 | 127682000000 |
144 | May 27, 2018 | 7362.08 | 7381.74 | 7270.96 | 7368.22 | 4056520000 | 125575000000 |
145 | May 28, 2018 | 7371.31 | 7419.05 | 7100.89 | 7135.99 | 5040600000 | 125748000000 |
146 | May 29, 2018 | 7129.46 | 7526.42 | 7090.68 | 7472.59 | 5662660000 | 121636000000 |
147 | May 30, 2018 | 7469.73 | 7573.77 | 7313.60 | 7406.52 | 4922540000 | 127454000000 |
prices.index = pd.DatetimeIndex(prices.Date)
prices.columns = [ col.replace('*', '') for col in prices.columns]
del prices['Date']
prices.head(3)
Open | High | Low | Close | Volume | Market Cap | |
---|---|---|---|---|---|---|
Date | ||||||
2018-01-03 | 14978.2 | 15572.8 | 14844.5 | 15201.0 | 16871900000 | 251312000000 |
2018-01-04 | 15270.7 | 15739.7 | 14522.2 | 15599.2 | 21783200000 | 256250000000 |
2018-01-05 | 15477.2 | 17705.2 | 15202.8 | 17429.5 | 23840900000 | 259748000000 |
prices.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 148 entries, 2018-01-03 to 2018-05-30 Data columns (total 6 columns): Open 148 non-null float64 High 148 non-null float64 Low 148 non-null float64 Close 148 non-null float64 Volume 148 non-null int64 Market Cap 148 non-null int64 dtypes: float64(4), int64(2) memory usage: 8.1 KB
%matplotlib inline
prices.Close.plot(figsize=(16,4), grid=True)
prices.tail()
Open | High | Low | Close | Volume | Market Cap | |
---|---|---|---|---|---|---|
Date | ||||||
2018-05-26 | 7486.48 | 7595.16 | 7349.12 | 7355.88 | 4051540000 | 127682000000 |
2018-05-27 | 7362.08 | 7381.74 | 7270.96 | 7368.22 | 4056520000 | 125575000000 |
2018-05-28 | 7371.31 | 7419.05 | 7100.89 | 7135.99 | 5040600000 | 125748000000 |
2018-05-29 | 7129.46 | 7526.42 | 7090.68 | 7472.59 | 5662660000 | 121636000000 |
2018-05-30 | 7469.73 | 7573.77 | 7313.60 | 7406.52 | 4922540000 | 127454000000 |
from pandas_datareader import get_data_google
get_data_google('KRX:KOSPI')
/home/markbaum/Python/python/lib/python3.6/site-packages/pandas_datareader/google/daily.py:41: UnstableAPIWarning: The Google Finance API has not been stable since late 2017. Requests seem to fail at random. Failure is especially common when bulk downloading. warnings.warn(UNSTABLE_WARNING, UnstableAPIWarning)
--------------------------------------------------------------------------- RemoteDataError Traceback (most recent call last) <ipython-input-4-dc8d7e90e953> in <module>() ----> 1 get_data_google('KRX:KOSPI') ~/Python/python/lib/python3.6/site-packages/pandas_datareader/data.py in get_data_google(*args, **kwargs) 65 66 def get_data_google(*args, **kwargs): ---> 67 return GoogleDailyReader(*args, **kwargs).read() 68 69 ~/Python/python/lib/python3.6/site-packages/pandas_datareader/base.py in read(self) 208 if isinstance(self.symbols, (compat.string_types, int)): 209 df = self._read_one_data(self.url, --> 210 params=self._get_params(self.symbols)) 211 # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) 212 elif isinstance(self.symbols, DataFrame): ~/Python/python/lib/python3.6/site-packages/pandas_datareader/base.py in _read_one_data(self, url, params) 82 """ read one data from specified URL """ 83 if self._format == 'string': ---> 84 out = self._read_url_as_StringIO(url, params=params) 85 elif self._format == 'json': 86 out = self._get_response(url, params=params).json() ~/Python/python/lib/python3.6/site-packages/pandas_datareader/base.py in _read_url_as_StringIO(self, url, params) 93 Open url (and retry) 94 """ ---> 95 response = self._get_response(url, params=params) 96 text = self._sanitize_response(response) 97 out = StringIO() ~/Python/python/lib/python3.6/site-packages/pandas_datareader/base.py in _get_response(self, url, params, headers) 153 msg += '\nResponse Text:\n{0}'.format(last_response_text) 154 --> 155 raise RemoteDataError(msg) 156 157 def _get_crumb(self, *args): RemoteDataError: Unable to read URL: https://finance.google.co.uk/bctzjpnsun/historical?q=KRX%3AKOSPI&startdate=Jan+01%2C+2010&enddate=Jun+08%2C+2018&output=csv Response Text: b'<!DOCTYPE html>\n<html lang=en>\n <meta charset=utf-8>\n <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">\n <title>Error 404 (Not Found)!!1</title>\n <style>\n *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}\n </style>\n <a href=//www.google.com/><span id=logo aria-label=Google></span></a>\n <p><b>404.</b> <ins>That\xe2\x80\x99s an error.</ins>\n <p>The requested URL <code>/bctzjpnsun/historical</code> was not found on this server. <ins>That\xe2\x80\x99s all we know.</ins>\n'
from googlefinance.get import get_data
get_data('KRX:KOSPI')
code | open | high | low | close | volume | |
---|---|---|---|---|---|---|
date | ||||||
2018-04-24 | KRX:KOSPI | 2478.47 | 2479.32 | 2454.44 | 2464.14 | 524280000 |
2018-04-25 | KRX:KOSPI | 2444.19 | 2453.54 | 2436.51 | 2448.81 | 466095000 |
2018-04-26 | KRX:KOSPI | 2460.62 | 2484.09 | 2456.20 | 2475.64 | 520917000 |
2018-04-27 | KRX:KOSPI | 2497.75 | 2508.13 | 2484.19 | 2492.40 | 464957000 |
2018-04-30 | KRX:KOSPI | 2502.29 | 2515.38 | 2500.22 | 2515.38 | 746291000 |
2018-05-03 | KRX:KOSPI | 2506.94 | 2507.91 | 2487.25 | 2487.25 | 614102000 |
2018-05-04 | KRX:KOSPI | 2486.47 | 2487.77 | 2461.38 | 2461.38 | 612024000 |
2018-05-08 | KRX:KOSPI | 2468.45 | 2479.75 | 2444.08 | 2449.81 | 714734000 |
2018-05-09 | KRX:KOSPI | 2450.71 | 2451.86 | 2428.79 | 2443.98 | 593365000 |
2018-05-10 | KRX:KOSPI | 2458.67 | 2464.72 | 2448.01 | 2464.16 | 490004000 |
2018-05-11 | KRX:KOSPI | 2469.30 | 2483.85 | 2468.41 | 2477.71 | 599900000 |
2018-05-14 | KRX:KOSPI | 2482.97 | 2486.17 | 2471.91 | 2476.11 | 650707000 |
2018-05-15 | KRX:KOSPI | 2476.87 | 2480.22 | 2456.20 | 2458.54 | 712619000 |
2018-05-16 | KRX:KOSPI | 2446.64 | 2465.55 | 2444.67 | 2459.82 | 691813000 |
2018-05-17 | KRX:KOSPI | 2468.72 | 2472.82 | 2448.43 | 2448.45 | 596771000 |
2018-05-18 | KRX:KOSPI | 2459.73 | 2461.95 | 2452.34 | 2460.65 | 431376000 |
2018-05-21 | KRX:KOSPI | 2464.07 | 2472.30 | 2447.69 | 2465.57 | 588088000 |
2018-05-23 | KRX:KOSPI | 2462.98 | 2476.86 | 2460.07 | 2471.91 | 752675000 |
2018-05-24 | KRX:KOSPI | 2477.48 | 2481.31 | 2457.84 | 2466.01 | 638125000 |
2018-05-25 | KRX:KOSPI | 2452.80 | 2466.57 | 2444.77 | 2460.80 | 653592000 |
2018-05-28 | KRX:KOSPI | 2465.00 | 2482.40 | 2463.14 | 2478.96 | 708362000 |
2018-05-29 | KRX:KOSPI | 2476.70 | 2479.68 | 2457.18 | 2457.25 | 571836000 |
2018-05-30 | KRX:KOSPI | 2446.81 | 2449.88 | 2399.58 | 2409.03 | 575922000 |
2018-05-31 | KRX:KOSPI | 2428.83 | 2430.15 | 2415.50 | 2423.01 | 816930000 |
2018-06-01 | KRX:KOSPI | 2419.63 | 2445.31 | 2418.11 | 2438.96 | 603023000 |
2018-06-04 | KRX:KOSPI | 2444.62 | 2452.67 | 2441.25 | 2447.76 | 438929000 |
2018-06-05 | KRX:KOSPI | 2450.39 | 2455.78 | 2432.81 | 2453.76 | 501363000 |
2018-06-07 | KRX:KOSPI | 2468.26 | 2478.67 | 2466.01 | 2470.58 | 474146000 |