#!/usr/bin/env python
# coding: utf-8

# # Pandas 簡介

# ## Pandas 是 Python 裡面被用來作資料分析及整理最常用的套件。

# ### 這一份筆記主要說明以下重點：
# 
# * 資料結構及其基本操作
#   - Series
#   - DataFrame
# * 網路資料存取
# * 資料視覺化
# 
# 參考資料：
# 
# * [Python Data Analysis Library](http://pandas.pydata.org/)
# * [Pandas Cookbook](http://pandas.pydata.org/pandas-docs/version/0.18.0/cookbook.html)
# * [Pandas 入門介紹](https://github.com/Wei1234c/Introduction_to_Pandas)
# * [Financial Time Series](https://github.com/yhilpisch/py4fi/blob/master/ipython3/06_Financial_Time_Series.ipynb)
# * [Pandas API references](http://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-stats)

# In[2]:


import pandas as pd

# Echo the installed pandas version. In a notebook this is shown as the cell
# output; as a bare expression in a plain script it prints nothing.
pd.__version__


# In[3]:


# Turn on the 'matplotlib inline' line magic for the plotting cells below.
# NOTE: `get_ipython` is neither imported nor defined in this file; it is
# expected to be provided by the IPython/Jupyter runtime.
get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
from datetime import datetime


# ## Pandas 的資料結構

# ### Series

# In[4]:


# Two Series built from the same values 1..5; no index is given, so both get
# the default integer index.
s = pd.Series(list(range(1, 6)))
s


# In[5]:


u = pd.Series(list(range(1, 6)))
u


# In[6]:


# Both operands carry the same index, so values are added pairwise.
s + u


# #### How does a Series differ from an ndarray?

# In[7]:


# A Series exposes its raw values ...
s.values


# In[8]:


# ... together with an index that labels every value.
s.index


# #### The index can be specified when the Series is created

# In[9]:


s = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])


# In[10]:


s


# In[11]:


# Relabel `u` so that it only partially overlaps with the labels of `s`.
u.index = ['b', 'c', 'd', 'e', 'f']


# In[12]:


u


# In[13]:


# Addition aligns on index labels; labels that exist in only one operand
# ('a' and 'f' here) come out as NaN.
s + u


# #### The index can also be assigned after the Series is created

# In[14]:


s.index = ['f', 'g', 'h', 'i', 'j']
s


# #### What is reindex?

# In[15]:


# reindex returns a new Series ordered by the requested labels; labels that
# are missing from `s` ('a'..'d') are filled with NaN. The result is not
# assigned, so `s` itself stays untouched (see the next cell).
s.reindex(['h', 'i', 'j', 'a', 'b', 'c', 'd'])


# In[16]:


s


# In[17]:


s.dtype


# In[18]:


s.shape


# In[19]:


s.ndim


# In[26]:


s


# In[27]:


# Look up a single value by its label.
s.loc['i']


# In[28]:


# Label-based slice: both end labels are included in the result.
s.loc['f':'h']


# ### DataFrame

# In[29]:


# 10 rows x 4 columns of samples drawn with np.random.randn.
data = np.random.randn(10, 4)


# In[30]:


df = pd.DataFrame(data)
df


# In[31]:


# Replace the default integer column labels with readable names.
df.columns = ['No1', 'No2', 'No3', 'No4']
df


# In[32]:


# Label the rows with ten consecutive dates starting at 2016-01-01.
df.index = pd.date_range('2016-01-01', periods=10)
df


# In[34]:


# Select a single row by its date label.
df.loc['2016-01-06']


# In[35]:


# Select a range of rows by date label (both end dates are included).
# .loc makes it explicit that rows, not columns, are being selected.
df.loc['2016-01-06':'2016-01-08']


# In[ ]:


# Remove the row labelled 2016-01-03 from `df` itself (inplace=True).
df.drop(datetime(2016, 1, 3), inplace=True)
df


# In[ ]:


# Switch back to a plain 0..n-1 integer index. The length is taken from the
# frame instead of being hard-coded, so this cell keeps working if the number
# of rows created or dropped above is changed.
df.index = range(len(df))
df


# In[33]:


# Select a row by integer position.
df.iloc[1]


# In[ ]:


# Without inplace=True, drop returns a new frame and leaves `df` unchanged
# (the next cell shows that the row labelled 4 is still there).
df.drop(4)


# In[39]:


df


# In[40]:


# A single column label returns a Series ...
df['No1']


# In[41]:


# ... while a list of labels returns a DataFrame, even for one column.
df[['No1']]


# In[42]:


df[['No1', 'No3']]


# ## 使用 Pandas 作資料存取
# 
# ### 這一部分包括底下幾個重點：
# 
# * read_csv() 的使用
# * read_html() 的使用
# * 如何抓取股市資訊？
# 
# 底下練習一下將 [台灣證券交易所 - 加權股價指數歷史資料](http://www.tse.com.tw/ch/trading/indices/MI_5MINS_HIST/MI_5MINS_HIST.php#) 的資料轉成 DataFrame。

# ### 使用 read_csv() 讀取資料
# 
# 參考連結：[pandas.read_csv](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)

# In[ ]:


# Load the Big5-encoded CSV export; header=1 takes the column names from the
# second line of the file.
df = pd.read_csv("data/MI_5MINS_HIST10603.csv", header=1, encoding="Big5")


# In[ ]:


df


# In[ ]:


# Drop the final row in place, addressed by its index label.
# NOTE(review): presumably the last line of the export is not a data record
# -- confirm against the CSV file.
last_label = df.index[-1]
df.drop(last_label, inplace=True)
df


# ### 使用 read_html() 讀取資料
# 
# 參考連結：[pandas.read_html](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html)

# In[ ]:


from io import StringIO

import requests
from bs4 import BeautifulSoup


# In[ ]:


# Download the page. The timeout keeps the cell from hanging forever when the
# server does not answer, and raise_for_status() reports an HTTP error here
# instead of letting it surface later as a missing table.
r = requests.get(
    "http://www.tse.com.tw/ch/trading/indices/MI_5MINS_HIST/MI_5MINS_HIST.php",
    timeout=30,
)
r.raise_for_status()
# Decode the response body as Big5 when r.text is read below.
r.encoding = 'Big5'


# In[ ]:


# Parse the page with the lxml parser and collect every <table> element that
# has the CSS class "board_trad".
soup = BeautifulSoup(r.text, "lxml")
tables = soup.select("table.board_trad")
tables


# In[ ]:


type(tables[0])


# In[ ]:


# read_html returns a list of DataFrames, one per table it finds. The markup
# is wrapped in StringIO because passing literal HTML as a plain string is
# deprecated in recent pandas releases; file-like input is accepted by both
# old and new versions.
df_list = pd.read_html(StringIO(str(tables[0])), header=1)


# In[ ]:


df_list[0]


# ### 使用 Pandas 讀取股價資訊
# 
# 參考資料：[pandas-datareader 說明文件](https://pandas-datareader.readthedocs.io/en/latest/)

# In[43]:


import pandas_datareader.data as web
from datetime import datetime


# In[44]:


# Request TSLA data from the 'yahoo' source, starting at 2018-01-01
# (no end date is passed).
df = web.DataReader("TSLA", data_source='yahoo', start=datetime(2018, 1, 1))
df.head()


# In[45]:


df.tail()


# In[46]:


# Line chart of the 'Close' column.
close_prices = df['Close']
close_prices.plot()


# In[47]:


# Summary statistics of the columns.
df.describe()


# In[48]:


# Column dtypes, non-null counts and memory usage.
df.info()


# ## 資料視覺化
# 
# Series 跟 DataFrame 都附帶有一個產生各類圖表的 plot()，預設的情況下，它會產生線形圖。
# 
# 參考資料：[Pandas Plotting](http://pandas.pydata.org/pandas-docs/stable/visualization.html)

# In[ ]:


# Ten random values labelled 0..9; plot() without arguments draws a line chart.
s = pd.Series(np.random.randn(10), index=np.arange(10))
s.plot()


# In[ ]:


# The same Series as a bar chart.
s.plot.bar()


# In[ ]:


# For a DataFrame, plot() draws one line per column (A, B, C, D).
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df.plot()


# In[ ]:


# Bar chart with the columns grouped per row.
df.plot.bar()