#!/usr/bin/env python
# coding: utf-8

# # 分组与聚合

# * GroupBy对象

# In[1]:


import pandas as pd
import numpy as np


# In[61]:


# Sample frame used by the examples below: two string key columns (A, B)
# and two columns of standard-normal random values (C, D).
_key_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
_key_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
df = pd.DataFrame({
    'A': _key_a,
    'B': _key_b,
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df


# In[7]:


# groupby() only builds a GroupBy object; no result is produced until an
# aggregation is requested on it.
df.groupby('A', as_index=True)


# In[11]:


# Aggregation is applied to each group separately. as_index=False keeps the
# key as an ordinary column instead of making it the index.
# numeric_only=True limits the sum to C and D: since pandas 2.0 the default
# would also "sum" (concatenate) the strings in column B.
df.groupby('A', as_index=False).sum(numeric_only=True)


# In[9]:


# Index of the as_index=False result.
df.groupby('A', as_index=False).sum(numeric_only=True).index


# In[10]:


# Default as_index=True: the group keys become the index of the result.
df.groupby('A').sum(numeric_only=True)


# In[5]:


# Shape of the aggregated frame: one row per group, one column per summed column.
df.groupby('A').sum(numeric_only=True).shape


# In[12]:


# Number of rows in each group.
df.groupby('A').size()


# In[8]:


# Grouping by several keys; both string columns are keys here, so only the
# numeric columns are left to sum.
df.groupby(['A', 'B']).sum()


# In[13]:


# Number of rows for each (A, B) combination.
df.groupby(['A', 'B']).size()


# In[15]:


df


# In[16]:


# Group by a user-supplied key: a plain list with one label per row.
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
# The string columns A and B are not grouping keys here, so they must be
# excluded explicitly; mean() raises TypeError on them in pandas >= 2.0.
df.groupby(self_def_key).mean(numeric_only=True)


# In[17]:


# Group by a user-supplied key: a list of Series (multi-level key).
df.groupby([df['A'], df['B']]).size()


# In[19]:


# Mean per (A, B) combination.
df.groupby([df['A'], df['B']]).mean()


# In[20]:


# unstack() pivots the inner key level (B) from the row index into columns.
df.groupby([df['A'], df['B']]).mean().unstack()


# * GroupBy对象分组迭代  
# groupBy对象是一个可迭代的对象   
# 
# 可以遍历，也可以转成列表

# In[21]:


# Single-key grouping: iterating the GroupBy object yields
# (group name, sub-frame) pairs, which are printed one after another.
grouped1 = df.groupby('A')
for group_name, group_data in grouped1:
    print('---',group_name)
    print('***',group_data)


# In[22]:


# Multi-key grouping: same iteration, grouped on both A and B.
grouped2 = df.groupby(['A', 'B'])
for group_name, group_data in grouped2:
    print('---',group_name)
    print('***',group_data)


# In[23]:


# Convert the GroupBy object to a list; any iterable can be turned into a list.
list(grouped1)


# In[24]:


# Convert the GroupBy object to a dict mapping group name -> sub-frame.
dict(list(grouped1))


# In[25]:


# Grouping columns instead of rows.
print(df.dtypes)

# Group the columns by their dtype. groupby(axis=1) is deprecated since
# pandas 2.1; the recommended replacement is to transpose, group the rows,
# and transpose the result back.
by_dtype = df.T.groupby(df.dtypes)
print(by_dtype.size())
# The transpose of a mixed-dtype frame is object-typed; infer_objects()
# restores a numeric dtype for the numeric group's column.
by_dtype.sum().T.infer_objects()


# * 其他分组方法

# In[26]:


df2 = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['A', 'B', 'C', 'D', 'E'])
# Integer columns cannot hold NaN. Cast the columns that will receive NaN to
# float first instead of relying on implicit upcasting during assignment,
# which pandas 2.1+ deprecates.
df2 = df2.astype({'b': float, 'c': float, 'd': float})
# np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
df2.iloc[1, 1:4] = np.nan
df2


# In[27]:


# Group columns through a dict that maps each column label to a group name.
mapping_dict = {'a':'python', 'b':'python', 'c':'java', 'd':'C', 'e':'java'}
# groupby(axis=1) is deprecated since pandas 2.1: transpose, group the rows,
# then transpose row-shaped results back.
by_language = df2.T.groupby(mapping_dict)
print(by_language.size())       # number of columns in each group
print(by_language.count().T)    # number of non-NaN values per row and group
language_sums = by_language.sum().T
language_sums


# In[28]:


# Grouping with a function: the callable is applied to every index label and
# its return value is used as the group key.
_values3 = np.random.randint(1, 10, (5, 5))
df3 = pd.DataFrame(
    _values3,
    index=['AA', 'BBB', 'CC', 'D', 'EE'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
print(df3)


def group_key(idx):
    """Return the length of ``idx``, used as the group key.

    ``idx`` is a single label taken from the row index or the column index.
    """
    label_length = len(idx)
    return label_length


list(df3.groupby(group_key))


# In[32]:


# The custom function above is equivalent to passing the builtin ``len``.
df3.groupby(len).size()


# In[33]:


# Grouping by an index level: df4 has two-level column labels named
# 'language' and 'index'.
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df4


# In[35]:


# Sum the columns that share a label on the given column level.
# groupby(axis=1) is deprecated since pandas 2.1: transpose, group the rows by
# level, then transpose back.
language_totals = df4.T.groupby(level='language').sum().T
# Printed explicitly: only the last bare expression of a cell is displayed,
# so this result would otherwise be computed and discarded.
print(language_totals)
index_totals = df4.T.groupby(level='index').sum().T
index_totals


# * 聚合

# In[36]:


df


# In[38]:


# Built-in aggregation: describe() on the groups of A, with the result
# transposed by .T for display.
df.groupby('A').describe().T


# In[52]:


# Custom aggregation function.
def peak_range(df):
    """Return the value range of ``df``: its maximum minus its minimum.

    ``df`` is whatever object the caller hands over (for ``agg`` this is the
    data belonging to one group); it only needs ``max()`` and ``min()``.
    """
    highest = df.max()
    lowest = df.min()
    return highest - lowest

# agg() must be given reducing functions (one value per group and column).
# Only the numeric columns C and D are aggregated: column B holds strings, and
# pandas >= 2.0 raises TypeError instead of silently dropping such columns.
print(df.groupby('A')[['C', 'D']].agg(peak_range))
print(df.groupby('A')[['C', 'D']].agg(lambda df: df.max() - df.min()))


# In[72]:


# Applying several aggregation functions at once.
# By default each result column is named after its function.
df.groupby('A')[['C', 'D']].agg(['mean', 'std', 'count', peak_range])


# In[54]:


# A (name, function) tuple supplies a custom name for the result column.
df.groupby('A')[['C', 'D']].agg(['mean', 'std', 'count', ('range', peak_range)])


# In[55]:


# A different aggregation per column, given as a column -> function mapping.
dict_mapping = {'C': 'mean',
                'D': 'sum'}
df.groupby('A').agg(dict_mapping)


# In[56]:


# A column may also map to a list of functions.
dict_mapping = {'C': ['mean', 'max'],
                'D': 'sum'}
df.groupby('A').agg(dict_mapping)


# In[57]:


# coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt

file_path = "./starbucks_store_worldwide.csv"

# NOTE: this rebinds ``df``, replacing the sample frame used earlier in the file.
df = pd.read_csv(file_path)

# Bar chart of the 10 countries with the most stores.
# Prepare the data: select "Brand" before counting so only that one column is
# aggregated, and use head(10) for an explicitly positional "first ten".
data1 = (
    df.groupby(by="Country")["Brand"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)
print(data1)
_x = data1.index
_y = data1.values

# Draw the chart: bars at integer positions, labelled with the country codes.
plt.figure(figsize=(20, 8), dpi=80)

_positions = range(len(_x))
plt.bar(_positions, _y)

plt.xticks(_positions, _x)

plt.show()


# In[ ]: