#!/usr/bin/env python
# coding: utf-8

# # 10 minutes to Koalas
#
# This is a short introduction to Koalas, geared mainly toward new users. This notebook shows you some key differences between pandas and Koalas. You can run these examples yourself in a live notebook [here](https://mybinder.org/v2/gh/databricks/koalas/master?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb). Databricks users can import [the current .ipynb file](https://raw.githubusercontent.com/databricks/koalas/master/docs/source/getting_started/10min.ipynb) and run it after [installing Koalas](https://github.com/databricks/koalas#how-do-i-use-this-on-databricks).
#
# Customarily, we import Koalas as follows:

# In[1]:

import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession

# ## Object Creation
#
# Creating a Koalas Series by passing a list of values, letting Koalas create a default integer index:

# In[2]:

s = ks.Series([1, 3, 5, np.nan, 6, 8])

# In[3]:

s

# Creating a Koalas DataFrame by passing a dict of objects that can be converted to something Series-like:

# In[4]:

kdf = ks.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])

# In[5]:

kdf

# Creating a pandas DataFrame by passing a NumPy array, with a datetime index and labeled columns:

# In[6]:

dates = pd.date_range('20130101', periods=6)

# In[7]:

dates

# In[8]:

pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# In[9]:

pdf

# Now, this pandas DataFrame can be converted to a Koalas DataFrame:

# In[10]:

kdf = ks.from_pandas(pdf)

# In[11]:

type(kdf)

# It looks and behaves the same as a pandas DataFrame, though:

# In[12]:

kdf

# Also, it is possible to create a Koalas DataFrame from a Spark DataFrame.
#
# Creating a Spark DataFrame from a pandas DataFrame:

# In[13]:

spark = SparkSession.builder.getOrCreate()

# In[14]:

sdf = spark.createDataFrame(pdf)

# In[15]:

sdf.show()

# Creating a Koalas DataFrame from a Spark DataFrame.
# `to_koalas()` is automatically attached to Spark DataFrames and available as an API when Koalas is imported.

# In[16]:

kdf = sdf.to_koalas()

# In[17]:

kdf

# Having specific [dtypes](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-dtypes). Types that are common to both Spark and pandas are currently supported.

# In[18]:

kdf.dtypes
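# Conversely, a Koalas DataFrame can be converted back to a Spark DataFrame when you want to use PySpark APIs directly. A small round-trip sketch using Koalas' `to_spark()` (not a cell from the original walkthrough; the variable name `sdf2` is introduced here for illustration):

sdf2 = kdf.to_spark()  # back to a PySpark DataFrame
sdf2.show()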
# ## Viewing Data
#
# See the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html).
#
# See the top rows of the frame. The results may not be the same as in pandas: unlike pandas, the data in a Spark DataFrame is not _ordered_ and has no intrinsic notion of index. When asked for the head of a DataFrame, Spark just takes the requested number of rows from a partition. Do not rely on it to return specific rows; use `.loc` or `.iloc` instead.

# In[19]:

kdf.head()

# Display the index, columns, and the underlying NumPy data.
#
# You can also retrieve the index; the index column can be ascribed to a DataFrame, as shown later.

# In[20]:

kdf.index

# In[21]:

kdf.columns

# In[22]:

kdf.to_numpy()

# `describe()` shows a quick statistic summary of your data:

# In[23]:

kdf.describe()

# Transposing your data:

# In[24]:

kdf.T

# Sorting by its index:

# In[25]:

kdf.sort_index(ascending=False)

# Sorting by value:

# In[26]:

kdf.sort_values(by='B')

# ## Missing Data
#
# Koalas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations.

# In[27]:

pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])

# In[28]:

pdf1.loc[dates[0]:dates[1], 'E'] = 1

# In[29]:

kdf1 = ks.from_pandas(pdf1)

# In[30]:

kdf1

# Dropping any rows that have missing data:

# In[31]:

kdf1.dropna(how='any')

# Filling missing data:

# In[32]:

kdf1.fillna(value=5)

# ## Operations
#
# ### Stats
#
# Operations in general exclude missing data.
#
# Performing a descriptive statistic:

# In[33]:

kdf.mean()
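# Missing values are skipped here as well. As a small check sketch (not a cell from the original walkthrough), `kdf1` from the Missing Data section above has `NaN` entries in column 'E', yet a mean for that column is still produced:

kdf1.mean()  # the NaN entries in column 'E' are excluded from the computation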
# ### Spark Configurations
#
# Various configurations in PySpark can be applied internally in Koalas.
# For example, you can enable Arrow optimization to greatly speed up the internal pandas conversion. See the PySpark Usage Guide for Pandas with Apache Arrow.

# In[34]:

prev = spark.conf.get("spark.sql.execution.arrow.enabled")  # Save the current value so it can be restored later.
ks.set_option("compute.default_index_type", "distributed")  # Use the distributed default index to avoid overhead.
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

# In[35]:

spark.conf.set("spark.sql.execution.arrow.enabled", True)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')

# In[36]:

spark.conf.set("spark.sql.execution.arrow.enabled", False)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')

# In[37]:

ks.reset_option("compute.default_index_type")
spark.conf.set("spark.sql.execution.arrow.enabled", prev)  # Set its previous value back.

# ## Grouping
#
# By “group by” we are referring to a process involving one or more of the following steps:
#
# - Splitting the data into groups based on some criteria
# - Applying a function to each group independently
# - Combining the results into a data structure

# In[38]:

kdf = ks.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

# In[39]:

kdf

# Grouping and then applying the [sum()](https://koalas.readthedocs.io/en/latest/reference/api/databricks.koalas.groupby.GroupBy.sum.html#databricks.koalas.groupby.GroupBy.sum) function to the resulting groups:

# In[40]:

kdf.groupby('A').sum()

# Grouping by multiple columns forms a hierarchical index, and again we can apply the sum function:

# In[41]:

kdf.groupby(['A', 'B']).sum()

# ## Plotting
#
# See the Plotting docs.

# In[42]:

get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt

# In[43]:

pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))

# In[44]:

kser = ks.Series(pser)

# In[45]:

kser = kser.cummax()

# In[46]:

kser.plot()

# On a DataFrame, the `plot()` method is a convenience to plot all of the columns with labels:

# In[47]:

pdf = pd.DataFrame(np.random.randn(1000, 4), index=pser.index,
                   columns=['A', 'B', 'C', 'D'])

# In[48]:

kdf = ks.from_pandas(pdf)

# In[49]:

kdf = kdf.cummax()

# In[50]:

kdf.plot()

# ## Getting data in/out
#
# See the Input/Output docs.
#
# ### CSV
#
# CSV is straightforward and easy to use. See `to_csv()` to write a CSV file and `read_csv()` to read a CSV file.

# In[51]:

kdf.to_csv('foo.csv')
ks.read_csv('foo.csv').head(10)

# ### Parquet
#
# Parquet is an efficient and compact file format that is fast to read and write. See `to_parquet()` to write a Parquet file and `read_parquet()` to read a Parquet file.

# In[52]:

kdf.to_parquet('bar.parquet')
ks.read_parquet('bar.parquet').head(10)

# ### Spark IO
#
# In addition, Koalas fully supports Spark's various datasources, such as ORC and external datasources. See `to_spark_io()` to write to a specified datasource and `read_spark_io()` to read from it.

# In[53]:

kdf.to_spark_io('zoo.orc', format="orc")
ks.read_spark_io('zoo.orc', format="orc").head(10)
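# The writes above leave `foo.csv`, `bar.parquet`, and `zoo.orc` behind. An optional cleanup sketch (not part of the original walkthrough; it assumes the paths resolved to the local filesystem, whereas on Databricks they are written to DBFS instead):

import shutil

# Spark writes each output as a directory of part files, so remove them with rmtree.
for path in ['foo.csv', 'bar.parquet', 'zoo.orc']:
    shutil.rmtree(path, ignore_errors=True)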