#!/usr/bin/env python
# coding: utf-8
# # 10 minutes to Koalas
#
# This is a short introduction to Koalas, geared mainly for new users. This notebook shows you some key differences between pandas and Koalas. You can run these examples yourself in a live notebook [here](https://mybinder.org/v2/gh/databricks/koalas/master?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb). For Databricks users, you can import [the current .ipynb file](https://raw.githubusercontent.com/databricks/koalas/master/docs/source/getting_started/10min.ipynb) and run it after [installing Koalas](https://github.com/databricks/koalas#how-do-i-use-this-on-databricks).
#
# Customarily, we import Koalas as follows:
# In[1]:
import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession
# ## Object Creation
#
#
# Creating a Koalas Series by passing a list of values, letting Koalas create a default integer index:
# In[2]:
s = ks.Series([1, 3, 5, np.nan, 6, 8])
# In[3]:
s
# Creating a Koalas DataFrame by passing a dict of objects that can be converted into a series-like structure.
# In[4]:
kdf = ks.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])
# In[5]:
kdf
# Creating a pandas DataFrame by passing a numpy array, with a datetime index and labeled columns:
# In[6]:
dates = pd.date_range('20130101', periods=6)
# In[7]:
dates
# In[8]:
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# In[9]:
pdf
# Now, this pandas DataFrame can be converted to a Koalas DataFrame
# In[10]:
kdf = ks.from_pandas(pdf)
# In[11]:
type(kdf)
# It looks and behaves the same as a pandas DataFrame, though.
# In[12]:
kdf
# Also, it is possible to create a Koalas DataFrame from a Spark DataFrame.
#
# Creating a Spark DataFrame from a pandas DataFrame:
# In[13]:
spark = SparkSession.builder.getOrCreate()
# In[14]:
sdf = spark.createDataFrame(pdf)
# In[15]:
sdf.show()
# Creating a Koalas DataFrame from a Spark DataFrame.
# `to_koalas()` is automatically attached to Spark DataFrames and becomes available as an API when Koalas is imported.
# In[16]:
kdf = sdf.to_koalas()
# In[17]:
kdf
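# Conversely, a Koalas DataFrame can be converted back to a Spark DataFrame with `to_spark()`. A minimal sketch:
# In[ ]:
kdf.to_spark().show()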
# The DataFrame has specific [dtypes](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-dtypes). Types that are common to both Spark and pandas are currently supported.
# In[18]:
kdf.dtypes
# ## Viewing Data
#
# See the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html).
# See the top rows of the frame. The results may not be the same as in pandas: unlike pandas, the data in a Spark DataFrame is not _ordered_ and has no intrinsic notion of an index. When asked for the head of a DataFrame, Spark simply takes the requested number of rows from a partition. Do not rely on it to return specific rows; use `.loc` or `.iloc` instead, as shown below.
# In[19]:
kdf.head()
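# As a quick sketch of label-based selection (assuming the default 0-based index that `to_koalas()` attached above), specific rows can be retrieved reliably with `.loc`:
# In[ ]:
kdf.loc[0:2]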
# Display the index, columns, and the underlying numpy data.
#
# You can also retrieve the index; the index column can be assigned to a DataFrame, as shown later.
# In[20]:
kdf.index
# In[21]:
kdf.columns
# In[22]:
kdf.to_numpy()
# `describe()` shows a quick statistical summary of your data:
# In[23]:
kdf.describe()
# Transposing your data
# In[24]:
kdf.T
# Sorting by its index
# In[25]:
kdf.sort_index(ascending=False)
# Sorting by values:
# In[26]:
kdf.sort_values(by='B')
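# Sorting by multiple columns also works; a brief sketch:
# In[ ]:
kdf.sort_values(by=['A', 'B'])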
# ## Missing Data
# Koalas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations.
#
# In[27]:
pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])
# In[28]:
pdf1.loc[dates[0]:dates[1], 'E'] = 1
# In[29]:
kdf1 = ks.from_pandas(pdf1)
# In[30]:
kdf1
# To drop any rows that have missing data.
# In[31]:
kdf1.dropna(how='any')
# Filling missing data.
# In[32]:
kdf1.fillna(value=5)
# ## Operations
# ### Stats
# Operations in general exclude missing data.
#
# Performing a descriptive statistic:
# In[33]:
kdf.mean()
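# To see the handling of missing data in action, the same statistic on `kdf1` from the Missing Data section above skips the NaN entries (a quick illustration):
# In[ ]:
kdf1.mean()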
# ### Spark Configurations
#
# Various configurations in PySpark can be applied internally in Koalas.
# For example, you can enable Arrow optimization to greatly speed up the internal pandas conversion. See the PySpark Usage Guide for Pandas with Apache Arrow.
# In[34]:
prev = spark.conf.get("spark.sql.execution.arrow.enabled")  # Save the current value to restore it later.
ks.set_option("compute.default_index_type", "distributed")  # Use the distributed default index to avoid indexing overhead.
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.
# In[35]:
spark.conf.set("spark.sql.execution.arrow.enabled", True)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')
# In[36]:
spark.conf.set("spark.sql.execution.arrow.enabled", False)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')
# In[37]:
ks.reset_option("compute.default_index_type")
spark.conf.set("spark.sql.execution.arrow.enabled", prev)  # Restore the previous value.
# ## Grouping
# By “group by” we are referring to a process involving one or more of the following steps:
#
# - Splitting the data into groups based on some criteria
# - Applying a function to each group independently
# - Combining the results into a data structure
# In[38]:
kdf = ks.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
# In[39]:
kdf
# Grouping and then applying the [sum()](https://koalas.readthedocs.io/en/latest/reference/api/databricks.koalas.groupby.GroupBy.sum.html#databricks.koalas.groupby.GroupBy.sum) function to the resulting groups.
# In[40]:
kdf.groupby('A').sum()
# Grouping by multiple columns forms a hierarchical index, and again we can apply the sum function.
# In[41]:
kdf.groupby(['A', 'B']).sum()
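# If you prefer the grouping keys as regular columns instead of an index, `as_index=False` can be passed, mirroring pandas. A minimal sketch:
# In[ ]:
kdf.groupby(['A', 'B'], as_index=False).sum()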
# ## Plotting
# See the Plotting docs.
# In[42]:
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt
# In[43]:
pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))
# In[44]:
kser = ks.Series(pser)
# In[45]:
kser = kser.cummax()
# In[46]:
kser.plot()
# On a DataFrame, the plot() method is a convenience to plot all of the columns with labels:
# In[47]:
pdf = pd.DataFrame(np.random.randn(1000, 4), index=pser.index,
                   columns=['A', 'B', 'C', 'D'])
# In[48]:
kdf = ks.from_pandas(pdf)
# In[49]:
kdf = kdf.cummax()
# In[50]:
kdf.plot()
# ## Getting data in/out
# See the Input/Output docs.
# ### CSV
#
# CSV is straightforward and easy to use. See `to_csv()` to write a CSV file and `read_csv()` to read a CSV file.
# In[51]:
kdf.to_csv('foo.csv')
ks.read_csv('foo.csv').head(10)
# ### Parquet
#
# Parquet is an efficient and compact file format that is fast to read and write. See `to_parquet()` to write a Parquet file and `read_parquet()` to read a Parquet file.
# In[52]:
kdf.to_parquet('bar.parquet')
ks.read_parquet('bar.parquet').head(10)
# ### Spark IO
#
# In addition, Koalas fully supports Spark's various data sources, such as ORC and external data sources. See `to_spark_io()` to write to a specified data source and `read_spark_io()` to read from one.
# In[53]:
kdf.to_spark_io('zoo.orc', format="orc")
ks.read_spark_io('zoo.orc', format="orc").head(10)