#!/usr/bin/env python
# coding: utf-8

# # 10 minutes to Koalas
#
# This is a short introduction to Koalas, geared mainly toward new users. This notebook shows you some key differences between pandas and Koalas. You can run these examples yourself in a live notebook [here](https://mybinder.org/v2/gh/databricks/koalas/master?filepath=docs%2Fsource%2Fgetting_started%2F10min.ipynb). Databricks users can import [the current .ipynb file](https://raw.githubusercontent.com/databricks/koalas/master/docs/source/getting_started/10min.ipynb) and run it after [installing Koalas](https://github.com/databricks/koalas#how-do-i-use-this-on-databricks).
#
# Customarily, we import Koalas as follows:

# In[1]:

import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession

# ## Object Creation
#
# Creating a Koalas Series by passing a list of values, letting Koalas create a default integer index:

# In[2]:

s = ks.Series([1, 3, 5, np.nan, 6, 8])

# In[3]:

s

# Creating a Koalas DataFrame by passing a dict of objects that can be converted to something Series-like:

# In[4]:

kdf = ks.DataFrame(
    {'a': [1, 2, 3, 4, 5, 6],
     'b': [100, 200, 300, 400, 500, 600],
     'c': ["one", "two", "three", "four", "five", "six"]},
    index=[10, 20, 30, 40, 50, 60])

# In[5]:

kdf

# Creating a pandas DataFrame by passing a NumPy array, with a datetime index and labeled columns:

# In[6]:

dates = pd.date_range('20130101', periods=6)

# In[7]:

dates

# In[8]:

pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# In[9]:

pdf

# Now, this pandas DataFrame can be converted to a Koalas DataFrame:

# In[10]:

kdf = ks.from_pandas(pdf)

# In[11]:

type(kdf)

# It looks and behaves the same as a pandas DataFrame, though:

# In[12]:

kdf

# Also, it is possible to create a Koalas DataFrame from a Spark DataFrame.
#
# Creating a Spark DataFrame from a pandas DataFrame:

# In[13]:

spark = SparkSession.builder.getOrCreate()

# In[14]:

sdf = spark.createDataFrame(pdf)

# In[15]:

sdf.show()

# Creating a Koalas DataFrame from a Spark DataFrame.
# `to_koalas()` is automatically attached to Spark DataFrames and available as an API when Koalas is imported.

# In[16]:

kdf = sdf.to_koalas()

# In[17]:

kdf

# Having specific [dtypes](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-dtypes). Types that are common to both Spark and pandas are currently supported.

# In[18]:

kdf.dtypes
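# Conversely, a Koalas DataFrame can be converted back to a Spark DataFrame when you want to use PySpark APIs directly. A small round-trip sketch using Koalas' `to_spark()` (not a cell from the original walkthrough; the variable name `sdf2` is introduced here for illustration):

sdf2 = kdf.to_spark()  # back to a PySpark DataFrame
sdf2.show()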
# ## Viewing Data
#
# See the [API Reference](https://koalas.readthedocs.io/en/latest/reference/index.html).
#
# See the top rows of the frame. The results may not be the same as in pandas: unlike pandas, the data in a Spark DataFrame is not _ordered_ and has no intrinsic notion of index. When asked for the head of a DataFrame, Spark just takes the requested number of rows from a partition. Do not rely on it to return specific rows; use `.loc` or `.iloc` instead.

# In[19]:

kdf.head()

# Display the index, columns, and the underlying NumPy data.
#
# You can also retrieve the index; the index column can be ascribed to a DataFrame, as shown later.

# In[20]:

kdf.index

# In[21]:

kdf.columns

# In[22]:

kdf.to_numpy()

# `describe()` shows a quick statistic summary of your data:

# In[23]:

kdf.describe()

# Transposing your data:

# In[24]:

kdf.T

# Sorting by its index:

# In[25]:

kdf.sort_index(ascending=False)

# Sorting by value:

# In[26]:

kdf.sort_values(by='B')

# ## Missing Data
#
# Koalas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations.

# In[27]:

pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])

# In[28]:

pdf1.loc[dates[0]:dates[1], 'E'] = 1

# In[29]:

kdf1 = ks.from_pandas(pdf1)

# In[30]:

kdf1

# Dropping any rows that have missing data:

# In[31]:

kdf1.dropna(how='any')

# Filling missing data:

# In[32]:

kdf1.fillna(value=5)

# ## Operations
#
# ### Stats
#
# Operations in general exclude missing data.
#
# Performing a descriptive statistic:

# In[33]:

kdf.mean()
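# Missing values are skipped here as well. As a small check sketch (not a cell from the original walkthrough), `kdf1` from the Missing Data section above has `NaN` entries in column 'E', yet a mean for that column is still produced:

kdf1.mean()  # the NaN entries in column 'E' are excluded from the computation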
# ### Spark Configurations
#
# Various configurations in PySpark can be applied internally in Koalas.
# For example, you can enable Arrow optimization to greatly speed up the internal pandas conversion. See the PySpark Usage Guide for Pandas with Apache Arrow.

# In[34]:

prev = spark.conf.get("spark.sql.execution.arrow.enabled")  # Save the current value so it can be restored later.
ks.set_option("compute.default_index_type", "distributed")  # Use the distributed default index to avoid overhead.
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

# In[35]:

spark.conf.set("spark.sql.execution.arrow.enabled", True)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')

# In[36]:

spark.conf.set("spark.sql.execution.arrow.enabled", False)
get_ipython().run_line_magic('timeit', 'ks.range(300000).to_pandas()')

# In[37]:

ks.reset_option("compute.default_index_type")
spark.conf.set("spark.sql.execution.arrow.enabled", prev)  # Set its previous value back.

# ## Grouping
#
# By “group by” we are referring to a process involving one or more of the following steps:
#
# - Splitting the data into groups based on some criteria
# - Applying a function to each group independently
# - Combining the results into a data structure

# In[38]:

kdf = ks.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

# In[39]:

kdf

# Grouping and then applying the [sum()](https://koalas.readthedocs.io/en/latest/reference/api/databricks.koalas.groupby.GroupBy.sum.html#databricks.koalas.groupby.GroupBy.sum) function to the resulting groups:

# In[40]:

kdf.groupby('A').sum()

# Grouping by multiple columns forms a hierarchical index, and again we can apply the sum function:

# In[41]:

kdf.groupby(['A', 'B']).sum()

# ## Plotting
#
# See the Plotting docs.

# In[42]:

get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt

# In[43]:

pser = pd.Series(np.random.randn(1000),
                 index=pd.date_range('1/1/2000', periods=1000))

# In[44]:

kser = ks.Series(pser)

# In[45]:

kser = kser.cummax()

# In[46]:

kser.plot()

# On a DataFrame, the `plot()` method is a convenience to plot all of the columns with labels:

# In[47]:

pdf = pd.DataFrame(np.random.randn(1000, 4), index=pser.index,
                   columns=['A', 'B', 'C', 'D'])

# In[48]:

kdf = ks.from_pandas(pdf)

# In[49]:

kdf = kdf.cummax()

# In[50]:

kdf.plot()

# ## Getting data in/out
#
# See the Input/Output docs.
#
# ### CSV
#
# CSV is straightforward and easy to use. See `to_csv()` to write a CSV file and `read_csv()` to read a CSV file.

# In[51]:

kdf.to_csv('foo.csv')
ks.read_csv('foo.csv').head(10)

# ### Parquet
#
# Parquet is an efficient and compact file format that is fast to read and write. See `to_parquet()` to write a Parquet file and `read_parquet()` to read a Parquet file.

# In[52]:

kdf.to_parquet('bar.parquet')
ks.read_parquet('bar.parquet').head(10)

# ### Spark IO
#
# In addition, Koalas fully supports Spark's various datasources, such as ORC and external datasources. See `to_spark_io()` to write to a specified datasource and `read_spark_io()` to read from it.

# In[53]:

kdf.to_spark_io('zoo.orc', format="orc")
ks.read_spark_io('zoo.orc', format="orc").head(10)
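# The writes above leave `foo.csv`, `bar.parquet`, and `zoo.orc` behind. An optional cleanup sketch (not part of the original walkthrough; it assumes the paths resolved to the local filesystem, whereas on Databricks they are written to DBFS instead):

import shutil

# Spark writes each output as a directory of part files, so remove them with rmtree.
for path in ['foo.csv', 'bar.parquet', 'zoo.orc']:
    shutil.rmtree(path, ignore_errors=True)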