#!/usr/bin/env python
# coding: utf-8

# # Profiler performance
#
# Benchmark of the Optimus profiler on part of the Instacart dataset, available at
# https://www.instacart.com/datasets/grocery-shopping-2017
#
# Specifically `order_products__prior.csv`, a CSV file with 4 columns and
# 33.2 million rows.
#
# Recorded results (Windows 10, Instacart data):
#
# * Before 2.2.10: 355.58 seconds to process the whole data set.
# * After 2.2.10: 78 seconds, with `infer=False`.
#
# This file is a notebook export: the `# In[n]:` markers are the original cell
# execution counters (they are not sequential because cells were re-run out of
# order) and the `get_ipython()` calls require an IPython/Jupyter session.

# In[1]:

# Reload edited modules automatically so changes to the local Optimus checkout
# are picked up without restarting the kernel.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:

import sys

# Make the parent directory importable (the `optimus` import below is resolved
# after this path is added).
sys.path.append("..")


# In[3]:

# Create optimus
from optimus import Optimus

op = Optimus(master="local[*]", app_name="optimus", verbose=True, checkpoint=True)


# ### Benchmark

# In[4]:

# Machine-specific absolute path to the benchmark file; adjust it to wherever
# order_products__prior.csv lives on the machine running the benchmark.
DATA_PATH = "C:\\Users\\argenisleon\\Desktop\\order_products__prior.csv"

df = op.load.csv(DATA_PATH)


# In[5]:

df.table()


# In[7]:

# Baseline: time a plain group-by/count/sort on the same column.
get_ipython().run_cell_magic('time', '', 'df.groupBy("order_id").count().sort("count",ascending=False).show()\n')


# In[3]:

# Time the Optimus column frequency helper on the same column.
get_ipython().run_cell_magic('time', '', 'df.cols.frequency("order_id")\n')


# In[7]:

# Time the profiler itself with inference disabled.
get_ipython().run_cell_magic('time', '', 'op.profiler.to_json(df, "order_id", infer=False, relative_error=1)\n')


# In[19]:

# Small 10-row sample used by the profiler cells below.
a = df.limit(10)
# 1:46 2:25
# NOTE(review): the two values above were bare text in the original cell (not
# valid Python); presumably hand-recorded timings - confirm their meaning.


# In[13]:

get_ipython().run_cell_magic('time', '', 'df.cols.frequency("order_id")\n')


# In[24]:

from optimus import Profiler

p = Profiler()
p.run(a, "order_id")


# In[22]:

op.profiler.run(a, "order_id", infer=True, relative_error=1)


# In[1]:

# Bare expression kept from the notebook cell: it only builds the query and,
# outside a notebook, its result is discarded.
df.groupBy("order_id").count().sort("count", ascending=False)


# In[ ]: