from dask.distributed import Client, progress
# Connect to the dask.distributed scheduler listening at this address.
# NOTE(review): the address is hard-coded to one specific cluster.
e = Client('10.200.30.241:8786')
# Restart the cluster before doing any work — presumably to clear state
# left over from earlier sessions; confirm this is intended on a shared cluster.
e.restart()
# Bare expression: displays the client's repr when run as a notebook cell.
e
import dask.dataframe as dd
import pandas as pd
# Read a single file with plain pandas only to capture its column dtypes;
# the resulting mapping is passed to dask below instead of letting dask infer
# dtypes itself. NOTE(review): assumes the 198*.csv files share the 1990
# file's columns — confirm.
df = pd.read_csv('/data/jcrist/airline/1990.csv')
dtypes = df.dtypes.to_dict()
# Rebind `df` to a dask dataframe over every file matching the glob,
# using the dtypes captured above.
df = dd.read_csv('/data/jcrist/airline/198*.csv', dtype=dtypes)
# Hand the dataframe to the cluster via the client and rebind `df` to the
# persisted collection, so later queries reuse it rather than re-reading CSVs.
df = e.persist(df)
# Show a progress display for the persist work running on the cluster.
progress(df)
# The next three bare expressions are notebook display cells: the dataframe
# repr, its column dtypes, and its first rows.
df
df.dtypes
df.head()
# IPython line magic (not valid plain Python): times a full row count.
%time len(df)
# Build (without executing) the query: mean DepDelay grouped by Origin,
# keeping the 10 largest group means.
expr = df.DepDelay.groupby(df.Origin).mean().nlargest(10)
# Bare expression: displays the still-lazy dask object.
expr
# Execute the query and return the concrete result.
expr.compute()
# Rebind `expr` to a new lazy query: the maximum DepDelay among rows whose
# Origin equals 'EWR'.
expr = df.DepDelay[df.Origin == 'EWR'].max()
# Bare expression: displays the still-lazy dask object.
expr
# Execute the query and return the concrete result.
expr.compute()