#!/usr/bin/env python
# coding: utf-8

# In[1]:

import sys

sys.executable

# # Ickle
# *Data Analysis Library for Python*
#
# # This Jupyter Notebook serves as the documentation for Ickle.
#
# You can contribute to Ickle here: https://github.com/karishmashuklaa/ickle

# ## Table Of Contents
# 1. [Getting Started](#Getting-Started)
# 2. [DataFrame and Visual Representation](#DataFrame-and-Visual-Representation)
# 3. [Basic Properties](#Basic-Properties)
# 4. [Selection of Subsets](#Selection-of-Subsets)
# 5. [Basic and Aggregation Methods](#Basic-And-Aggregation-Methods)
# 6. [Non-Aggregation Methods](#Non-Aggregation-Methods)
# 7. [Other Methods](#Other-Methods)
# 8. [Arithmetic and Comparison Operators](#Arithmetic-And-Comparison-Operators)
# 9. [String-Only Methods](#String-Only-Methods)
# 10. [Pivot Table](#Pivot-Table)
# 11. [Read CSV](#Read-CSV)
# 12. [Read SQL](#Read-SQL)
# 13. [Read Excel](#Read-Excel)

# ## Getting Started
# ### Installation
#
# Ickle can be installed via pip.
#
# `pip install ickle`

# ### Import
#
# `import ickle as ick`

# ## DataFrame and Visual Representation
#
# *[Go to table of contents](#Table-Of-Contents)*

# ### DataFrame
# A `DataFrame` holds two-dimensional heterogeneous data. It accepts a
# dictionary as input, with NumPy arrays as values and strings as column names.
#
# Parameters:
# - `data`: A dictionary of strings mapped to NumPy arrays. The key will become
#   the column name.

# In[3]:

import numpy as np
import ickle as ick

# In[3]:

# Demo data used throughout the rest of the notebook.
name = np.array(['John', 'Sam', 'Tina', 'Josh', 'Jack', 'Jill'])
place = np.array(['Kolkata', 'Mumbai', 'Delhi', 'Mumbai', 'Mumbai', 'Mumbai'])
weight = np.array([57, 70, 54, 59, 62, 70])
married = np.array([True, False, True, False, False, False])

data = {
    'name': name,
    'place': place,
    'weight': weight,
    'married': married,
}
df = ick.DataFrame(data)

# ### Visual Representation
# `DataFrame` can be displayed in the following manner

# In[4]:

df

# We will use the above `DataFrame` throughout the notebook

# ## Basic Properties
# 1. [len](#len)
# 2. [columns](#columns)
# 3. [shape](#shape)
# 4. [values](#values)
# 5.
# [dtypes](#dtypes)
#
# *[Go to table of contents](#Table-Of-Contents)*

# ### `len`
# returns: the number of rows in the `DataFrame`

# In[5]:

len(df)

# ### `columns`
# returns: list of column names

# In[6]:

df.columns

# ### Modify existing column names

# In[7]:

df.columns = ['NAME', 'PLACE', 'WEIGHT', 'MARRIED']
df

# ### `shape`
# returns: two-item tuple of the number of rows and columns in the DataFrame

# In[8]:

df.shape

# ### `values`
# returns: a single 2D NumPy array of all the columns of data.

# In[9]:

df.values

# ### `dtypes`
# returns: a two-column `DataFrame` of column names in one column and their
# data type in the other

# In[10]:

df.dtypes

# ## Selection of Subsets
# 1. [Select a single column](#Select-a-single-column)
# 2. [Select multiple columns](#Select-multiple-columns)
# 3. [Boolean selection](#Boolean-selection)
# 4. [Simultaneous selection of row and column](#Simultaneous-selection-of-row-and-column)
# 5. [Add new / Overwrite existing columns](#Add-new-/-Overwrite-existing-columns)
#
# *[Go to table of contents](#Table-Of-Contents)*

# ### Select a single column
# by passing the name of the column as a string

# In[11]:

df['NAME']

# ### Select multiple columns
# by passing column names as a list of strings

# In[12]:

df[['NAME', 'PLACE']]

# ### Boolean Selection

# In[13]:

bool_sel = df['WEIGHT'] > 60
df[bool_sel]

# ### Simultaneous selection of row and column
# df[row, col]

# In[14]:

df[0, 2]

# ### Select columns as strings

# In[15]:

df[0, 'WEIGHT']

# ### Select rows as slices

# In[16]:

df[:1, 'WEIGHT']

# ### Select rows as booleans and lists

# In[17]:

bool_row = df['MARRIED']

# In[18]:

df[bool_row, 'WEIGHT']

# ### Add new / Overwrite existing columns

# In[19]:

df['AGE'] = np.array([21, 41, 22, 42, 32, 25])
df

# ## Basic And Aggregation Methods
# Basic Methods:
# 1. [head()](#head)
# 2. [tail()](#tail)
#
# Aggregation Methods:
# 1. [min()](#min)
# 2. [max()](#max)
# 3. [mean()](#mean)
# 4. [median()](#median)
# 5. [sum()](#sum)
# 6. [var()](#var)
# 7.
# [std()](#std)
# 8. [all()](#all)
# 9. [any()](#any)
# 10. [argmax()](#argmax)
# 11. [argmin()](#argmin)
#
# *[Go to table of contents](#Table-Of-Contents)*

# ## head
#
# **head(n)**
#
# returns: the first n rows. By default n=5

# In[20]:

df.head()

# ## tail
#
# **tail(n)**
# returns: the last n rows. By default n=5

# In[21]:

df.tail()

# ## Aggregation Methods
#
# **All aggregation methods are applied only column-wise and not row-wise.**
#
# 1. [min()](#min)
# 2. [max()](#max)
# 3. [median()](#median)
# 4. [mean()](#mean)
# 5. [sum()](#sum)
# 6. [var()](#var)
# 7. [std()](#std)
# 8. [all()](#all)
# 9. [any()](#any)
# 10. [argmax()](#argmax)
# 11. [argmin()](#argmin)

# ## min
#
# **min()**
#
# used to get the minimum value for each column

# In[22]:

df.min()

# ## max
#
# **max()**
#
# used to get the maximum value for each column

# In[23]:

df.max()

# ## median
#
# **median()**
#
# computes the median for each numeric column

# In[24]:

df.median()

# ## mean
#
# **mean()**
#
# computes the mean for each numeric column

# In[25]:

df.mean()

# ## sum
#
# **sum()**
#
# returns the sum of all values for each column

# In[26]:

df.sum()

# ## var
#
# **var()**
#
# computes the variance for each numeric column

# In[27]:

df.var()

# ## std
#
# **std()**
#
# computes the standard deviation for each numeric column

# In[28]:

df.std()

# ## all
#
# **all()**
#
# tests whether all array elements along the columns evaluate to True

# In[29]:

df.all()

# ## any
#
# **any()**
#
# tests whether any array element along the columns evaluates to True

# In[30]:

df.any()

# ## argmax
#
# **argmax()**
#
# returns the indices of the maximum values

# In[31]:

df.argmax()

# ## argmin
#
# **argmin()**
#
# returns the indices of the minimum values

# In[32]:

df.argmin()

# ## Other Methods
# 1. [isna()](#isna)
# 2. [count()](#count)
# 3. [unique()](#unique)
# 4. [nunique()](#nunique)
# 5. [value_counts()](#value_counts)
# 6. [rename()](#rename)
# 7. [drop()](#drop)
# 8. [diff()](#diff)
# 9. [pct_change()](#pct_change)
# 10.
# [sort_values()](#sort_values)
# 11. [sample()](#sample)

# ### isna
#
# **isna()**
#
# Determines whether each value in the DataFrame is missing or not

# In[33]:

df.isna()

# ### count
#
# **count()**
#
# Counts the number of non-missing values per column

# In[107]:

df.count()

# ### unique
#
# **unique()**
#
# Finds the unique values of each column

# In[35]:

dfs = df.unique()
dfs[3]

# ### nunique
#
# **nunique()**
#
# Finds the **number** of unique values in each column.
#
# Used in the same way as `unique()`

# In[36]:

df.nunique()

# ### value_counts
#
# **value_counts(normalize=False)**
#
# Returns the frequency of each unique value for each column.
#
# Parameters
#
# `normalize`: bool
#
# If True, returns the relative frequencies (percent)

# In[37]:

dfs = df.value_counts()
dfs[1]

# ### rename
#
# **rename(columns)**
#
# Renames columns in the DataFrame
#
# Parameters
#
# `columns`: dict
#
# A dictionary mapping the old column name to the new column name

# In[38]:

df.rename({'WEIGHT': 'WEIGHT (kg)'})

# ### drop
#
# **drop(columns)**
#
# Drops one or more columns from a DataFrame
#
# Parameters
#
# `columns`: str or list of strings

# In[39]:

df.drop('AGE')

# ### diff
#
# **diff(n=1)**
#
# Take the difference between the current value and the nth value above it
#
# Parameters
#
# `n`: int

# In[111]:

df.diff(n=2)

# ### pct_change
#
# **pct_change(n=1)**
#
# Take the percentage difference between the current value and the nth value
# above it
#
# Parameters
#
# `n`: int

# In[104]:

df.pct_change(n=1)

# ### sort_values
#
# **sort_values(by, asc=True)**
#
# Sort the DataFrame by one or more values
#
# Parameters
#
# `by`: str or list of column names
#
# `asc`: boolean of sorting order

# In[115]:

df.sort_values('AGE', asc=False)

# ### sample
#
# **sample(n=None, frac=None, replace=False, seed=None)**
#
# Randomly samples rows of the DataFrame
#
# Parameters
#
# `n`: int
#
# number of rows to return
#
# `frac`: float
#
# Proportion of the data to sample
#
# `replace`: bool
#
# Whether or not to sample with replacement
#
# `seed`: int
#
# Seed the random number generator

# In[116]:

df.sample(n=4, frac=0.5, replace=True)

# ## Non Aggregation Methods
# 1. [abs()](#abs)
# 2. [cummin()](#cummin)
# 3. [cummax()](#cummax)
# 4. [cumsum()](#cumsum)
# 5. [clip()](#clip)
# 6. [round()](#round)
# 7. [copy()](#copy)
#
# *[Go to table of contents](#Table-Of-Contents)*

# ### abs
#
# **abs()**
#
# Takes the absolute value of each value in the DataFrame

# In[40]:

df.abs()

# ### cummin
#
# **cummin()**
#
# Finds the cumulative minimum by column

# In[41]:

df.cummin()

# ### cummax
#
# **cummax()**
#
# Finds the cumulative maximum by column

# In[42]:

df.cummax()

# ### cumsum
#
# **cumsum()**
#
# Finds the cumulative sum by column

# In[43]:

df.cumsum()

# ### clip
#
# **clip(lower=None, upper=None)**
#
# All values less than `lower` will be set to `lower`
#
# All values greater than `upper` will be set to `upper`
#
# Parameters
#
# `lower`: number or None
#
# `upper`: number or None

# In[44]:

df.clip(lower=55, upper=60)

# ### round
#
# **round(n)**
#
# Rounds values to the nearest `n` decimals

# In[45]:

df.round(n=1)

# ### copy
#
# **copy()**
#
# Copies the DataFrame

# In[46]:

df.copy()

# ## Arithmetic And Comparison Operators
# 1. [Addition](#Addition)
# 2. [Subtraction](#Subtraction)
# 3. [Multiplication](#Multiplication)
# 4. [Division](#Division)
# 5. [Floor Division](#Floor-Division)
# 6. [Power](#Power)
# 7. [Greater than](#Greater-than)
# 8. [Less than](#Less-than)
# 9. [Greater than equal to](#Greater-than-equal)
# 10. [Lesser than equal to](#Lesser-than-equal)
# 11. [Not Equal](#Not-Equal)
# 12.
# [Equal](#Equal)
#
# *Arithmetic and Comparison Operators only work with numerical columns*
#
# *[Go to table of contents](#Table-Of-Contents)*

# In[110]:

# A single numeric column used for all the operator demos below.
df_op = df['WEIGHT']
df_op

# ### Addition

# In[50]:

df_op + 2

# In[51]:

2 + df_op

# ### Subtraction

# In[52]:

df_op - 2

# In[53]:

2 - df_op

# ### Multiplication

# In[54]:

df_op * 2

# In[55]:

2 * df_op

# ### Division

# In[56]:

df_op / 2

# In[57]:

2 / df_op

# ### Floor Division

# In[58]:

df_op // 2

# In[59]:

2 // df_op

# ### Power

# In[60]:

df_op ** 3

# In[61]:

3 ** df_op

# ### Greater than

# In[62]:

df_op > 50

# ### Less than

# In[63]:

df_op < 55

# ### Greater than equal

# In[64]:

df_op >= 75

# ### Lesser than equal

# In[65]:

df_op <= 55

# ### Not Equal

# In[66]:

df_op != 55

# ### Equal

# In[67]:

df_op == 70

# ## String Only Methods
# All the strings behave in the same manner as built-in string functions in
# Python. These methods can be used only with `columns`
#
# 1. [capitalize()](#capitalize)
# 2. [center()](#center)
# 3. [count()](#count)
# 4. [endswith()](#endswith)
# 5. [startswith()](#startswith)
# 6. [find()](#find)
# 7. [len()](#len)
# 8. [get()](#get)
# 9. [index()](#index)
# 10. [isalnum()](#isalnum)
# 11. [isalpha()](#isalpha)
# 12. [isdecimal()](#isdecimal)
# 13. [islower()](#islower)
# 14. [isnumeric()](#isnumeric)
# 15. [isspace()](#isspace)
# 16. [istitle()](#istitle)
# 17. [isupper()](#isupper)
# 18. [lstrip()](#lstrip)
# 19. [rstrip()](#rstrip)
# 20. [strip()](#strip)
# 21. [replace()](#replace)
# 22. [swapcase()](#swapcase)
# 23. [title()](#title)
# 24. [lower()](#lower)
# 25. [upper()](#upper)
# 26. [zfill()](#zfill)
# 27.
# [encode()](#encode)
#
# *[Go to table of contents](#Table-Of-Contents)*

# ### capitalize
#
# **capitalize(col)**

# In[68]:

df.str.capitalize('NAME')

# ### center
#
# **center(col, width, fillchar=None)**

# In[69]:

df.str.center('NAME', 10, 'a')

# ### count
#
# **count(col, sub, start=None, stop=None)**

# In[70]:

df.str.count('PLACE', 'Mumbai')

# ### endswith
#
# **endswith(col, suffix, start=None, stop=None)**

# In[71]:

df.str.endswith('NAME', 'n')

# ### startswith
#
# **startswith(col, suffix, start=None, stop=None)**

# In[72]:

df.str.startswith('NAME', 'J')

# ### find
#
# **find(col, sub, start=None, stop=None)**

# In[73]:

df.str.find('NAME', 'Tina')

# ### len
#
# **len(col)**

# In[74]:

df.str.len('NAME')

# ### get
#
# **get(col, item)**

# In[75]:

df.str.get('NAME', 0)

# ### index
#
# **index(col, sub, start=None, stop=None)**

# In[76]:

df.str.index('NAME', '')

# ### isalnum
#
# **isalnum(col)**

# In[77]:

df.str.isalnum('NAME')

# ### isalpha
#
# **isalpha(col)**

# In[78]:

df.str.isalpha('NAME')

# ### isdecimal
#
# **isdecimal(col)**

# In[79]:

df.str.isdecimal('NAME')

# ### isnumeric
#
# **isnumeric(col)**

# In[80]:

df.str.isnumeric('NAME')

# ### isspace
#
# **isspace(col)**

# In[81]:

df.str.isspace('NAME')

# ### istitle
#
# **istitle(col)**

# In[82]:

df.str.istitle('NAME')

# ### isupper
#
# **isupper(col)**

# In[84]:

df.str.isupper('NAME')

# ### islower
#
# **islower(col)**

# In[83]:

df.str.islower('NAME')

# ### lstrip
#
# **lstrip(col, chars)**

# In[85]:

df.str.lstrip('NAME', 'o')

# ### rstrip
#
# **rstrip(col, chars)**

# In[86]:

df.str.rstrip('NAME', 'o')

# ### strip
#
# **strip(col, chars)**

# In[87]:

df.str.strip('NAME', 'o')

# ### replace
#
# **replace(col, old, new, count=None)**

# In[88]:

df.str.replace('NAME', 'John', 'Cena')

# ### swapcase
#
# **swapcase(col)**

# In[89]:

df.str.swapcase('NAME')

# ### title
#
# **title(col)**

# In[90]:

df.str.title('NAME')

# ### lower
#
# **lower(col)**

# In[91]:

df.str.lower('NAME')

# ### upper
#
# **upper(col)**
#
# In[92]:

df.str.upper('NAME')

# ### zfill
#
# **zfill(col, width)**

# In[93]:

df.str.zfill('NAME', 10)

# ### encode
#
# **encode(col, encoding='utf-8', errors='strict')**

# In[94]:

df.str.encode('NAME')

# ## Pivot Table
# Creates a pivot table from one or two 'grouping' columns
#
# Parameters
#
# `rows`: str of column name to group by (Optional)
#
# `columns`: str of column name to group by (Optional)
#
# `values`: str of column name to aggregate (Required)
#
# `aggfunc`: str of aggregation function
#
# *[Go to table of contents](#Table-Of-Contents)*

# In[95]:

df.pivot_table(rows='NAME', columns='PLACE', values='WEIGHT', aggfunc='mean')

# ## Read CSV
# Reads a CSV file into a dataframe
#
# Parameters
#
# `filelocation`: str of the file location
#
# `header`: int of the number of lines to skip in case of multi-header files,
# None in case of header-less files (Optional)
#
# **read_csv(file, header)**
#
# *[Go to table of contents](#Table-Of-Contents)*

# In[96]:

data = ick.read_csv('./dataset/employee.csv')
data.head()

# In[10]:

data = ick.read_csv('./dataset/employee.csv', header=None)
data.head()

# ## Read SQL
# Reads the data from an SQL query into a dataframe
#
# Parameters
#
# `sql`: str of the sql statement to be executed
#
# `drivername`: str of driver engine of the database
#
# `username`: str of username to connect to the database
#
# `password`: str of password to connect to the database
#
# `host`: str of host to connect to the database
#
# `port`: int of port to connect to the database
#
# `database`: str of database name
#
# **read_sql(sql, drivername, username, password, host, port, database)**
#
# *[Go to table of contents](#Table-Of-Contents)*

# In[11]:

# Example connection settings for a local PostgreSQL instance.
drivername = "postgresql"
username = "postgres"
password = "test123"
host = "localhost"
port = "5432"
database = "spotify_trends"
sql = "SELECT id, firstname, lastname, email FROM authors"

ick.read_sql(sql, drivername, username, password, host, port, database)

# ## Read Excel
# Reads the data from a simple Excel file into a dataframe
#
# Parameters
#
# `file_path`: str of the path to the Excel file that you want to read
# `sheet_name`: str of sheet name
#
# Supported file extensions - xlsx, xlsm, xltx, xltm
#
# **read_excel(file_path, sheet_name)**
#
# *[Go to table of contents](#Table-Of-Contents)*

# In[4]:

ick.read_excel(file_path="./dataset/Book1.xlsx", sheet_name="Sheet1")

# In[ ]: