#!/usr/bin/env python
# coding: utf-8
# In[1]:
import sys
# Notebook artifact: evaluating sys.executable as a bare expression displays the
# running interpreter's path in the cell output (it has no effect as a script).
sys.executable
# # Ickle
# *Data Analysis Library for Python*
#
#
# This Jupyter Notebook serves as the documentation for Ickle.
#
# You can contribute to Ickle here: https://github.com/karishmashuklaa/ickle
# ## Table Of Contents
# 1. [Getting Started](#Getting-Started)
# 2. [DataFrame and Visual Representation](#DataFrame-and-Visual-Representation)
# 3. [Basic Properties](#Basic-Properties)
# 4. [Selection of Subsets](#Selection-of-Subsets)
# 5. [Basic and Aggregation Methods](#Basic-And-Aggregation-Methods)
# 6. [Non-Aggregation Methods](#Non-Aggregation-Methods)
# 7. [Other Methods](#Other-Methods)
# 8. [Arithmetic and Comparison Operators](#Arithmetic-And-Comparison-Operators)
# 9. [String-Only Methods](#String-Only-Methods)
# 10. [Pivot Table](#Pivot-Table)
# 11. [Read CSV](#Read-CSV)
# 12. [Read SQL](#Read-SQL)
# 13. [Read Excel](#Read-Excel)
# ## Getting Started
# ### Installation
#
# Ickle can be installed via pip.
#
# `pip install ickle`
# ### Import
#
# `import ickle as ick`
# ## DataFrame and Visual Representation
#
# *[Go to table of contents](#Table-Of-Contents)*
# ### DataFrame
# A `DataFrame` holds two-dimensional heterogeneous data. It accepts a dictionary as input, with NumPy arrays as values and strings as column names.
#
# Parameters:
# - `data`: A dictionary of strings mapped to Numpy arrays. The key will become the column name.
# In[3]:
import numpy as np
import ickle as ick
# In[3]:
# Sample dataset used throughout the notebook: six people with their city,
# weight (kg) and marital status, one NumPy array per column.
name = np.array(['John', 'Sam', 'Tina', 'Josh', 'Jack', 'Jill'])
place = np.array(['Kolkata', 'Mumbai', 'Delhi', 'Mumbai', 'Mumbai', 'Mumbai'])
weight = np.array([57, 70, 54, 59, 62, 70])
married = np.array([True, False, True, False, False, False])
# Column name -> values mapping; the keys become the DataFrame's column names.
data = dict(name=name, place=place, weight=weight, married=married)
df = ick.DataFrame(data)
# ### Visual Representation
# `DataFrame` can be displayed in the following manner
# In[4]:
df
# We will use the above `DataFrame` throughout the notebook
# ## Basic Properties
# 1. [len](#len)
# 2. [columns](#columns)
# 3. [shape](#shape)
# 4. [values](#values)
# 5. [dtypes](#dtypes)
#
# *[Go to table of contents](#Table-Of-Contents)*
# ### `len`
# returns: the number of rows in the `DataFrame`
# In[5]:
len(df)
# ### `columns`
# returns: list of column names
# In[6]:
df.columns
# ### Modify existing column names
# In[7]:
df.columns = ['NAME', 'PLACE', 'WEIGHT', 'MARRIED']
df
# ### `shape`
# returns: two-item tuple of number of rows and columns in the DataFrame
# In[8]:
df.shape
# ### `values`
# returns: a single 2D NumPy array of all the columns of data.
# In[9]:
df.values
# ### `dtypes`
# returns: a two-column `DataFrame` of column names in one column and their data type in the other
# In[10]:
df.dtypes
# ## Selection of Subsets
# 1. [Select a single column](#Select-a-single-column)
# 2. [Select multiple columns](#Select-multiple-columns)
# 3. [Boolean selection](#Boolean-Selection)
# 4. [Simultaneous selection of row and column](#Simultaneous-selection-of-row-and-column)
# 5. [Add new / Overwrite existing columns](#Add-new-/-Overwrite-existing-columns)
#
# *[Go to table of contents](#Table-Of-Contents)*
# ### Select a single column
# by passing the name of column as a string
# In[11]:
df['NAME']
# ### Select multiple columns
# by passing column names as a list of strings
# In[12]:
df[['NAME', 'PLACE']]
# ### Boolean Selection
# In[13]:
bool_sel = df['WEIGHT'] > 60
df[bool_sel]
# ### Simultaneous selection of row and column
# df[row, col]
# In[14]:
df[0,2]
# ### Select columns as strings
# In[15]:
df[0, 'WEIGHT']
# ### Select rows as slices
# In[16]:
df[:1, 'WEIGHT']
# ### Select rows as booleans and lists
# In[17]:
bool_row = df['MARRIED']
# In[18]:
df[bool_row, 'WEIGHT']
# ### Add new / Overwrite existing columns
# In[19]:
df['AGE'] = np.array([21, 41, 22, 42, 32, 25])
df
# ## Basic And Aggregation Methods
# Basic Methods:
# 1. [head()](#head)
# 2. [tail()](#tail)
#
# Aggregation Methods:
# 1. [min()](#min)
# 2. [max()](#max)
# 3. [mean()](#mean)
# 4. [median()](#median)
# 5. [sum()](#sum)
# 6. [var()](#var)
# 7. [std()](#std)
# 8. [all()](#all)
# 9. [any()](#any)
# 10. [argmax()](#argmax)
# 11. [argmin()](#argmin)
#
#
# *[Go to table of contents](#Table-Of-Contents)*
# ## head
#
# **head(n)**
#
# returns: the first n rows. By default n=5
# In[20]:
df.head()
# ## tail
#
# **tail(n)**
#
# returns: the last n rows. By default n=5
# In[21]:
df.tail()
# ## Aggregation Methods
#
# **All aggregation methods are applied only column-wise and not row-wise.**
#
# 1. [min()](#min)
# 2. [max()](#max)
# 3. [median()](#median)
# 4. [mean()](#mean)
# 5. [sum()](#sum)
# 6. [var()](#var)
# 7. [std()](#std)
# 8. [all()](#all)
# 9. [any()](#any)
# 10. [argmax()](#argmax)
# 11. [argmin()](#argmin)
# ## min
#
# **min()**
#
# used to get a minimum value for each column
# In[22]:
df.min()
# ## max
#
# **max()**
#
# used to get a maximum value for each column
# In[23]:
df.max()
# ## median
#
# **median()**
#
# computes median for each numeric column
# In[24]:
df.median()
# ## mean
#
# **mean()**
#
# computes mean for each numeric column
# In[25]:
df.mean()
# ## sum
#
# **sum()**
#
# returns the sum of all values for each column
# In[26]:
df.sum()
# ## var
#
# **var()**
#
# computes variance for each numeric column
# In[27]:
df.var()
# ## std
#
# **std()**
#
# computes standard deviation for each numeric column
# In[28]:
df.std()
# ## all
#
# **all()**
#
# it tests whether all array elements along the columns evaluate to True
# In[29]:
df.all()
# ## any
#
# **any()**
#
# it tests whether any array element along the columns evaluates to True
# In[30]:
df.any()
# ## argmax
#
# **argmax()**
#
# returns the indices of the maximum values
# In[31]:
df.argmax()
# ## argmin
#
# **argmin()**
#
# returns the indices of the minimum values
# In[32]:
df.argmin()
# ## Other Methods
# 1. [isna()](#isna)
# 2. [count()](#count)
# 3. [unique()](#unique)
# 4. [nunique()](#nunique)
# 5. [value_counts()](#value_counts)
# 6. [rename()](#rename)
# 7. [drop()](#drop)
# 8. [diff()](#diff)
# 9. [pct_change()](#pct_change)
# 10. [sort_values()](#sort_values)
# 11. [sample()](#sample)
# ### isna
#
# **isna()**
#
# Determines whether each value in the DataFrame is missing or not
# In[33]:
df.isna()
# ### count
#
# **count()**
#
# Counts the number of non-missing values per column
# In[107]:
df.count()
# ### unique
#
# **unique()**
#
# Finds the unique values of each column
# In[35]:
dfs = df.unique()
dfs[3]
# ### nunique
#
# **nunique()**
#
# Finds the **number** of unique values in each column.
#
# Works like `unique()`, but returns the count of unique values per column rather than the values themselves
# In[36]:
df.nunique()
# ### value_counts
#
# **value_counts(normalize=False)**
#
# Returns the frequency of each unique value for each column.
#
# Parameters
#
# `normalize`: bool
#
# If True, returns the relative frequencies (as percentages)
# In[37]:
dfs = df.value_counts()
dfs[1]
# ### rename
#
# **rename(columns)**
#
# Renames columns in the DataFrame
#
# Parameters
#
# `columns`: dict
#
# A dictionary mapping the old column name to the new column name
#
#
# In[38]:
df.rename({'WEIGHT': 'WEIGHT (kg)'})
# ### drop
#
# **drop(columns)**
#
# Drops one or more columns from a DataFrame
#
# Parameters
#
# `columns`: str or list of strings
# In[39]:
df.drop('AGE')
# ### diff
#
# **diff(n=1)**
#
# Take the difference between the current value and the nth value above it
#
# Parameters
#
# `n`: int
# In[111]:
df.diff(n=2)
# ### pct_change
#
# **pct_change(n=1)**
#
# Take the percentage difference between the current value and the nth value above it
#
# Parameters
#
# `n`: int
# In[104]:
df.pct_change(n=1)
# ### sort_values
#
# **sort_values(by, asc=True)**
#
# Sort the DataFrame by one or more values
#
# Parameters
#
# `by`: str or list of column names
#
# `asc`: boolean of sorting order
#
# In[115]:
df.sort_values('AGE', asc=False)
# ### sample
#
# **sample(n=None, frac=None, replace=False, seed=None)**
#
# Randomly samples rows of the DataFrame
#
# Parameters
#
# `n`: int
#
# number of rows to return
#
# `frac`: float
#
# Proportion of the data to sample
#
# `replace`: bool
#
# Whether or not to sample with replacement
#
# `seed`: int
#
# Seed the random number generator
# In[116]:
df.sample(n=4, frac=0.5, replace=True)
# ## Non Aggregation Methods
# 1. [abs()](#abs)
# 2. [cummin()](#cummin)
# 3. [cummax()](#cummax)
# 4. [cumsum()](#cumsum)
# 5. [clip()](#clip)
# 6. [round()](#round)
# 7. [copy()](#copy)
#
# *[Go to table of contents](#Table-Of-Contents)*
# ### abs
#
# **abs()**
#
# Takes the absolute value of each value in the DataFrame
# In[40]:
df.abs()
# ### cummin
#
# **cummin()**
#
# Finds cumulative minimum by column
# In[41]:
df.cummin()
# ### cummax
#
# **cummax()**
#
# Finds cumulative maximum by column
# In[42]:
df.cummax()
# ### cumsum
#
# **cumsum()**
#
# Finds cumulative sum by column
# In[43]:
df.cumsum()
# ### clip
#
# **clip(lower=None, upper=None)**
#
# All values less than `lower` will be set to `lower`
#
# All values greater than `upper` will be set to `upper`
#
# Parameters
#
# `lower`: number or None
#
# `upper`: number or None
# In[44]:
df.clip(lower=55, upper=60)
# ### round
#
# **round(n)**
#
# Rounds values to `n` decimal places
# In[45]:
df.round(n=1)
# ### copy
#
# **copy()**
#
# Copies the DataFrame
# In[46]:
df.copy()
# ## Arithmetic And Comparison Operators
# 1. [Addition](#Addition)
# 2. [Subtraction](#Subtraction)
# 3. [Multiplication](#Multiplication)
# 4. [Division](#Division)
# 5. [Floor Division](#Floor-Division)
# 6. [Power](#Power)
# 7. [Greater than](#Greater-than)
# 8. [Less than](#Less-than)
# 9. [Greater than equal to](#Greater-than-equal)
# 10. [Lesser than equal to](#Lesser-than-equal)
# 11. [Not Equal](#Not-Equal)
# 12. [Equal](#Equal)
#
# *Arithmetic and Comparison Operators only work with numerical columns*
#
# *[Go to table of contents](#Table-Of-Contents)*
# In[110]:
# Single numeric-column DataFrame used to demonstrate the operators below
# (operators are shown with the operand on either side where applicable).
df_op = df['WEIGHT']
df_op
# ### Addition
# In[50]:
df_op + 2
# In[51]:
2 + df_op
# ### Subtraction
# In[52]:
df_op - 2
# In[53]:
2 - df_op
# ### Multiplication
# In[54]:
df_op * 2
# In[55]:
2 * df_op
# ### Division
# In[56]:
df_op / 2
# In[57]:
2 / df_op
# ### Floor Division
# In[58]:
df_op // 2
# In[59]:
2 // df_op
# ### Power
# In[60]:
df_op ** 3
# In[61]:
3 ** df_op
# ### Greater than
# In[62]:
df_op > 50
# ### Less than
# In[63]:
df_op < 55
# ### Greater than equal
# In[64]:
df_op >= 75
# ### Lesser than equal
# In[65]:
df_op <= 55
# ### Not Equal
# In[66]:
df_op != 55
# ### Equal
# In[67]:
df_op == 70
# ## String Only Methods
# All the string methods behave in the same manner as Python's built-in string methods.
# Each method operates on a single string column, passed by name as the first argument
#
# 1. [capitalize()](#capitalize)
# 2. [center()](#center)
# 3. [count()](#count)
# 4. [endswith()](#endswith)
# 5. [startswith()](#startswith)
# 6. [find()](#find)
# 7. [len()](#len)
# 8. [get()](#get)
# 9. [index()](#index)
# 10. [isalnum()](#isalnum)
# 11. [isalpha()](#isalpha)
# 12. [isdecimal()](#isdecimal)
# 13. [islower()](#islower)
# 14. [isnumeric()](#isnumeric)
# 15. [isspace()](#isspace)
# 16. [istitle()](#istitle)
# 17. [isupper()](#isupper)
# 18. [lstrip()](#lstrip)
# 19. [rstrip()](#rstrip)
# 20. [strip()](#strip)
# 21. [replace()](#replace)
# 22. [swapcase()](#swapcase)
# 23. [title()](#title)
# 24. [lower()](#lower)
# 25. [upper()](#upper)
# 26. [zfill()](#zfill)
# 27. [encode()](#encode)
#
#
# *[Go to table of contents](#Table-Of-Contents)*
# ### capitalize
#
# **capitalize(col)**
# In[68]:
df.str.capitalize('NAME')
# ### center
#
# **center(col, width, fillchar=None)**
# In[69]:
df.str.center('NAME', 10, 'a')
# ### count
#
# **count(col, sub, start=None, stop=None)**
# In[70]:
df.str.count('PLACE', 'Mumbai')
# ### endswith
#
# **endswith(col, suffix, start=None, stop=None)**
# In[71]:
df.str.endswith('NAME', 'n')
# ### startswith
#
# **startswith(col, suffix, start=None, stop=None)**
# In[72]:
df.str.startswith('NAME', 'J')
# ### find
#
# **find(col, sub, start=None, stop=None)**
# In[73]:
df.str.find('NAME', 'Tina')
# ### len
#
# **len(col)**
# In[74]:
df.str.len('NAME')
# ### get
#
# **get(col, item)**
# In[75]:
df.str.get('NAME', 0)
# ### index
#
# **index(col, sub, start=None, stop=None)**
# In[76]:
# NOTE(review): searching for the empty substring returns 0 for every row —
# presumably a placeholder example; a non-empty substring would be more illustrative.
df.str.index('NAME', '')
# ### isalnum
#
# **isalnum(col)**
# In[77]:
df.str.isalnum('NAME')
# ### isalpha
#
# **isalpha(col)**
# In[78]:
df.str.isalpha('NAME')
# ### isdecimal
#
# **isdecimal(col)**
# In[79]:
df.str.isdecimal('NAME')
# ### isnumeric
#
# **isnumeric(col)**
# In[80]:
df.str.isnumeric('NAME')
# ### isspace
#
# **isspace(col)**
# In[81]:
df.str.isspace('NAME')
# ### istitle
#
# **istitle(col)**
# In[82]:
df.str.istitle('NAME')
# ### isupper
#
# **isupper(col)**
# In[84]:
df.str.isupper('NAME')
# ### islower
#
# **islower(col)**
# In[83]:
df.str.islower('NAME')
# ### lstrip
#
# **lstrip(col, chars)**
# In[85]:
df.str.lstrip('NAME', 'o')
# ### rstrip
#
# **rstrip(col, chars)**
# In[86]:
df.str.rstrip('NAME', 'o')
# ### strip
#
# **strip(col, chars)**
# In[87]:
df.str.strip('NAME', 'o')
# ### replace
#
# **replace(col, old, new, count=None)**
# In[88]:
df.str.replace('NAME', 'John', 'Cena')
# ### swapcase
#
# **swapcase(col)**
# In[89]:
df.str.swapcase('NAME')
# ### title
#
# **title(col)**
# In[90]:
df.str.title('NAME')
# ### lower
#
# **lower(col)**
# In[91]:
df.str.lower('NAME')
# ### upper
#
# **upper(col)**
# In[92]:
df.str.upper('NAME')
# ### zfill
#
# **zfill(col, width)**
# In[93]:
df.str.zfill('NAME', 10)
# ### encode
#
# **encode(col, encoding='utf-8', errors='strict')**
# In[94]:
df.str.encode('NAME')
# ## Pivot Table
# Creates a pivot table from one or two 'grouping' columns
#
# Parameters
#
# `rows`: str of column name to group by (Optional)
#
# `columns`: str of column name to group by (Optional)
#
# `values`: str of column name to aggregate (Required)
#
# `aggfunc`: str of aggregation function
#
# *[Go to table of contents](#Table-Of-Contents)*
# In[95]:
# Mean WEIGHT grouped by NAME (rows) and PLACE (columns).
df.pivot_table(rows='NAME', columns='PLACE', values='WEIGHT', aggfunc='mean')
# ## Read CSV
# Reads a CSV file into a dataframe
#
# Parameters
#
# `file`: str of the file location
#
# `header`: int of the number of lines to skip in case of multi-header files, None in case of header-less files (Optional)
#
# **read_csv(file, header)**
#
# *[Go to table of contents](#Table-Of-Contents)*
# In[96]:
# Read the sample CSV with its header row, then again treating it as header-less.
data = ick.read_csv('./dataset/employee.csv')
data.head()
# In[10]:
data = ick.read_csv('./dataset/employee.csv', header=None)
data.head()
# ## Read SQL
# Reads the data from SQL query into a dataframe
#
# Parameters
#
# `sql`: str of the sql statement to be executed
#
# `drivername`: str of driver engine of the database
#
# `username`: str of username to connect to the database
#
# `password`: str of password to connect to the database
#
# `host`: str of host to connect to the database
#
# `port`: int of port to connect to the database
#
# `database`: str of database name
#
# **read_sql(sql, drivername, username, password, host, port, database)**
#
# *[Go to table of contents](#Table-Of-Contents)*
# In[11]:
# Connection parameters for the demo PostgreSQL database.
drivername = "postgresql"
username = "postgres"
password = "test123"  # demo credentials only — never hard-code real secrets
host = "localhost"
port = "5432"
database = "spotify_trends"
# Query to load into a DataFrame.
sql = "SELECT id, firstname, lastname, email FROM authors"
ick.read_sql(sql, drivername, username, password, host, port, database)
# ## Read Excel
# Reads the data from a simple Excel file into a dataframe
#
# Parameters
#
# `file_path`: str of the path to the Excel file that you want to read
# `sheet_name`: str of sheet name
#
# Supported file extensions - xlsx, xlsm, xltx, xltm
#
# **read_excel(file_path, sheet_name)**
#
# *[Go to table of contents](#Table-Of-Contents)*
# In[4]:
# Read sheet "Sheet1" from the sample workbook into a DataFrame.
ick.read_excel(file_path="./dataset/Book1.xlsx", sheet_name="Sheet1")
# In[ ]: