#!/usr/bin/env python
# coding: utf-8
# ![Banner](../media/banner2.png)
#
# ---
# # Workshop 2.3: Advanced Pandas
#
# * **Contributors**:
#   * Ashwin Patil (@ashwin)
#   * Luis Francisco Monge Martinez (@LuckyLuke)
#   * Jose Rodriguez (@Cyb3rPandah)
#   * Ian Hellen (@ianhellen)
#
# * **Agenda**:
#   * [Joins and merges](#joins)
#   * [Using styles](#styles)
#   * [Reshaping/preprocessing data](#reshaping)
#   * [Pivot tables](#pivots)
#   * [Time manipulation](#time)
#   * [Other useful operations](#otherops)
# * **Notebook**: https://aka.ms/Jupyterthon-ws-2-3
# * **License**: [Creative Commons Attribution-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-sa/4.0/)
#
# * **Q&A** - OTR Discord **#Jupyterthon #WORKSHOP DAY 2 - ADVANCED PANDAS**
# ---
#
# # Joins and merges [Ashwin]
#
# **Pandas UserGuide :** [Merge, join, concatenate and compare](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html)
#
# ### Load some data and normalize it into:
# - Processes
# - ParentProcesses
# - Users
# In[58]:
import pandas as pd
procs_df = pd.read_csv(
    "../data/process_tree.csv",
    parse_dates=["TimeCreatedUtc", "TimeGenerated"],
    index_col=0,
)
parents = procs_df[["ProcessId", "ParentProcessName"]].drop_duplicates()
procs = (
    procs_df[["NewProcessId", "NewProcessName", "CommandLine", "ProcessId", "TimeCreatedUtc", "SubjectUserSid"]]
    .drop_duplicates()
    .rename(columns={"ProcessId": "ParentProcessId"})
)
users = procs_df[['SubjectUserSid', 'SubjectUserName', 'SubjectDomainName']].drop_duplicates()
print("original", len(procs_df))
print("procs", len(procs))
print("parents", len(parents))
print("users", len(users))
display(procs.head(3))
display(parents)
display(users)
# ## Joining on Index using pd.concat
#
# pd.concat([df1, df2...])
#
# We saw using pd.concat to append rows in part 1.

# In[2]:

# Do some processing on the original DF - convert the hex logon ID to decimal
dec_logon_id = (
    pd.DataFrame(procs_df.SubjectLogonId.apply(lambda x: int(x, base=16)))
    .rename(columns={"SubjectLogonId": "SubjectLogonId_dec"})
)
dec_logon_id.head(5)

# #### pd.concat with `axis="columns"` or `axis=1` joins column-wise (horizontally)

# In[3]:

(
    pd.concat([procs_df, dec_logon_id], axis="columns")
    .head()
    .filter(regex=".*Process.*|Sub.*")
)

# ## Key-based Joins
#
# df1.merge(df2, ...)
# df1.join(df2, ...)
#
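# A minimal sketch of `merge` in action (the column choices are taken from the
# normalized `procs`, `parents` and `users` frames built above): key-based
# joins can re-assemble the three tables into one view.

# In[ ]:

proc_tree = (
    procs
    # Attach the parent process name by matching on the parent PID
    .merge(parents, left_on="ParentProcessId", right_on="ProcessId", how="left")
    # Attach the account details by matching on the user SID
    .merge(users, on="SubjectUserSid", how="left")
)
proc_tree[["NewProcessName", "ParentProcessName", "SubjectUserName"]].head(3)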
# ---
# # Using styles
#
# df.style(...)

# In[9]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")

# Generate a summary
summary_df = (
    net_df[["RemoteRegion", "TotalAllowedFlows", "L7Protocol"]]
    .groupby("RemoteRegion")
    .agg(
        FlowsSum=pd.NamedAgg("TotalAllowedFlows", "sum"),
        FlowsVar=pd.NamedAgg("TotalAllowedFlows", "var"),
        FlowsStdDev=pd.NamedAgg("TotalAllowedFlows", "std"),
        L7Prots=pd.NamedAgg("L7Protocol", "nunique"),
    )
)
summary_df

# ### highlight_max/highlight_min
#
# df.style.highlight_max(...)
# In[10]:

df_style = summary_df.style.highlight_max(color="blue").highlight_min(color="green")
df_style

# ### Color gradients
#
# df.style.background_gradient(...)
# In[60]:

import seaborn as sns

cm = sns.light_palette("blue", as_cmap=True)
summary_df.style.background_gradient(cmap=cm).format("{:.1f}")

# ### Inline bars
#
# df.style.bar(...)
# In[12]:

summary_df.style.bar(color="blue").format("{:.2f}")

# In[59]:

summary_df.style.set_properties(**{
    'background-color': 'black',
    'color': 'lawngreen',
    'font-family': 'consolas',
}).format("{:.2f}")

# ---
#
# # Reshaping/preprocessing data [Ian]
#
# - Dealing with nulls/NAs
# - Type conversion
# - Renaming columns
# - Pandas operations: melt, explode, transpose, indexing/stack/unstack
# - Dealing with complex Python objects - explode
# - Tidy data - melt
#
# ## Dealing with nulls/NAs
#
# [Working with missing data](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#missing-data)
#
# pandas primarily uses `NaN` to represent missing data. `NaN` is the IEEE 754
# floating point representation of Not a Number, so it is of `float` type.
#
# Tip: Often you will see TypeError exceptions about not being able to
# perform an expected operation on a float (when you were expecting the type
# to be a string or other object type). This is very likely due to NaNs in your data.
#
# `NaT` is the equivalent of NaN for DateTime data.
#
# Python also uses `None` (a `NoneType` object) for missing data.

# In[14]:

import pandas as pd

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
len(net_df)

# In[15]:

print(f"Null elements in DataFrame: {net_df.isnull().values.sum()} \n\
Rows with null elements: {net_df.shape[0] - net_df.dropna().shape[0]}")

# ### Which columns have NAs?
#
# df.isna()
# series.isna()
#
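# A minimal sketch to answer the question, using the `net_df` frame loaded
# above: `isna()` marks missing values and `sum()` counts them per column.

# In[ ]:

# Per-column count of NaN/NaT values - non-zero means the column has NAs
net_df.isna().sum()
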
# df.dropna() # removes all rows with ANY NaNs
# df.dropna(axis=1) # removes all columns with ANY NaNs
#
# df.dropna(how="all") # removes all rows that are ALL NaNs
# df.dropna(axis=1, how="all") # removes all cols that are ALL NaNs
#
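# A minimal sketch of the `dropna` variants, again on `net_df`; comparing the
# shapes shows how many rows/columns each call removes.

# In[ ]:

print("original:              ", net_df.shape)
print("drop rows with any NaN:", net_df.dropna().shape)
print("drop cols with any NaN:", net_df.dropna(axis=1).shape)
print("drop all-NaN rows:     ", net_df.dropna(how="all").shape)
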
# df.fillna(replacement) # replace NaNs with 'replacement'
# df[column] = df[column].fillna(replacement) # replace NaNs in a single column
#
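# A minimal `fillna` sketch on a copy of `net_df` (so later cells still see
# the original data); "PublicIPs" is an illustrative column choice and
# "unknown" an arbitrary sentinel value.

# In[ ]:

filled_df = net_df.copy()
# Replace NaNs in a single column with the sentinel
filled_df["PublicIPs"] = filled_df["PublicIPs"].fillna("unknown")
filled_df["PublicIPs"].isna().sum()
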
# ## Type conversion
#
# series.astype(target) # convert type of a Series
# df.column.astype(target) # convert type of a single column
#
# df.astype({col1: type1, col2: type2, ...}) # convert multiple cols
#
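# A minimal `astype` sketch - "TotalAllowedFlows" is an illustrative column
# choice from `net_df`; converting a numeric column to string is always safe.

# In[ ]:

# Note the dtype in the output changes from float to object
net_df["TotalAllowedFlows"].astype(str).head(3)
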
# ## Renaming columns
#
# df.rename(columns={col1: col1_new, col2: col2_new, ...}) # rename with a mapping
#
# df.rename(func, axis='columns') # rename by applying a function to each name
# In[29]:

net_df.rename(str.lower, axis='columns').columns

# In[30]:

net_df.columns

# ### Statically rename using assignment

# In[31]:

net_df.columns = [
    "timegenerated", "flowstarttime", "flowendtime", "flowintervalendtime",
    "flowtype", "resourcegroup", "vmname", "vmipaddress", "publicips",
    "srcip", "destip", "l4protocol", "l7protocol", "destport",
    "flowdirection", "allowedoutflows", "allowedinflows", "deniedinflows",
    "deniedoutflows", "remoteregion", "vmregion", "allextips",
    "totalallowedflows",
]

# ## Pandas operations: melt, explode, transpose, indexing/stack/unstack [Ashwin]
#
# ### Dealing with complex Python objects - explode

# In[32]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.PublicIPs.head(10)

# In[33]:

net_df.PublicIPs.count()

# In[34]:

net_df_ext = net_df.explode("PublicIPs")

# In[35]:

net_df_ext.PublicIPs.head(10)

# In[36]:

len(net_df_ext.PublicIPs.unique())

# ### Tidy data - melt
#
# `pandas.melt()` unpivots a DataFrame from wide format to long format.
# It is useful to massage a DataFrame into a format where one or more columns
# are identifier variables, while all other columns, considered measured
# variables, are unpivoted to the row axis, leaving just two non-identifier
# columns, variable and value.

# In[37]:

net_df_min = net_df[["FlowType", "AllExtIPs", "TotalAllowedFlows"]]

# In[38]:

pd.melt(net_df_min, id_vars=['AllExtIPs'])

# ### Transpose

# In[39]:

net_df_min.head().T

# ### Indexing, Stack and Unstack

# In[40]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")

# In[41]:

net_df_agg = net_df.groupby("AllExtIPs").agg(
    {
        "TotalAllowedFlows": ['mean', 'min', 'max'],
        "AllowedOutFlows": ['mean', 'min', 'max'],
        "AllowedInFlows": ['mean', 'min', 'max'],
    }
)

# In[42]:

net_df_agg.head()

# In[43]:

net_df_agg["TotalAllowedFlows"]["mean"]

# In[44]:

idx = pd.IndexSlice
net_df_agg.loc[:, idx[:, 'mean']]

# In[45]:

net_df_agg_stacked = net_df_agg.stack()

# In[46]:

net_df_agg_stacked.head()

# In[47]:

net_df_agg_stacked.loc[("10.0.3.4", "mean"), "TotalAllowedFlows"]

# In[48]:

net_df_agg_stacked.unstack().head()

# ---
#
# # Pivoting/pivot tables [Ashwin]
#
# ### [Reshaping and Pivot Tables](https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html)

# In[49]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# In[50]:

# Prepare a grouped dataset to pivot - pivot() expects unique index/column pairs
net_df_grouped = (
    net_df.groupby(['FlowIntervalEndTime', 'FlowType'])['AllExtIPs']
    .count()
    .reset_index()
)
net_df_grouped.head()

# In[51]:

net_df_grouped.pivot(index="FlowIntervalEndTime", columns="FlowType", values="AllExtIPs")

# ---
#
# # Time manipulation [Ashwin]
#
# - Timezone conversions
# - Resample - Grouping by time

# In[52]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# ### Timezone considerations

# In[53]:

dti = pd.to_datetime(net_df['TimeGenerated'])
dti_utc = dti.dt.tz_localize("UTC")
print(dti_utc)

# In[54]:

dti_pst = dti.dt.tz_localize("US/Pacific")
print(dti_pst)

# ### Grouping by time
#
# [Resampling](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling) - `resample()` is a time-based groupby

# In[55]:

net_df.set_index('TimeGenerated').resample('H')['FlowType'].count()

# ---
#
# # Other Useful operations [Ian]
#
# - Chaining multiple operations with "."
# - Including external functions with pipe
# - Apply, assign, and others
#
# ## Chaining multiple operations with "."
# In[61]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# In[63]:

net_df[["TimeGenerated", "AllExtIPs"]].groupby("AllExtIPs").agg("count")

# In[65]:

net_df[["RemoteRegion", "AllExtIPs"]].groupby("AllExtIPs").agg("count").sort_values(by="RemoteRegion", ascending=False)

# ### Use parentheses to let you stack the functions vertically

# In[66]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
)

# In[67]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
)

# In[68]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
    .index
    .to_list()
)

# ## External functions with `.pipe`
#
# df.pipe(function)
#
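# A minimal `.pipe` sketch - `flows_per_region` is a hypothetical helper, not
# from the original notebook. `.pipe` passes the whole DataFrame to the
# function, so custom steps can sit inside a "." chain like the ones above.

# In[ ]:

def flows_per_region(df):
    """Sum allowed flows by remote region (illustrative helper)."""
    return df.groupby("RemoteRegion")["TotalAllowedFlows"].sum()

(
    net_df
    .pipe(flows_per_region)
    .sort_values(ascending=False)
    .head(5)
)
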
# ## Applying functions with `.apply`
#
# series.apply(function)
# df.column_name.apply(function)
#
# df.apply(function, axis=1) # apply by row
# df.apply(function [, axis=0]) # apply by column
#
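# A minimal `.apply` sketch (the lambdas are illustrative). `.apply` calls a
# Python function per element or per row, so it is slower than vectorized
# operations - prefer built-in vectorized methods where they exist.

# In[ ]:

# Per-element apply on a single column
net_df["L7Protocol"].apply(lambda proto: str(proto).upper()).head(3)

# In[ ]:

# Row-wise apply (axis=1) combining several columns
net_df.apply(lambda row: f"{row.RemoteRegion}: {row.TotalAllowedFlows}", axis=1).head(3)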