#!/usr/bin/env python
# coding: utf-8
# ![Banner](../media/banner2.png)
#
# ---
# # Workshop 2.3: Advanced Pandas
#
# * **Contributors**:
#   * Ashwin Patil (@ashwin)
#   * Luis Francisco Monge Martinez (@LuckyLuke)
#   * Jose Rodriguez (@Cyb3rPandah)
#   * Ian Hellen (@ianhellen)
#
# * **Agenda**:
#   * [Joins and merges](#joins)
#   * [Using styles](#styles)
#   * [Reshaping/preprocessing data](#reshaping)
#   * [Pivot tables](#pivots)
#   * [Time manipulation](#time)
#   * [Other useful operations](#otherops)
# * **Notebook**: https://aka.ms/Jupyterthon-ws-2-3
# * **License**: [Creative Commons Attribution-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-sa/4.0/)
#
# * **Q&A** - OTR Discord **#Jupyterthon #WORKSHOP DAY 2 - ADVANCED PANDAS**
# ---
#
# # Joins and merges [Ashwin]
#
# **Pandas UserGuide :** [Merge, join, concatenate and compare](https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html)
#
# ### Load some data and normalize it into:
# - Processes
# - ParentProcesses
# - Users
# In[58]:
import pandas as pd
procs_df = pd.read_csv(
    "../data/process_tree.csv",
    parse_dates=["TimeCreatedUtc", "TimeGenerated"],
    index_col=0,
)
parents = procs_df[["ProcessId", "ParentProcessName"]].drop_duplicates()
procs = (
    procs_df[["NewProcessId", "NewProcessName", "CommandLine", "ProcessId", "TimeCreatedUtc", "SubjectUserSid"]]
    .drop_duplicates()
    .rename(columns={"ProcessId": "ParentProcessId"})
)
users = procs_df[['SubjectUserSid', 'SubjectUserName', 'SubjectDomainName']].drop_duplicates()
print("original", len(procs_df))
print("procs", len(procs))
print("parents", len(parents))
print("users", len(users))
display(procs.head(3))
display(parents)
display(users)
# ## Joining on Index using pd.concat
#
# pd.concat([df1, df2...])
#
# We saw using pd.concat to append rows in part 1.

# In[2]:

# Do some processing on the original DF - convert the hex logon ID to decimal
dec_logon_id = (
    pd.DataFrame(procs_df.SubjectLogonId.apply(lambda x: int(x, base=16)))
    .rename(columns={"SubjectLogonId": "SubjectLogonId_dec"})
)
dec_logon_id.head(5)

# #### pd.concat with `axis="columns"` or `axis=1` joins column-wise (horizontally)

# In[3]:

(
    pd.concat([procs_df, dec_logon_id], axis="columns")
    .head()
    .filter(regex=".*Process.*|Sub.*")
)

# ## Key-based Joins
#
# df1.merge(df2, ...)
# df1.join(df2, ...)
#
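# A minimal sketch of `merge` in action (the column choices are taken from the
# normalized `procs`, `parents` and `users` frames built above): key-based
# joins can re-assemble the three tables into one view.

# In[ ]:

proc_tree = (
    procs
    # Attach the parent process name by matching on the parent PID
    .merge(parents, left_on="ParentProcessId", right_on="ProcessId", how="left")
    # Attach the account details by matching on the user SID
    .merge(users, on="SubjectUserSid", how="left")
)
proc_tree[["NewProcessName", "ParentProcessName", "SubjectUserName"]].head(3)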
# ---
# # Using styles
#
# df.style(...)

# In[9]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")

# Generate a summary
summary_df = (
    net_df[["RemoteRegion", "TotalAllowedFlows", "L7Protocol"]]
    .groupby("RemoteRegion")
    .agg(
        FlowsSum=pd.NamedAgg("TotalAllowedFlows", "sum"),
        FlowsVar=pd.NamedAgg("TotalAllowedFlows", "var"),
        FlowsStdDev=pd.NamedAgg("TotalAllowedFlows", "std"),
        L7Prots=pd.NamedAgg("L7Protocol", "nunique"),
    )
)
summary_df

# ### highlight_max/highlight_min
#
# df.style.highlight_max(...)
# In[10]:

df_style = summary_df.style.highlight_max(color="blue").highlight_min(color="green")
df_style

# ### Color gradients
#
# df.style.background_gradient(...)
# In[60]:

import seaborn as sns

cm = sns.light_palette("blue", as_cmap=True)
summary_df.style.background_gradient(cmap=cm).format("{:.1f}")

# ### Inline bars
#
# df.style.bar(...)
# In[12]:

summary_df.style.bar(color="blue").format("{:.2f}")

# In[59]:

summary_df.style.set_properties(**{
    'background-color': 'black',
    'color': 'lawngreen',
    'font-family': 'consolas',
}).format("{:.2f}")

# ---
#
# # Reshaping/preprocessing data [Ian]
#
# - Dealing with nulls/NAs
# - Type conversion
# - Renaming columns
# - Pandas operations: melt, explode, transpose, indexing/stack/unstack
# - Dealing with complex Python objects - explode
# - Tidy data - melt
#
# ## Dealing with nulls/NAs
#
# [Working with missing data](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#missing-data)
#
# pandas primarily uses `NaN` to represent missing data. `NaN` is the IEEE 754
# floating point representation of Not a Number, so it is of `float` type.
#
# Tip: Often you will see TypeError exceptions about not being able to
# perform an expected operation on a float (when you were expecting the type
# to be a string or other object type). This is very likely due to NaNs in your data.
#
# `NaT` is the equivalent of NaN for DateTime data.
#
# Python also uses `None` (a `NoneType` object) for missing data.

# In[14]:

import pandas as pd

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
len(net_df)

# In[15]:

print(f"Null elements in DataFrame: {net_df.isnull().values.sum()} \n\
Rows with null elements: {net_df.shape[0] - net_df.dropna().shape[0]}")

# ### Which columns have NAs?
#
# df.isna()
# series.isna()
#
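# A minimal sketch to answer the question, using the `net_df` frame loaded
# above: `isna()` marks missing values and `sum()` counts them per column.

# In[ ]:

# Per-column count of NaN/NaT values - non-zero means the column has NAs
net_df.isna().sum()
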
# df.dropna() # removes all rows with ANY NaNs
# df.dropna(axis=1) # removes all columns with ANY NaNs
#
# df.dropna(how="all") # removes all rows that are ALL NaNs
# df.dropna(axis=1, how="all") # removes all cols that are ALL NaNs
#
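# A minimal sketch of the `dropna` variants, again on `net_df`; comparing the
# shapes shows how many rows/columns each call removes.

# In[ ]:

print("original:              ", net_df.shape)
print("drop rows with any NaN:", net_df.dropna().shape)
print("drop cols with any NaN:", net_df.dropna(axis=1).shape)
print("drop all-NaN rows:     ", net_df.dropna(how="all").shape)
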
# df.fillna(replacement) # replace NaNs with 'replacement'
# df[column] = df[column].fillna(replacement) # replace NaNs in a single column
#
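# A minimal `fillna` sketch on a copy of `net_df` (so later cells still see
# the original data); "PublicIPs" is an illustrative column choice and
# "unknown" an arbitrary sentinel value.

# In[ ]:

filled_df = net_df.copy()
# Replace NaNs in a single column with the sentinel
filled_df["PublicIPs"] = filled_df["PublicIPs"].fillna("unknown")
filled_df["PublicIPs"].isna().sum()
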
# ## Type conversion
#
# series.astype(target) # convert type of a Series
# df.column.astype(target) # convert type of a single column
#
# df.astype({col1: type1, col2: type2, ...}) # convert multiple cols
#
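# A minimal `astype` sketch - "TotalAllowedFlows" is an illustrative column
# choice from `net_df`; converting a numeric column to string is always safe.

# In[ ]:

# Note the dtype in the output changes from float to object
net_df["TotalAllowedFlows"].astype(str).head(3)
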
# ## Renaming columns
#
# df.rename(columns={col1: col1_new, col2: col2_new, ...}) # rename with a mapping
#
# df.rename(func, axis='columns') # rename by applying a function to each name
# In[29]:

net_df.rename(str.lower, axis='columns').columns

# In[30]:

net_df.columns

# ### Statically rename using assignment

# In[31]:

net_df.columns = [
    "timegenerated", "flowstarttime", "flowendtime", "flowintervalendtime",
    "flowtype", "resourcegroup", "vmname", "vmipaddress", "publicips",
    "srcip", "destip", "l4protocol", "l7protocol", "destport",
    "flowdirection", "allowedoutflows", "allowedinflows", "deniedinflows",
    "deniedoutflows", "remoteregion", "vmregion", "allextips",
    "totalallowedflows",
]

# ## Pandas operations: melt, explode, transpose, indexing/stack/unstack [Ashwin]
#
# ### Dealing with complex Python objects - explode

# In[32]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.PublicIPs.head(10)

# In[33]:

net_df.PublicIPs.count()

# In[34]:

net_df_ext = net_df.explode("PublicIPs")

# In[35]:

net_df_ext.PublicIPs.head(10)

# In[36]:

len(net_df_ext.PublicIPs.unique())

# ### Tidy data - melt
#
# `pandas.melt()` unpivots a DataFrame from wide format to long format.
# It is useful to massage a DataFrame into a format where one or more columns
# are identifier variables, while all other columns, considered measured
# variables, are unpivoted to the row axis, leaving just two non-identifier
# columns, variable and value.

# In[37]:

net_df_min = net_df[["FlowType", "AllExtIPs", "TotalAllowedFlows"]]

# In[38]:

pd.melt(net_df_min, id_vars=['AllExtIPs'])

# ### Transpose

# In[39]:

net_df_min.head().T

# ### Indexing, Stack and Unstack

# In[40]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")

# In[41]:

net_df_agg = net_df.groupby("AllExtIPs").agg(
    {
        "TotalAllowedFlows": ['mean', 'min', 'max'],
        "AllowedOutFlows": ['mean', 'min', 'max'],
        "AllowedInFlows": ['mean', 'min', 'max'],
    }
)

# In[42]:

net_df_agg.head()

# In[43]:

net_df_agg["TotalAllowedFlows"]["mean"]

# In[44]:

idx = pd.IndexSlice
net_df_agg.loc[:, idx[:, 'mean']]

# In[45]:

net_df_agg_stacked = net_df_agg.stack()

# In[46]:

net_df_agg_stacked.head()

# In[47]:

net_df_agg_stacked.loc[("10.0.3.4", "mean"), "TotalAllowedFlows"]

# In[48]:

net_df_agg_stacked.unstack().head()

# ---
#
# # Pivoting/pivot tables [Ashwin]
#
# ### [Reshaping and Pivot Tables](https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html)

# In[49]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# In[50]:

# Prepare a grouped dataset to pivot - pivot() expects unique index/column pairs
net_df_grouped = (
    net_df.groupby(['FlowIntervalEndTime', 'FlowType'])['AllExtIPs']
    .count()
    .reset_index()
)
net_df_grouped.head()

# In[51]:

net_df_grouped.pivot(index="FlowIntervalEndTime", columns="FlowType", values="AllExtIPs")

# ---
#
# # Time manipulation [Ashwin]
#
# - Timezone conversions
# - Resample - Grouping by time

# In[52]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# ### Timezone considerations

# In[53]:

dti = pd.to_datetime(net_df['TimeGenerated'])
dti_utc = dti.dt.tz_localize("UTC")
print(dti_utc)

# In[54]:

dti_pst = dti.dt.tz_localize("US/Pacific")
print(dti_pst)

# ### Grouping by time
#
# [Resampling](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling) - `resample()` is a time-based groupby

# In[55]:

net_df.set_index('TimeGenerated').resample('H')['FlowType'].count()

# ---
#
# # Other Useful operations [Ian]
#
# - Chaining multiple operations with "."
# - Including external functions with pipe
# - Apply, assign, and others
#
# ## Chaining multiple operations with "."
# In[61]:

net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()

# In[63]:

net_df[["TimeGenerated", "AllExtIPs"]].groupby("AllExtIPs").agg("count")

# In[65]:

net_df[["RemoteRegion", "AllExtIPs"]].groupby("AllExtIPs").agg("count").sort_values(by="RemoteRegion", ascending=False)

# ### Use parentheses to let you stack the functions vertically

# In[66]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
)

# In[67]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
)

# In[68]:

(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
    .index
    .to_list()
)

# ## External functions with `.pipe`
#
# df.pipe(function)
#
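# A minimal `.pipe` sketch - `flows_per_region` is a hypothetical helper, not
# from the original notebook. `.pipe` passes the whole DataFrame to the
# function, so custom steps can sit inside a "." chain like the ones above.

# In[ ]:

def flows_per_region(df):
    """Sum allowed flows by remote region (illustrative helper)."""
    return df.groupby("RemoteRegion")["TotalAllowedFlows"].sum()

(
    net_df
    .pipe(flows_per_region)
    .sort_values(ascending=False)
    .head(5)
)
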
# ## Applying functions with `.apply`
#
# series.apply(function)
# df.column_name.apply(function)
#
# df.apply(function, axis=1) # apply by row
# df.apply(function [, axis=0]) # apply by column
#
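# A minimal `.apply` sketch (the lambdas are illustrative). `.apply` calls a
# Python function per element or per row, so it is slower than vectorized
# operations - prefer built-in vectorized methods where they exist.

# In[ ]:

# Per-element apply on a single column
net_df["L7Protocol"].apply(lambda proto: str(proto).upper()).head(3)

# In[ ]:

# Row-wise apply (axis=1) combining several columns
net_df.apply(lambda row: f"{row.RemoteRegion}: {row.TotalAllowedFlows}", axis=1).head(3)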