#!/usr/bin/env python
# coding: utf-8
# ![Banner](../media/banner1.png)
#
# ---
# # Workshop 1.2: Acquiring Data
#
# * **Contributors**:
# * Roberto Rodriguez (@Cyb3rWard0g)
# * Jose Rodriguez (@Cyb3rPandah)
# * Ian Hellen (@ianhellen, gh:@ianhelle)
#
# * **Agenda**:
# * [Reading data from SIEMs and Databases](#reading)
# * [OTR Security Datasets - (aka Mordor)](#mordor)
#
# * **Notebook**: [https://aka.ms/Jupyterthon-ws-1-2](https://aka.ms/Jupyterthon-ws-1-2)
# * **License**: [Creative Commons Attribution-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-sa/4.0/)
#
# * **Q&A** - OTR Discord **#Jupyterthon #WORKSHOP DAY 1 - ACQUIRING DATA**
# ---
# # Reading from SIEMs and Databases
#
# ## Elasticsearch
#
# [https://elasticsearch-py.readthedocs.io/](https://elasticsearch-py.readthedocs.io/)
#
# ```
# python -m pip install elasticsearch
# ```
#
# - Importing libraries:
# In[ ]:
# Elasticsearch connector
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
# Data manipulation
import pandas as pd
# - Initializing an Elasticsearch client:
#
# Initialize an Elasticsearch client using a specific Elasticsearch URL. Next, you can pass the client to the Search object that we will use to represent the search request in a little bit.
# In[ ]:
# Client for the Elasticsearch node.
# NOTE(review): the URL has no host between "http://" and ":9200" — it looks
# like a placeholder was lost; confirm the intended host before running.
es = Elasticsearch(
    ['http://:9200'],
)
# Search request bound to the client above; later cells attach the query to it.
searchContext = Search(
    index='logs-*',
    doc_type='doc',
    using=es,
)
# - Setting the query search context:
#
# In addition, we will need to use the query class to pass an Elasticsearch `query_string`. For example, what if I want to query for events with event_id 1?
# In[ ]:
# Attach a query_string query for event_id 1 to the search context.
s = searchContext.query('query_string', query='event_id:1')
# - Running query & Exploring response:
#
# Finally, you can run the query and get the results back as a DataFrame.
# In[ ]:
# Run the search and only build the DataFrame if the response reports success.
response = s.execute()
if response.success():
    # Each item yielded by scan() is converted to a dict and becomes one row.
    df = pd.DataFrame(d.to_dict() for d in s.scan())
else:
    # Keep `df` defined so the display below does not raise NameError.
    df = pd.DataFrame()
df
# # Connect to Elasticsearch (Elasticsearch DSL Libraries)
#
# ![](../media/day1/elasticsearch-dsl-query.png)
#
# Reference: https://medium.com/threat-hunters-forge/jupyter-notebooks-from-sigma-rules-%EF%B8%8F-to-query-elasticsearch-31a74cc59b99
# ## Splunk
#
# [Huntlib - https://github.com/target/huntlib](https://github.com/target/huntlib)
#
# Also
#
# [SplunkSDK - https://github.com/splunk/splunk-sdk-python](https://github.com/splunk/splunk-sdk-python)
#
# - Importing libraries:
# In[ ]:
from huntlib.splunk import SplunkDF
# - Running query & Exploring response
# In[ ]:
# Create the Splunk client first: `search_df` is called on a SplunkDF object.
# Without this, `s` would still be the Elasticsearch Search object created
# earlier in this notebook.
# The host and credentials below are placeholders — replace them with your own.
s = SplunkDF(host="localhost", username="myname", password="mypass")
# Run the SPL search between start_time and end_time (Splunk relative-time
# modifiers) and store the result in `df`.
df = s.search_df(
    spl="search index=win_events EventCode=4688",
    start_time="-2d@d",
    end_time="@d",
)
# ## Sqlite
#
# - Importing libraries:
# In[ ]:
# Install the ipython-sql package from inside the notebook (shell command).
get_ipython().system('pip install ipython-sql')
# - Loading library
# In[1]:
# Load the `sql` extension inside a %%capture cell so its output is not shown.
get_ipython().run_cell_magic('capture', '', '%load_ext sql\n')
# - Connecting to database
# In[2]:
# Point the %sql magic at the SQLite file ../data/browser2.db.
get_ipython().run_line_magic('sql', 'sqlite:///../data/browser2.db')
# - Executing queries
# In[3]:
# List the names of the tables in the database.
get_ipython().run_cell_magic('sql', '', "SELECT\n name\nFROM\n sqlite_master\nWHERE\n type='table';\n")
# In[4]:
# Return every row of the `history` table.
get_ipython().run_cell_magic('sql', '', 'SELECT * FROM history;\n')
# - Save query results in a Pandas DataFrame
# In[5]:
# `_` is IPython's "last cell output" variable, so this only works when run
# directly after the query cell above in an interactive session.
# NOTE(review): presumably `_` is the result set of the SELECT above — confirm.
df = _.DataFrame()
# In[6]:
# Display the DataFrame.
df
# ## Log analytics workspace
#
# **Requirements**:
# * Azure AD application
# * Secret text (credentials)
# * API permissions granted:
# * Service: Log Analytics API
# * Permission: Data.Read
# * Type: AppRole
# * Add the application with the `Log Analytics Reader` role to the Log Analytics workspace's access control list.
# **Get OAUTH Access Token**
# * Application ID
# * Scope: https://api.loganalytics.io/.default
# * TenantId
# * Application secret
#
# https://securitydatasets.com/create/azureloganalyticsapi.html
# In[ ]:
import requests
import time

# Placeholders — replace with the values of your Azure AD application.
# Do not commit a real application secret to source control.
appId = "AppId"
scope = "https://api.loganalytics.io/.default"
tenantId = "TenantID"
secret = 'ApplicationSecret'
endpoint = f'https://login.microsoftonline.com/{tenantId}/oauth2/v2.0/token'
# JSON headers, kept for later cells that send JSON request bodies.
http_headers = {'Accept': 'application/json','Content-Type': 'application/json'}
# Parameters of the OAuth 2.0 client-credentials grant.
data = {"scope" : scope, 'grant_type' : 'client_credentials', 'client_id' : appId, 'client_secret' : secret}
# The v2.0 token endpoint expects a POST with a form-encoded body. Passing the
# dict via `data=` (not `json=`) makes requests form-encode it and set the
# matching Content-Type header itself.
token_response = requests.post(
    endpoint,
    data=data,
    headers={'Accept': 'application/json'},
    timeout=30,
)
# Surface authentication failures here rather than as a KeyError below.
token_response.raise_for_status()
results = token_response.json()
access_token = results["access_token"]
# **Run Query**
# In[ ]:
# Placeholders — replace with your workspace ID and a real query.
workspaceId = 'Workspace ID'
apiUri = f'https://api.loganalytics.io/v1/workspaces/{workspaceId}/query'
query = 'query'
# Request headers carrying the bearer token obtained in the previous step.
HttpHeaders = {"Authorization" : f'Bearer {access_token}', 'Accept': 'application/json','Content-Type': 'application/json'}
data = {'query' : query}
# Send the query as a JSON body via POST, using the headers that include the
# Authorization token (the request is rejected without it).
query_response = requests.post(apiUri, json=data, headers=HttpHeaders, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error payload as results.
query_response.raise_for_status()
query_results = query_response.json()
# ## M365 advanced hunting APIs
#
# **Requirements**:
# * Azure AD application
# * Secret text (credentials)
# * API permissions granted:
# * Service: Microsoft Threat Protection
# * Permission:
# * AdvancedHunting.Read.All
# * Incident.Read.All
# * Type: AppRole
# **Get OAUTH Access Token**
# * Application ID
# * Scope: https://api.security.microsoft.com/.default
# * TenantId
# * Application secret
#
# https://securitydatasets.com/create/m365defenderhuntapi.html
# ## MSTICPy Data Providers
#
# There's a later session devoted to MSTICPy
#
# ```ipython3
# %pip install msticpy
# ```
#
# MSTICPy isn't a data source - it just wraps a bunch of data sources in a common API.
#
# Currently supports:
# - MS Sentinel (aka Azure Sentinel, Log Analytics), MS Defender, MS Graph, Azure Resource Graph
# - Splunk
# - Sumologic
# - Local data (CSV or Pickle) - would be easy to add supported pandas formats (any requests?)
# - Experimental support for Kusto/Azure Data explorer
#
# Typical usage:
# - import QueryProvider class
# - instantiate a QueryProvider object with the required provider name.
# - run `query_provider.connect()` method - params vary (e.g. connection string)
# - pre-defined, parameterized queries appear as methods of the query_provider class
# - ad hoc queries via `.exec_query()` method
# - output returned as a pandas dataframe
#
# If you use the MSTICPy init (`init_notebook`), the QueryProvider is imported for you
#
# ```python
# import msticpy
# msticpy.init_notebook(globals())
# ```
#
# In[17]:
# To install
# %pip install msticpy
# Alternative import - init_notebook imports QueryProvider and a bunch of other stuff
# import msticpy
# msticpy.init_notebook(globals())
from msticpy.data import QueryProvider

# Query provider for the "AzureSentinel" environment (created, not yet connected).
sentinel_prov = QueryProvider("AzureSentinel")
# Query provider for local files; query definitions and data files are both
# looked up under ../data.
local_prov = QueryProvider(
    "LocalData",
    data_paths=["../data"],
    query_paths=["../data"],
)
# #### Accessing queries as functions
# (usually need to connect before running one)
# In[ ]:
# sentinel_prov.<TAB>
# In a notebook, type the line above and press Tab after the dot to explore the
# provider's attributes. It is left as a comment here because a trailing dot on
# its own is a SyntaxError when this file is run as a script.
# In[3]:
# Call the provider's browse() method.
# NOTE(review): presumably this opens an interactive query browser — confirm.
sentinel_prov.browse()
# In[19]:
# Run a query from the local provider and show the first rows of the result.
local_prov.Network.list_network_flows().head()
# In[42]:
# Connect the Sentinel provider using a Log Analytics connection string.
# NOTE(review): the tenant and workspace IDs are hard-coded in this string —
# replace them with your own before running.
sentinel_prov.connect(
"loganalytics://code().tenant('72f988bf-86f1-41af-91ab-2d7cd011db47').workspace('8ecf8077-cf51-4820-aadd-14040956f35d')"
)
# ---
# # OTR Security Datasets (aka Mordor)
#
# ## Reading OTR-SecurityDatasets from code
#
# The [Security Datasets project](https://securitydatasets.com/introduction.html) is an open-source initiative that contributes pre-recorded datasets that describe malicious activity, from different platforms, to the infosec community to expedite data analysis and threat research.
#
# We will consider the following steps to access Security Datasets:
#
# - Importing required Python Libraries
# - Making a HTTP request to GitHub repository
# - Generating ZipFile (Compressed) object from Bytes data
# - Extracting JSON file from ZipFile object
# - Reading JSON file (Lines = True)
#
# - Let's start by importing the required Python libraries in order to access Security Datasets' content:
# In[1]:
# Generate HTTP request
import requests
# Zip file object manipulation
from zipfile import ZipFile
# Byte data manipulations
from io import BytesIO
# Read JSON file
from pandas.io import json
# - We will make an HTTP request to the [Security Datasets GitHub repo](https://github.com/OTRF/Security-Datasets) using the **[get](https://docs.python-requests.org/en/latest/user/quickstart/#make-a-request)** method, and store the response in the variable *zipFileRequest*.
#
# It is important to note that we are using the **raw data link** related to the dataset. This type of links usually starts with **https://raw.githubusercontent.com/** + Project reference.
# In[8]:
# Raw-content link to the zipped dataset in the Security Datasets repository.
url = 'https://raw.githubusercontent.com/OTRF/Security-Datasets/master/datasets/atomic/windows/discovery/host/empire_shell_net_localgroup_administrators.zip'
# A timeout keeps the cell from hanging indefinitely if the server does not answer.
zipFileRequest = requests.get(url, timeout=60)
# Stop here on an HTTP error (e.g. 404) instead of failing later, when the
# error page is handed to ZipFile as if it were an archive.
zipFileRequest.raise_for_status()
type(zipFileRequest)
# - The type of the HTTP response content is **[Bytes](https://docs.python.org/3/library/stdtypes.html#bytes)**.
# In[4]:
# Show the type of the response body.
type(zipFileRequest.content)
# - We will create a **[BytesIO](https://docs.python.org/3/library/io.html#io.BytesIO)** object to access the response content and store it in a **[ZipFile](https://docs.python.org/3/library/zipfile.html#zipfile-objects)** object. All the data manipulation is performed in memory.
# In[7]:
# Wrap the downloaded bytes in an in-memory file object and open it as a zip archive.
zipFile = ZipFile(BytesIO(zipFileRequest.content))
type(zipFile)
# - Any ZipFile object can contain more than one file. We can access the list of files' names using the **[namelist](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.namelist)** method. Since this Security Datasets archive contains one file, we will reference the first element of the list when extracting the JSON file.
# In[9]:
# List the names of the archive members.
zipFile.namelist()
# - We will extract the JSON file from the compressed folder using the **[extract](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.extract)** method. After running the code below, we will download and store the file in the directory specified in the *path* parameter.
#
# It is important to note that this method returns the normalized path to the JSON file. We store this path in the *datasetJSONPath* variable and use it when reading the file.
# In[10]:
# Extract the first archive member into ../data and keep the path that extract() returns.
datasetJSONPath = zipFile.extract(zipFile.namelist()[0], path = '../data')
print(datasetJSONPath)
# - Now that the file has been downloaded and we know its path, we can read the JSON file using the **[read_json](https://pandas.pydata.org/docs/reference/api/pandas.io.json.read_json.html)** method.
#
# It is important to note that, in a Security Datasets recording, each line of the JSON file represents an event. Therefore, it is important to set the parameter **lines** to *True*.
# In[11]:
# `json` here is `pandas.io.json` (imported above), not the standard-library json module.
dataset = json.read_json(path_or_buf = datasetJSONPath, lines=True)
# - The **read_json** method returns a **DataFrame** object. We will share more details of what a dataframe is in the next section of this workshop.
# In[12]:
type(dataset)
# - Finally, we should be able to start exploring our dataset using different functions or methods such as [head](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html).
# In[13]:
# Show only the first event.
dataset.head(n=1)
# ## Using MSTICPy to access Security Datasets
#
# In[21]:
#%pip install msticpy
import pandas as pd
from msticpy.data import QueryProvider
from msticpy.vis import mp_pandas_plot
# Create a query provider for the "Mordor" (Security Datasets) environment.
qry_prov_sd = QueryProvider("Mordor")
# In[22]:
# Connect the provider; no arguments are passed here.
qry_prov_sd.connect()
# In[24]:
# Show the first ten entries of the provider's query list.
qry_prov_sd.list_queries()[:10]
# In[25]:
# Search the available queries using the expression "empire + localgroup".
# NOTE(review): presumably "+" means both terms must match — confirm against the msticpy docs.
qry_prov_sd.search_queries("empire + localgroup")
#
# In[27]:
# Call the dataset's query function and keep the result in `emp_df`.
emp_df = qry_prov_sd.atomic.windows.discovery.host.empire_shell_net_localgroup_administrators()
emp_df.head()
# Make sure that timestamps actually are timestamps, not strings
# In[29]:
# Replace the EventTime column with its datetime-parsed version.
emp_df["EventTime"] = pd.to_datetime(emp_df["EventTime"])
# In[30]:
# Plot a timeline of events over EventTime, grouped by EventID.
# NOTE(review): `mp_plot` is presumably the DataFrame accessor registered by
# importing `mp_pandas_plot` above — confirm.
emp_df.mp_plot.timeline(time_column="EventTime", group_by="EventID")
# ### Security Datasets Browser
#
# - Browser properties
# - Filter by MITRE Tactic/Technique
# - Search across metadata, file names
# - Download selected datasets
# In[31]:
from msticpy.data.browsers.mordor_browser import MordorBrowser
# Create the Security Datasets browser; `m_browser` is used by the cells below.
m_browser = MordorBrowser()
# ### Downloaded data available in `browser.current_dataset`
# In[32]:
# Show the first three rows of the browser's current dataset.
m_browser.current_dataset.head(3)
# ### Cached datasets available in `browser.datasets`
# In[33]:
# Show the browser's `datasets` attribute.
m_browser.datasets
# ---
# # End of Session
# # Break: 5 Minutes
#
# ![](../media/dog-leash-break.jpg)