Here we're using the term Similarity Graph to mean a graph where the nodes are entities (PE Files in this case) and the edges are relationships between the nodes based on similar attributes. See Semantic Network for more information.
Workbench can be set up to utilize several indexers:
Neo4j also incorporates Lucene based indexing so not only can you capture a rich set of relationships between your data entities but searches and queries are super quick.
$ workbench_server
# Let's start to interact with Workbench. Please note there is NO Workbench-specific client;
# just use the ZeroRPC Python, Node.js, or CLI interfaces.
import zerorpc
# 'c' is the ZeroRPC client handle used for every Workbench call in the rest of this notebook.
c = zerorpc.Client()
# Assumes workbench_server is listening locally on port 4242 -- TODO confirm for your setup.
c.connect("tcp://127.0.0.1:4242")
[None]
# Load in 100 PE Files
def workbench_load(file_list):
    ''' Store every file in file_list as an 'exe' sample in Workbench.

        Each file is read in binary mode and passed, together with its
        filename, to store_sample on the module-level ZeroRPC client 'c'.
        The number of stored files is printed once all files are done.

        Args:
            file_list: iterable of paths of the files to upload.

        Returns:
            A list holding what store_sample returned for each file, in
            input order (the notebook treats these values as md5s).
    '''
    stored = []
    for path in file_list:
        with open(path, 'rb') as handle:
            payload = handle.read()
        stored.append(c.store_sample(payload, path, 'exe'))
    print('Files loaded: %d' % len(stored))
    return stored
import os

# Upload the known-bad samples first ...
bad_dir = '../data/pe/bad'
file_list = [os.path.join(bad_dir, child) for child in os.listdir(bad_dir)]
md5s_bad = workbench_load(file_list)

# ... then the known-good samples.
good_dir = '../data/pe/good'
file_list = [os.path.join(good_dir, child) for child in os.listdir(good_dir)]
md5s_good = workbench_load(file_list)

# Combined list (bad first, then good); show the first five entries.
md5_list = md5s_bad + md5s_good
md5_list[:5]
Files loaded: 50 Files loaded: 50
['033d91aae8ad29ed9fbb858179271232', '0cb9aa6fb9c4aa3afad7a303e21ac0f3', '0e882ec9b485979ea84c7843d41ba36f', '0e8b030fb6ae48ffd29e520fc16b5641', '0eb9e990c521b30428a379700ec5ab3e']
# Ask the pe_features worker for the samples in md5_list, pulling back only the
# md5 and the imported-symbol sparse features.
imports_request = {
    'md5_list': md5_list,
    'subkeys': ['md5', 'sparse_features.imported_symbols'],
}
imports = c.batch_work_request('pe_features', imports_request)
imports
<generator object iterator at 0x106074cd0>
# Send the generator output into a Pandas DataFrame constructor.
# pandas has to be imported before its first use, and 'imports' is a one-shot
# generator, so it is materialized exactly once here (a second DataFrame built
# from the same generator would come out empty).
import pandas as pd
df_imports = pd.DataFrame(imports)
# Keep the 'df' name available as an alias of the same frame.
df = df_imports
df_imports.head()
imported_symbols | md5 | |
---|---|---|
0 | [kernel32.dll:name=getenvironmentvariablew, ke... | 033d91aae8ad29ed9fbb858179271232 |
1 | [mfc42.dll:ordinal=2514, mfc42.dll:ordinal=516... | 0cb9aa6fb9c4aa3afad7a303e21ac0f3 |
2 | [msvbvm60.dll:ordinal=588 bound=1923206285, ms... | 0e882ec9b485979ea84c7843d41ba36f |
3 | [wsock32.dll:name=wsastartup, wsock32.dll:name... | 0e8b030fb6ae48ffd29e520fc16b5641 |
4 | [user32.dll:name=getwindow bound=2110180096, u... | 0eb9e990c521b30428a379700ec5ab3e |
5 rows × 2 columns
# Okay, there are lots of PE file attributes we might want to look at, so let's pull several.
# Note: the cells below also invoke a couple of new workers: strings and pe_peid.
# Run pe_features on every sample tagged 'exe', pulling back only the md5 and
# the PE warning strings.
# NOTE(review): this selects by type_tag rather than by md5_list, so it may
# cover samples other than the ones loaded above -- confirm that is intended.
warnings_request = {
    'type_tag': 'exe',
    'subkeys': ['md5', 'sparse_features.pe_warning_strings'],
}
df_warnings = pd.DataFrame(c.batch_work_request('pe_features', warnings_request))
df_warnings.head()
md5 | pe_warning_strings | |
---|---|---|
0 | 090a189f4eeb3c0b76e97acdb1a71c92 | [] |
1 | 093dee8d97fd9d35884ed52179b3d142 | [Suspicious flags set for section 5. Both IMAG... |
2 | 0dd74786d22edff0ce5b8e1b1e398618 | [] |
3 | 10328f92e7ec8735ea7846bf2c8254c2 | [] |
4 | 12fd4ef8f2cbbf98e0a5ced88258ddf3 | [] |
5 rows × 2 columns
# Run the strings worker on every sample tagged 'exe', pulling back only the
# md5 and the string_list.
strings_request = {'type_tag': 'exe', 'subkeys': ['md5', 'string_list']}
df_strings = pd.DataFrame(c.batch_work_request('strings', strings_request))
df_strings.head()
md5 | string_list | |
---|---|---|
0 | 090a189f4eeb3c0b76e97acdb1a71c92 | [!This program cannot be run in DOS mode., Ric... |
1 | 093dee8d97fd9d35884ed52179b3d142 | [!This program cannot be run in DOS mode., r\R... |
2 | 0dd74786d22edff0ce5b8e1b1e398618 | [!This program cannot be run in DOS mode., qHy... |
3 | 10328f92e7ec8735ea7846bf2c8254c2 | [!This program cannot be run in DOS mode., .te... |
4 | 12fd4ef8f2cbbf98e0a5ced88258ddf3 | [!This program cannot be run in DOS mode., qHy... |
5 rows × 2 columns
# Run the pe_peid worker on every sample tagged 'exe', pulling back only the
# md5 and the match_list.
peid_request = {'type_tag': 'exe', 'subkeys': ['md5', 'match_list']}
df_peids = pd.DataFrame(c.batch_work_request('pe_peid', peid_request))
df_peids.head()
match_list | md5 | |
---|---|---|
0 | [Microsoft Visual C++ 8] | 090a189f4eeb3c0b76e97acdb1a71c92 |
1 | [] | 093dee8d97fd9d35884ed52179b3d142 |
2 | [Microsoft Visual C++ 8] | 0dd74786d22edff0ce5b8e1b1e398618 |
3 | [Microsoft Visual C# v7.0 / Basic .NET] | 10328f92e7ec8735ea7846bf2c8254c2 |
4 | [Microsoft Visual C++ 8] | 12fd4ef8f2cbbf98e0a5ced88258ddf3 |
5 rows × 2 columns
SSDeep: computes context triggered piecewise hashes (CTPH) which can match inputs that have homologies.
Jaccard Index: a set-based similarity metric (overlap in element sets)
# For the first approach Workbench already has a worker (pe_deep_sim) that
# computes the SSDeep similarities; run it on every sample tagged 'exe'.
ssdeep_request = {'type_tag': 'exe'}
ssdeep = pd.DataFrame(c.batch_work_request('pe_deep_sim', ssdeep_request))
ssdeep.head()
md5 | sim_list | |
---|---|---|
0 | 090a189f4eeb3c0b76e97acdb1a71c92 | [] |
1 | 093dee8d97fd9d35884ed52179b3d142 | [] |
2 | 0dd74786d22edff0ce5b8e1b1e398618 | [{u'sim': 65, u'md5': u'e0b173f23d873286169995... |
3 | 10328f92e7ec8735ea7846bf2c8254c2 | [] |
4 | 12fd4ef8f2cbbf98e0a5ced88258ddf3 | [] |
5 rows × 2 columns
# For the second approach we need to do a bit more work
# Here we set up a convenience function that takes a sparse feature list
# and computes pairwise similarities between each item in the list
def jaccard_sims(feature_df, name, thres):
    ''' Score every ordered pair of rows and keep the pairs above a threshold.

        Args:
            feature_df: DataFrame with an 'md5' column and a feature column.
            name: label of the feature column to compare.
            thres: a pair is kept only when its score is strictly greater
                   than this value.

        Returns:
            A list of {'source': md5, 'target': md5, 'sim': score} dicts,
            where the score is whatever jaccard_sim returns for the two
            rows' features. Rows whose md5s are equal are never compared
            with each other, and each qualifying pair is reported in both
            directions (a->b and b->a).
    '''
    rows = list(zip(feature_df['md5'].tolist(), feature_df[name].tolist()))
    matches = []
    for src_md5, src_features in rows:
        for dst_md5, dst_features in rows:
            if src_md5 != dst_md5:
                score = jaccard_sim(src_features, dst_features)
                if score > thres:
                    matches.append({'source': src_md5, 'target': dst_md5, 'sim': score})
    return matches
def jaccard_sim(features1, features2):
    ''' Compute the Jaccard similarity between two collections of features.

        Both inputs are reduced to sets (duplicates are ignored) and the
        score is |intersection| / |union|, i.e. the Jaccard index. Dividing
        by the size of the larger set instead would overstate the
        similarity whenever either set has elements the other lacks.

        Args:
            features1: iterable of hashable features.
            features2: iterable of hashable features.

        Returns:
            A float in [0.0, 1.0]; 0.0 when both inputs are empty.
    '''
    set1 = set(features1)
    set2 = set(features2)
    union_size = len(set1 | set2)
    # Two empty inputs have no elements to compare; report no similarity
    # instead of dividing by zero.
    if not union_size:
        return 0.0
    return len(set1 & set2) / float(union_size)
Here we're using the super awesome Neo4j as both an indexer and graph database.
Neo4j also incorporates Lucene based indexing so not only can you capture a rich set of relationships between your data entities but searches and queries are super quick.
Note: All images were captured by simply going to http://localhost:7474/browser/ (Neo4j Browser) and making some queries.
# First just add all the nodes: each sample is labelled 'exe' plus 'bad' or 'good'
# and named with the first six characters of its md5
for md5_group, verdict in ((md5s_bad, 'bad'), (md5s_good, 'good')):
    for md5 in md5_group:
        c.add_node(md5, md5[:6], ['exe', verdict])

# Store the ssdeep sims as relationships
for _, ssdeep_row in ssdeep.iterrows():
    source_md5 = ssdeep_row['md5']
    for sim_info in ssdeep_row['sim_list']:
        c.add_rel(source_md5, sim_info['md5'], 'ssdeep')

# Compute the pairwise similarities for imported symbols, warnings and strings
# (in that order) and store each as relationships of the given type.
# Each entry: (feature frame, feature column, threshold, relationship type)
similarity_specs = [
    (df_imports, 'imported_symbols', .8, 'imports'),
    (df_warnings, 'pe_warning_strings', .5, 'warnings'),
    (df_strings, 'string_list', .7, 'strings'),
]
for feature_df, column, threshold, rel_type in similarity_specs:
    sims = jaccard_sims(feature_df, column, threshold)
    for sim_info in sims:
        c.add_rel(sim_info['source'], sim_info['target'], rel_type)


# Filter the PE IDs: keep only rows where at least one match does not mention
# 'Microsoft Visual C' (rows with an empty match_list are dropped as well)
def _has_non_msvc_match(matches):
    return any('Microsoft Visual C' not in match for match in matches)


criterion = df_peids['match_list'].map(_has_non_msvc_match)
df_peids = df_peids[criterion]

# Compute the pairwise similarities between the remaining peids and store as relationships
sims = jaccard_sims(df_peids, 'match_list', .5)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'peids')
print(df_peids['match_list'])
7 [Installer VISE Custom] 13 [Safeguard 1.03 -> Simonzh] 26 [Borland Delphi 3.0 (???)] 27 [Borland Delphi 3.0 (???)] 36 [Borland Delphi 3.0 (???)] 52 [Microsoft Visual Basic v5.0 - v6.0] 58 [Armadillo v4.x] 65 [UPX v1.25 (Delphi) Stub] 66 [Borland Delphi 3.0 (???)] 68 [Borland Delphi 3.0 (???)] 71 [Microsoft Visual Basic v5.0] 73 [Safeguard 1.03 -> Simonzh] 75 [Borland Delphi 3.0 (???)] 76 [UPX -> www.upx.sourceforge.net] 79 [BobSoft Mini Delphi -> BoB / BobSoft] 81 [UPX v0.71 - v0.72, tElock v0.7x - v0.84] 82 [Borland Delphi 4.0] 84 [Pack Master v1.0, PEX v0.99] 85 [UPX v1.25 (Delphi) Stub] 94 [Dev-C++ v5] 97 [ASPack v1.06b] 99 [Upack v0.399 -> Dwing] Name: match_list, dtype: object
# Now run some graph queries against Neo4j
from py2neo import neo4j
graph_db = neo4j.GraphDatabaseService()

# Every relationship (in either direction) that joins a 'bad' node to a 'good' node
bad_good_cypher = "match (n:bad)-[r]-(m:good) return n.md5, labels(n),type(r), m.md5, labels(m)"
query = neo4j.CypherQuery(graph_db, bad_good_cypher)
for record in query.stream():
    v = record.values
    bad_md5, bad_labels, rel_type, good_md5, good_labels = v[0], v[1], v[2], v[3], v[4]
    print('%s(%s) ---%s--> %s(%s)' % (bad_md5, bad_labels, rel_type, good_md5, good_labels))
2d09ca902990545fec9ac190b0338b50([u'bad', u'exe']) ---peids--> 41636f77ad6d9a396ea34e4786b96f2b([u'exe', u'good']) 2d09ca902990545fec9ac190b0338b50([u'bad', u'exe']) ---peids--> 41636f77ad6d9a396ea34e4786b96f2b([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 52744454c74fac9fcc8f5efb6418c9b4([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 52744454c74fac9fcc8f5efb6418c9b4([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 52744454c74fac9fcc8f5efb6418c9b4([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 52744454c74fac9fcc8f5efb6418c9b4([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 5caea70e05a942e9ee9e02d178a28b1a([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 5caea70e05a942e9ee9e02d178a28b1a([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 5caea70e05a942e9ee9e02d178a28b1a([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 5caea70e05a942e9ee9e02d178a28b1a([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 
2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> 73b459178a48657d5e92c41ec1fdd716([u'exe', u'good']) 2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> 74855d03ee3999e56b785a33b956245d([u'exe', u'good']) 2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 2d094b6c69020091b68d1bcf5d11fa4b([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 2d09546831b17d2cc0583362b6d312ae([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 2d09cc92bbe29d96bb3a91b350d1725f([u'bad', u'exe']) ---peids--> a3661a61f7e7b7d37e6d037ed747e7ef([u'exe', u'good']) 1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> e87f31116298d4d4839e50fce87b9f6f([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> e87f31116298d4d4839e50fce87b9f6f([u'exe', u'good']) 
1cea13cf888cd8ce4f869029f1dbb601([u'bad', u'exe']) ---warnings--> e87f31116298d4d4839e50fce87b9f6f([u'exe', u'good']) d94da41e7e809f7366971b3b50f8ef68([u'bad', u'exe']) ---warnings--> e87f31116298d4d4839e50fce87b9f6f([u'exe', u'good'])
# List every relationship touching one particular sample: the relationship type
# plus the md5 and labels of the node on the other end
neo_q = 'match (s{md5:"74855d03ee3999e56b785a33b956245d"})-[r]-(t) return type(r),t.md5,labels(t)'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    print(v)
(u'warnings', u'a3661a61f7e7b7d37e6d037ed747e7ef', [u'exe', u'good']) (u'warnings', u'c6b6a394c597dfca84a2e98a9c0dc58f', [u'exe', u'good']) (u'warnings', u'a3661a61f7e7b7d37e6d037ed747e7ef', [u'exe', u'good']) (u'warnings', u'c6b6a394c597dfca84a2e98a9c0dc58f', [u'exe', u'good']) (u'peids', u'2d094b6c69020091b68d1bcf5d11fa4b', [u'bad', u'exe']) (u'peids', u'2d09546831b17d2cc0583362b6d312ae', [u'bad', u'exe']) (u'peids', u'2d09cc92bbe29d96bb3a91b350d1725f', [u'bad', u'exe']) (u'peids', u'73b459178a48657d5e92c41ec1fdd716', [u'exe', u'good']) (u'peids', u'a3661a61f7e7b7d37e6d037ed747e7ef', [u'exe', u'good']) (u'peids', u'2d094b6c69020091b68d1bcf5d11fa4b', [u'bad', u'exe']) (u'peids', u'2d09546831b17d2cc0583362b6d312ae', [u'bad', u'exe']) (u'peids', u'2d09cc92bbe29d96bb3a91b350d1725f', [u'bad', u'exe']) (u'peids', u'73b459178a48657d5e92c41ec1fdd716', [u'exe', u'good']) (u'peids', u'a3661a61f7e7b7d37e6d037ed747e7ef', [u'exe', u'good'])
# All shortest paths of at most two hops between one particular sample and any 'bad' node
neo_q = 'match (s{md5:"73b459178a48657d5e92c41ec1fdd716"}),(t:bad), p=allShortestPaths((s)-[*..2]-(t)) return p'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    # The query returns a single column, the path
    print(v[0])
(76)-[:"peids"]->(18)-[:"warnings"]->(0) (76)-[:"peids"]->(18)-[:"warnings"]->(0) (76)-[:"peids"]->(18)-[:"warnings"]->(0) (76)-[:"peids"]->(18)-[:"warnings"]->(0) (76)-[:"warnings"]->(9) (76)-[:"warnings"]->(9) (76)-[:"peids"]->(16) (76)-[:"peids"]->(16) (76)-[:"peids"]->(18) (76)-[:"peids"]->(18) (76)-[:"peids"]->(25) (76)-[:"peids"]->(25) (76)-[:"peids"]->(18)-[:"warnings"]->(27) (76)-[:"peids"]->(18)-[:"warnings"]->(27) (76)-[:"peids"]->(18)-[:"warnings"]->(27) (76)-[:"peids"]->(18)-[:"warnings"]->(27) (76)-[:"peids"]->(18)-[:"warnings"]->(43) (76)-[:"peids"]->(18)-[:"warnings"]->(43) (76)-[:"peids"]->(18)-[:"warnings"]->(43) (76)-[:"peids"]->(18)-[:"warnings"]->(43) (76)-[:"warnings"]->(46) (76)-[:"warnings"]->(46)