# Let's start interacting with workbench. Please note there is NO specific client for workbench;
# just use the ZeroRPC Python, Node.js, or CLI interfaces.
import zerorpc
# Create a ZeroRPC client and connect it to tcp://127.0.0.1:4242 (this machine,
# port 4242); a workbench server is assumed to be listening there.
# 'c' is the module-level handle used for every workbench call below.
c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")

# Load in 100 PE Files
def workbench_load(file_list):
    ''' Store every file in file_list as an 'exe' sample in workbench.

        Each file is opened in binary mode and its bytes, together with the
        path it was read from, are handed to c.store_sample on the
        module-level ZeroRPC client.

        Args:
            file_list: iterable of paths of the files to upload.

        Returns:
            A list holding whatever c.store_sample returned for each file, in
            input order (presumably the sample's md5 -- confirm against the
            workbench API).
    '''
    md5_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            md5_list.append(c.store_sample(f.read(), filename, 'exe'))
    # Single-argument parenthesised form: prints the same text on Python 2
    # and is valid syntax on Python 3 (the bare print statement is not).
    print('Files loaded: %d' % len(md5_list))
    return md5_list

import os
# Upload everything under ../data/pe/bad and keep what workbench_load returns
file_list = [
    os.path.join('../data/pe/bad', child)
    for child in os.listdir('../data/pe/bad')
]
md5s_bad = workbench_load(file_list)

# Upload everything under ../data/pe/good the same way
file_list = [
    os.path.join('../data/pe/good', child)
    for child in os.listdir('../data/pe/good')
]
md5s_good = workbench_load(file_list)

# Combined list with the bad samples first; the bare slice only displays
# the first five entries when run interactively
md5_list = list(md5s_bad)
md5_list.extend(md5s_good)
md5_list[:5]

# Run the pe_features worker on every md5 we just loaded, requesting only the
# md5 and the sparse imported-symbols feature
imports = c.batch_work_request(
    'pe_features',
    dict(md5_list=md5_list, subkeys=['md5', 'sparse_features.imported_symbols']))
imports

# Hand the generator output directly to the Pandas DataFrame constructor
import pandas as pd
df_imports = pd.DataFrame(imports)
df_imports.head()

# There are plenty of other PE file attributes worth looking at, so pull a few more.
# Note: this invokes a couple of new workers: strings and pe_peid

# pe_features again, this time for everything tagged 'exe' and only the
# sparse PE warning strings
df_warnings = pd.DataFrame(c.batch_work_request(
    'pe_features',
    dict(type_tag='exe', subkeys=['md5', 'sparse_features.pe_warning_strings'])))
df_warnings.head()

# strings worker for everything tagged 'exe', keeping only the string_list
df_strings = pd.DataFrame(c.batch_work_request(
    'strings',
    dict(type_tag='exe', subkeys=['md5', 'string_list'])))
df_strings.head()

# pe_peid worker for everything tagged 'exe', keeping only the match_list
df_peids = pd.DataFrame(c.batch_work_request(
    'pe_peid',
    dict(type_tag='exe', subkeys=['md5', 'match_list'])))
df_peids.head()

# First similarity approach: workbench already ships a worker that does SSDeep sims
ssdeep = pd.DataFrame(c.batch_work_request('pe_deep_sim', dict(type_tag='exe')))
ssdeep.head()

# For the second approach we need to do a bit more work.
# Here we set up a convenience function that takes a sparse feature list
# and computes pairwise similarities between each item in the list
def jaccard_sims(feature_df, name, thres):
    ''' Score every ordered pair of rows in feature_df with jaccard_sim.

        Args:
            feature_df: frame with an 'md5' column and a feature column.
            name: label of the feature column to compare.
            thres: a pair is reported only when its score is strictly
                   greater than this value.

        Returns:
            A list of {'source', 'target', 'sim'} dicts. Rows whose md5s are
            equal are never compared, and because both orderings of a pair
            are visited a qualifying pair is reported once per direction.
    '''
    # Pair each md5 with its feature value once, then walk the pairing twice
    rows = list(zip(feature_df['md5'].tolist(), feature_df[name].tolist()))
    sim_info_list = []
    for src_md5, src_features in rows:
        for dst_md5, dst_features in rows:
            if src_md5 != dst_md5:
                score = jaccard_sim(src_features, dst_features)
                if score > thres:
                    sim_info_list.append(dict(source=src_md5, target=dst_md5, sim=score))
    return sim_info_list

def jaccard_sim(features1, features2):
    ''' Compute the Jaccard similarity between two feature collections.

        Both arguments are converted to sets, so duplicates and ordering are
        ignored. The result is |A & B| / |A | B|.

        Args:
            features1: iterable of hashable features.
            features2: iterable of hashable features.

        Returns:
            A float in [0.0, 1.0]; 0.0 when both collections are empty.
    '''
    set1 = set(features1)
    set2 = set(features2)
    # Divide by the size of the union (not by the larger set's size) so the
    # value is the actual Jaccard index and partial overlaps are not inflated.
    union_size = len(set1 | set2)
    if not union_size:
        return 0.0
    # float() keeps this a true division on Python 2 as well
    return len(set1 & set2) / float(union_size)

# First just add all the nodes: one add_node call per sample, passing the md5,
# its first six characters, and the labels 'exe' plus 'bad' or 'good'
for md5 in md5s_bad:
    c.add_node(md5, md5[:6], ['exe','bad'])
for md5 in md5s_good:
    c.add_node(md5, md5[:6], ['exe','good'])

# Store the ssdeep sims as relationships: every entry in a row's sim_list
# becomes an 'ssdeep' relationship from that row's md5 to the entry's md5
for _, row in ssdeep.iterrows():
    for sim_info in row['sim_list']:
        c.add_rel(row['md5'], sim_info['md5'], 'ssdeep')

# Compute the Jaccard Index between imported symbols and store as relationships
sims = jaccard_sims(df_imports, 'imported_symbols', .8)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'imports')

# Compute the Jaccard Index between warnings and store as relationships
sims = jaccard_sims(df_warnings, 'pe_warning_strings', .5)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'warnings')

# Compute the Jaccard Index between strings and store as relationships
sims = jaccard_sims(df_strings, 'string_list', .7)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'strings')

# Removing PE IDs that show Microsoft Visual C
# NOTE(review): as written this keeps a row when at least one of its matches
# does NOT contain 'Microsoft Visual C'. Rows with an empty match_list are
# dropped, and rows mixing Visual C with other matches are kept. Confirm that
# this is the intended filter.
criterion = df_peids['match_list'].map(lambda x:any('Microsoft Visual C' not in y for y in x))
df_peids = df_peids[criterion]

# Compute the Jaccard Index between peids and store as relationships
sims = jaccard_sims(df_peids, 'match_list', .5)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'peids')
# Single-argument parenthesised print: same output on Python 2, valid on Python 3
print(df_peids['match_list'])

# Now run some graph queries against Neo4j
from py2neo import neo4j
graph_db = neo4j.GraphDatabaseService()

# Every relationship (in either direction) between a node labelled 'bad' and a
# node labelled 'good': print both md5s, their labels and the relationship type
query = neo4j.CypherQuery(graph_db, "match (n:bad)-[r]-(m:good) return n.md5, labels(n),type(r), m.md5, labels(m)")
for record in query.stream():
    v = record.values
    # The whole formatted string is the single print argument, so the output
    # is the same on Python 2 and the syntax is valid on Python 3
    print('%s(%s) ---%s--> %s(%s)' % (v[0], v[1], v[2], v[3], v[4]))

# Everything directly related to one specific sample: relationship type plus
# the neighbour's md5 and labels
neo_q = 'match (s{md5:"74855d03ee3999e56b785a33b956245d"})-[r]-(t) return type(r),t.md5,labels(t)'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    print(v)

# All shortest paths of at most two hops from one specific sample to any node
# labelled 'bad'
neo_q = 'match (s{md5:"73b459178a48657d5e92c41ec1fdd716"}),(t:bad), p=allShortestPaths((s)-[*..2]-(t)) return p'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    print(v[0])