# Let's start interacting with workbench. Please note there is NO specific
# client for workbench; just use the ZeroRPC Python, Node.js, or CLI interfaces.
import os

import pandas as pd
import zerorpc

c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")


# Load in 100 PE Files
def workbench_load(file_list, type_tag='exe'):
    """Store every file in ``file_list`` in workbench and collect the results.

    Args:
        file_list: Paths of the files to upload; each is read in binary mode.
        type_tag: Type tag handed to ``c.store_sample`` for every file.
            Defaults to 'exe', the tag this script has always used.

    Returns:
        A list with the value returned by ``c.store_sample`` for each file,
        in ``file_list`` order (the rest of the script treats them as md5s).
    """
    md5_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            md5_list.append(c.store_sample(f.read(), filename, type_tag))
    # Single-argument print(...) prints the same text on Python 2 and 3.
    print('Files loaded: %d' % len(md5_list))
    return md5_list


file_list = [os.path.join('../data/pe/bad', child) for child in os.listdir('../data/pe/bad')]
md5s_bad = workbench_load(file_list)
file_list = [os.path.join('../data/pe/good', child) for child in os.listdir('../data/pe/good')]
md5s_good = workbench_load(file_list)
md5_list = md5s_bad + md5s_good
md5_list[:5]  # bare expression kept from the notebook; it only displays there

# Compute pe_features on all files of type pe, just pull back the sparse features
imports = c.batch_work_request(
    'pe_features',
    {'md5_list': md5_list, 'subkeys': ['md5', 'sparse_features.imported_symbols']})
imports

# Sending generator output into a Pandas DataFrame constructor
df_imports = pd.DataFrame(imports)
df_imports.head()

# Okay, so we have lots of PE file attributes that we might want to look at;
# let's do a bunch.
# Note: We're invoking a couple of new workers: strings and pe_peid

# Compute pe_features on all files of type pe, just pull back the sparse features
df_warnings = pd.DataFrame(c.batch_work_request(
    'pe_features',
    {'type_tag': 'exe', 'subkeys': ['md5', 'sparse_features.pe_warning_strings']}))
df_warnings.head()

# Compute strings on all files of type pe, just pull back the string_list
df_strings = pd.DataFrame(c.batch_work_request(
    'strings',
    {'type_tag': 'exe', 'subkeys': ['md5', 'string_list']}))
df_strings.head()

# Compute pe_peid on all files of type pe, just pull back the match_list
df_peids = pd.DataFrame(c.batch_work_request(
    'pe_peid',
    {'type_tag': 'exe', 'subkeys': ['md5', 'match_list']}))
df_peids.head()

# For the first approach workbench already has a worker
# that does SSDeep Sims
ssdeep = pd.DataFrame(c.batch_work_request('pe_deep_sim', {'type_tag': 'exe'}))
ssdeep.head()


# For the second approach we need to do a bit more work.
# Here we set up a convenience function that takes a sparse feature list
# and computes pairwise similarities between each item in the list.
def jaccard_sims(feature_df, name, thres):
    """Compute pairwise Jaccard similarities between the rows of a DataFrame.

    Args:
        feature_df: DataFrame with an 'md5' column and a ``name`` column
            whose cells are iterables of hashable features.
        name: Column of ``feature_df`` holding the feature collections.
        thres: Only pairs whose similarity is strictly greater than this
            value are reported.

    Returns:
        A list of ``{'source': md5, 'target': md5, 'sim': float}`` dicts.
        Every ordered pair is visited, so a similar pair appears twice
        (once in each direction); rows sharing the same md5 are skipped.
    """
    md5s = feature_df['md5'].tolist()
    features = feature_df[name].tolist()
    # Materialize once: zip() is a one-shot iterator on Python 3 and the
    # pairs are walked by both the outer and the inner loop.
    pairs = list(zip(md5s, features))
    sim_info_list = []
    for md5_source, features_source in pairs:
        for md5_target, features_target in pairs:
            if md5_source == md5_target:
                continue
            sim = jaccard_sim(features_source, features_target)
            if sim > thres:
                sim_info_list.append(
                    {'source': md5_source, 'target': md5_target, 'sim': sim})
    return sim_info_list


def jaccard_sim(features1, features2):
    """Return the Jaccard similarity of two feature collections.

    The Jaccard index is ``|A & B| / |A | B|``. The previous implementation
    divided by ``max(len(A), len(B))``, which is not the Jaccard index and
    over-states the similarity whenever the sets only partially overlap.

    Args:
        features1: Iterable of hashable features.
        features2: Iterable of hashable features.

    Returns:
        A float in [0, 1]; 0.0 when both collections are empty.
    """
    set1 = set(features1)
    set2 = set(features2)
    union_size = len(set1 | set2)
    if not union_size:
        # Both collections are empty: define the similarity as 0.
        return 0.0
    return len(set1 & set2) / float(union_size)


# First just add all the nodes
for md5 in md5s_bad:
    c.add_node(md5, md5[:6], ['exe', 'bad'])
for md5 in md5s_good:
    c.add_node(md5, md5[:6], ['exe', 'good'])

# Store the ssdeep sims as relationships
for _, row in ssdeep.iterrows():
    for sim_info in row['sim_list']:
        c.add_rel(row['md5'], sim_info['md5'], 'ssdeep')

# Compute the Jaccard Index between imported symbols and store as relationships
sims = jaccard_sims(df_imports, 'imported_symbols', .8)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'imports')

# Compute the Jaccard Index between warnings and store as relationships
sims = jaccard_sims(df_warnings, 'pe_warning_strings', .5)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'warnings')

# Compute the Jaccard Index between strings and store as relationships
sims = jaccard_sims(df_strings, 'string_list', .7)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'strings')

# Removing PE IDs that show
# Microsoft Visual C
# NOTE(review): this keeps a row when at least one of its matches does NOT
# contain 'Microsoft Visual C'. Rows with an empty match_list are dropped too,
# because any() of an empty sequence is False. Confirm this is the intended
# filter.
criterion = df_peids['match_list'].map(
    lambda x: any('Microsoft Visual C' not in y for y in x))
df_peids = df_peids[criterion]

# Compute the Jaccard Index between peids and store as relationships
sims = jaccard_sims(df_peids, 'match_list', .5)
for sim_info in sims:
    c.add_rel(sim_info['source'], sim_info['target'], 'peids')

# Single-argument print(...) prints the same text on Python 2 and 3.
print(df_peids['match_list'])

# Now run some graph queries against Neo4j
from py2neo import neo4j

graph_db = neo4j.GraphDatabaseService()

# Every relationship that links a 'bad' node with a 'good' node
query = neo4j.CypherQuery(graph_db, "match (n:bad)-[r]-(m:good) return n.md5, labels(n),type(r), m.md5, labels(m)")
for record in query.stream():
    v = record.values
    print('%s(%s) ---%s--> %s(%s)' % (v[0], v[1], v[2], v[3], v[4]))

# Everything directly related to one specific sample
neo_q = 'match (s{md5:"74855d03ee3999e56b785a33b956245d"})-[r]-(t) return type(r),t.md5,labels(t)'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    print(v)

# All shortest paths (at most two hops) from one sample to any 'bad' node
neo_q = 'match (s{md5:"73b459178a48657d5e92c41ec1fdd716"}),(t:bad), p=allShortestPaths((s)-[*..2]-(t)) return p'
query = neo4j.CypherQuery(graph_db, neo_q)
for record in query.stream():
    v = record.values
    print(v[0])