import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
pandas version is 0.13.0 numpy version is 1.7.1 scikit-learn version is 0.14.1 matplotlib version is 1.4.1
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Global plot styling applied to every figure below.
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0
def plot_cm(cm, labels):
    """Print per-class percentages for a confusion matrix and plot it.

    Parameters
    ----------
    cm : 2-D array of raw counts, as returned by sklearn's confusion_matrix.
    labels : list of class names, in the same order as the rows/cols of cm.
    """
    # Compute percentages: divide each row by its own total so every row
    # sums to ~100%.  keepdims keeps the sums as a column vector for
    # row-wise broadcasting (same result as the old np.matrix(...).T trick).
    percent = (cm * 100.0) / cm.sum(axis=1, keepdims=True)
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    fig.colorbar(cax)
    # NOTE(review): setting tick labels without fixing tick positions relies
    # on matshow's default ticks (one per cell, offset by one) — standard
    # for this snippet, but brittle across matplotlib versions.
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
def _char_runs(flags):
    """Return the lengths of each maximal run of consecutive True values
    in an iterable of booleans."""
    runs = []
    length = 0
    for flag in flags:
        if flag:
            length += 1
        elif length:
            # Run just ended: record it and start counting afresh.
            runs.append(length)
            length = 0
    if length:
        # Flush a run that extends to the end of the input.
        runs.append(length)
    return runs


def extract_character_info(string):
    """Compute character run-length features for a string.

    Returns a 3-tuple of lists (lowercase_runs, uppercase_runs, digit_runs),
    where each list holds the length of every maximal run of consecutive
    lowercase letters / uppercase letters / digits.

    >>> extract_character_info('abXY12cd')
    ([2, 2], [2], [2])
    """
    s = str(string)
    lowercase_runs = _char_runs(c.islower() for c in s)
    uppercase_runs = _char_runs(c.isupper() for c in s)
    digit_runs = _char_runs(c.isdigit() for c in s)
    return lowercase_runs, uppercase_runs, digit_runs
def _run_stats(runs):
    """Return (longest, average) for a list of run lengths; (0, 0) if empty."""
    if runs:
        return np.max(runs), np.mean(runs)
    return 0, 0


def extract_features(data):
    """Flatten one parsed .results record (nested dict) into a flat feature
    dict suitable for pandas.DataFrame.from_records.

    Reads data['metadata'] (sha256 / file_size / entropy) and
    data['characteristics']['java'] (class/method/interface info).
    On a missing required key the partially-filled dict is returned and the
    error is logged, so a bad record cannot abort a whole batch.
    """
    features = {}
    try:
        features['sha256'] = data['metadata']['sha256']
        features['size'] = data['metadata']['file_size']
        features['entropy'] = data['metadata']['entropy']

        java = data['characteristics']['java']

        # Source file name is optional in the raw record.
        features['source file'] = java.get('sourcefile', 'No Source File')

        # One binary column per access permission (e.g. 'acc_public'),
        # plus the total count.
        if 'access_permissions' in java:
            features['ap_count'] = len(java['access_permissions'])
            for ap in java['access_permissions']:
                features[str(ap).replace(" ", "_").lower()] = 1

        # Class-name shape features.
        class_name = java['class_name']
        features['class name'] = class_name
        features['class_name_slash_count'] = class_name.count('/')
        features['class_name_length'] = len(class_name)
        cn_lower, cn_upper, cn_digit = extract_character_info(class_name)
        (features['class_name_lowercase_run_longest'],
         features['class_name_lowercase_run_avg']) = _run_stats(cn_lower)
        (features['class_name_uppercase_run_longest'],
         features['class_name_uppercase_run_avg']) = _run_stats(cn_upper)
        (features['class_name_digit_run_longest'],
         features['class_name_digit_run_avg']) = _run_stats(cn_digit)

        features['major version'] = java['major_version']
        features['minor version'] = java['minor_version']

        # Method-name features: pool the character runs of every method name
        # in the class, then take longest/average over the pooled runs.
        features['method names'] = java.get('method_names', [])
        features['methods_count'] = len(features['method names'])
        lower_runs, upper_runs, digit_runs = [], [], []
        for method in features['method names']:
            lc, uc, dg = extract_character_info(method)
            lower_runs.extend(lc)
            upper_runs.extend(uc)
            digit_runs.extend(dg)
        (features['method_name_lowercase_run_longest'],
         features['method_name_lowercase_run_avg']) = _run_stats(lower_runs)
        (features['method_name_uppercase_run_longest'],
         features['method_name_uppercase_run_avg']) = _run_stats(upper_runs)
        (features['method_name_digit_run_longest'],
         features['method_name_digit_run_avg']) = _run_stats(digit_runs)

        features['interfaces'] = java.get('interfaces', [])
        features['interface_count'] = len(features['interfaces'])
        features['constant_pool_count'] = java['const_pool_count']
    except KeyError as ke:
        # Use .get here so a record missing 'metadata' doesn't raise a
        # second KeyError from inside the handler itself.
        sha = data.get('metadata', {}).get('sha256', '<unknown>')
        print('ERROR: %s %s' % (ke, sha))
    return features
def load_files(file_list):
    """Parse each JSON .results file in file_list and return a list of flat
    feature dicts (one per file) produced by extract_features."""
    import json
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            # json.load streams straight from the handle; no need to slurp
            # the file into memory first with f.read().
            features_list.append(extract_features(json.load(f)))
    return features_list
# Good files
import glob
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)
Files: 500
# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)
Files: 520
df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()
acc_abstract | acc_annotation | acc_enum | acc_final | acc_interface | acc_public | acc_super | acc_synthetic | ap_count | class name | class_name_digit_run_avg | class_name_digit_run_longest | class_name_length | class_name_lowercase_run_avg | class_name_lowercase_run_longest | class_name_slash_count | class_name_uppercase_run_avg | class_name_uppercase_run_longest | constant_pool_count | entropy | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 3 | com/google/common/collect/ForwardingConcurrentMap | 0 | 0 | 49 | 6.000000 | 9 | 4 | 1.0 | 1 | 54 | 4.990507 | ... |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | org/apache/hadoop/io/compress/GzipCodec$GzipOu... | 0 | 0 | 82 | 4.846154 | 8 | 5 | 1.5 | 5 | 39 | 5.205063 | ... |
2 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 2 | com/google/common/collect/Multisets$Unmodifiab... | 0 | 0 | 62 | 6.625000 | 11 | 4 | 1.0 | 1 | 131 | 4.996721 | ... |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | hu/openig/mechanics/StaticDefensePlanner$1 | 1 | 1 | 42 | 5.666667 | 9 | 3 | 1.0 | 1 | 56 | 5.282413 | ... |
4 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | org/apache/commons/io/LineIterator | 0 | 0 | 34 | 4.666667 | 7 | 4 | 1.0 | 1 | 95 | 5.285082 | ... |
5 rows × 36 columns
df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()
acc_final | acc_public | acc_super | ap_count | class name | class_name_digit_run_avg | class_name_digit_run_longest | class_name_length | class_name_lowercase_run_avg | class_name_lowercase_run_longest | class_name_slash_count | class_name_uppercase_run_avg | class_name_uppercase_run_longest | constant_pool_count | entropy | interface_count | interfaces | major version | method names | method_name_digit_run_avg | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1 | 2 | Main | 0 | 0 | 4 | 3.0 | 3 | 0 | 1.000000 | 1 | 86 | 6.114522 | 0 | [] | 48 | [<init>, init] | 0 | ... |
1 | 0 | 0 | 1 | 1 | YdCdHX/VcZaXVjyy | 0 | 0 | 16 | 1.4 | 3 | 1 | 1.333333 | 2 | 52 | 5.539514 | 0 | [] | 49 | [<init>, ktCgxlqo, <clinit>] | 0 | ... |
2 | 0 | 1 | 1 | 2 | aOcMSp | 0 | 0 | 6 | 1.0 | 1 | 0 | 1.500000 | 2 | 159 | 5.953528 | 0 | [] | 49 | [<init>, gvuNr, <clinit>] | 0 | ... |
3 | 0 | 0 | 1 | 1 | a/zylasqwjlpbqyrwrr | 0 | 0 | 19 | 9.0 | 17 | 1 | 0.000000 | 0 | 478 | 6.348531 | 0 | [] | 49 | [<init>, eiaxyercdfvbgscpbv, yginlmcynkyuohnfh... | 0 | ... |
4 | 0 | 1 | 1 | 2 | tljpjunbjwtqlywm/sdnrybknlf | 0 | 0 | 27 | 13.0 | 16 | 1 | 0.000000 | 0 | 122 | 5.376762 | 0 | [] | 49 | [<init>, dvvwse, <clinit>] | 0 | ... |
5 rows × 31 columns
df = pd.concat([df_bad, df_good], ignore_index=True)
df.fillna(0, inplace=True)
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b873d90>
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 15000)
(0, 15000)
df.boxplot('entropy', 'label')
plt.ylabel('Entropy')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b8ab5d0>
df.boxplot(column='constant_pool_count', by='label')
plt.ylabel('Constant Pool Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b8b8650>
df.boxplot(column='constant_pool_count', by='label')
plt.xlabel('')
plt.ylabel('Constant Pool Count')
plt.title('')
plt.suptitle('')
plt.ylim(0, 1000)
(0, 1000)
df.boxplot(column='methods_count', by='label')
plt.ylabel('Number of Methods')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b8b1210>
df.boxplot(column='interface_count', by='label')
plt.ylabel('Number of Interfaces')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b68ced0>
# Shared seed / holdout fraction for the train_test_split calls below.
my_seed = 1022
my_tsize = .2
import sklearn.ensemble
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
# First model: only the cheap structural features (no name-run features yet).
simple_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version']
X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())
# NOTE(review): sklearn.cross_validation is never imported explicitly here —
# it only resolves if another import pulled the submodule in; an explicit
# `import sklearn.cross_validation` would be safer.
scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.927 (+/- 0.044)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 93.07% (94/101) benign/malicious: 6.93% (7/101) malicious/benign: 10.68% (11/103) malicious/malicious: 89.32% (92/103)
importances = zip(simple_features, clf_simple.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances):
print (str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)
1: entropy 0.29813 2: constant_pool_count 0.24814 3: size 0.14489 4: methods_count 0.08492 5: interface_count 0.08152 6: major version 0.05055 7: ap_count 0.02908 8: acc_public 0.02446 9: acc_final 0.01284 10: acc_abstract 0.00982 11: minor version 0.00623 12: acc_super 0.00541 13: acc_interface 0.00275 14: acc_enum 0.00125 15: acc_synthetic 2e-05 16: acc_annotation -0.0
# Collect the unique method names seen in malicious vs. benign samples,
# preserving first-seen order.  Seen-sets replace the original
# `dict not in list` membership tests, which were O(n^2) overall.
bad = []
good = []
seen_bad = set()
seen_good = set()
for strings, label in zip(df['method names'], df['label']):
    for name in strings:
        if label == 'malicious':
            if name not in seen_bad:
                seen_bad.add(name)
                bad.append({'method name': name})
        elif label == 'benign':
            if name not in seen_good:
                seen_good.add(name)
                good.append({'method name': name})
df_method_names_bad = pd.DataFrame.from_records(bad)
df_method_names_good = pd.DataFrame.from_records(good)
df_method_names_bad.head(50)
method name | |
---|---|
0 | <init> |
1 | init |
2 | ktCgxlqo |
3 | <clinit> |
4 | gvuNr |
5 | eiaxyercdfvbgscpbv |
6 | yginlmcynkyuohnfhe |
7 | mtyvzetsjhvnbyz |
8 | fxxhgjttqfavlooxcb |
9 | wyjgamzmowywjihkuuf |
10 | kgthsnqdqutacivcptong |
11 | qgasjqrogibkblyzourtq |
12 | glfouhczfxzyskaystx |
13 | mikczoanebdkwpyb |
14 | bwssduenvebnvgix |
15 | wafrcwijizypmitodmb |
16 | bfznyeevclzzxxqbw |
17 | jmzisxwtxhekbkl |
18 | szivddjiptybevduli |
19 | forwnxmgnutbtdwvptj |
20 | mwwmrvljafpkwzdiy |
21 | vvpbdzrhvvnzaieyi |
22 | qkkxoygluwwlnwbxu |
23 | dvvwse |
24 | c |
25 | k |
26 | main |
27 | writeEmbeddedFile |
28 | bootstrap |
29 | getJreExecutable |
30 | addExtension |
31 | findInDir |
32 | normalize |
33 | dissect |
34 | class$ |
35 | tgznSIAR |
36 | kWfVWtw |
37 | BodFzDax |
38 | xXVBwx |
39 | VdJiGyZfj |
40 | taddhnwrkj |
41 | C |
42 | ALLATORI_DEMO |
43 | jvsamhqyvgekftsj |
44 | knjkb |
45 | B |
46 | cmjnkr |
47 | jmdpes |
48 | tqffjybms |
49 | vtvtmh |
50 rows × 1 columns
df_method_names_good.head(50)
method name | |
---|---|
0 | <init> |
1 | delegate |
2 | putIfAbsent |
3 | remove |
4 | replace |
5 | resetState |
6 | comparator |
7 | createElementSet |
8 | elementSet |
9 | descendingMultiset |
10 | firstEntry |
11 | lastEntry |
12 | pollFirstEntry |
13 | pollLastEntry |
14 | headMultiset |
15 | subMultiset |
16 | tailMultiset |
17 | invoke |
18 | hasNext |
19 | isValidLine |
20 | next |
21 | nextLine |
22 | close |
23 | closeQuietly |
24 | exec |
25 | getInitial |
26 | getIntermed |
27 | getFinal |
28 | max |
29 | outputSchema |
30 | estimateLength |
31 | appendTo |
32 | getXPath |
33 | run |
34 | secToHMS |
35 | contribute |
36 | onBeforeRender |
37 | setCloseEvent |
38 | setSelectEvent |
39 | setChangeEvent |
40 | setSource |
41 | statement |
42 | setDocumentLocator |
43 | startDocument |
44 | endDocument |
45 | startPrefixMapping |
46 | endPrefixMapping |
47 | startElement |
48 | endElement |
49 | characters |
50 rows × 1 columns
df.boxplot('method_name_lowercase_run_longest', 'label')
plt.ylabel('Max length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x112560510>
df.boxplot('method_name_lowercase_run_avg', 'label')
plt.ylabel('Avg length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x113767090>
df.boxplot('method_name_uppercase_run_longest', 'label')
plt.ylabel('Max length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x1136dd650>
df.boxplot('method_name_uppercase_run_avg', 'label')
plt.ylabel('Avg length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x10ea06190>
df.boxplot('method_name_digit_run_longest', 'label')
plt.ylabel('Max length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x10ea53810>
df.boxplot('method_name_digit_run_avg', 'label')
plt.ylabel('Avg length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x10eabb2d0>
import sklearn.ensemble
clf_methods = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
method_name_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count',
'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest']
X = df.as_matrix(method_name_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_methods, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.950 (+/- 0.037)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_methods.fit(X_train, y_train)
y_pred = clf_methods.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 94.90% (93/98) benign/malicious: 5.10% (5/98) malicious/benign: 10.38% (11/106) malicious/malicious: 89.62% (95/106)
# Rank the method-name model's features by importance and show the top 15.
# (This cell was collapsed onto one line in the export; reformatted.)
importances = sorted(zip(method_name_features, clf_methods.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:15]):
    print(('%d:' % (idx + 1)).ljust(4), im[0].ljust(35), round(im[1], 5))
# Peek at the first 20 benign class names.
for idx, gcn in enumerate(df_good['class name']):
    print(gcn)
    if idx == 19:
        break
com/google/common/collect/ForwardingConcurrentMap org/apache/hadoop/io/compress/GzipCodec$GzipOutputStream$ResetableGZIPOutputStream com/google/common/collect/Multisets$UnmodifiableSortedMultiset hu/openig/mechanics/StaticDefensePlanner$1 org/apache/commons/io/LineIterator org/apache/pig/builtin/IntMax org/apache/commons/lang/time/FastDateFormat$StringLiteral hu/openig/screen/items/ResearchProductionScreen$15 org/dom4j/XPathException threadWordlistExec org/odlabs/wiquery/ui/autocomplete/AbstractAutocompleteComponent$InnerAutocomplete org/xml/sax/ContentHandler org/apache/commons/httpclient/protocol/SSLProtocolSocketFactory org/apache/pig/PigException org/junit/runners/Enclosed org/jets3t/service/io/ProgressMonitoredInputStream org/apache/pig/impl/util/CastUtils com/google/common/base/Joiner$2 com/google/common/io/CharStreams org/apache/commons/compress/archivers/cpio/CpioArchiveOutputStream
# Peek at the first 20 malicious class names for comparison.
for idx, gcn in enumerate(df_bad['class name']):
    print(gcn)
    if idx == 19:
        break
Main YdCdHX/VcZaXVjyy aOcMSp a/zylasqwjlpbqyrwrr tljpjunbjwtqlywm/sdnrybknlf Mainer Main mNIJnGIOkm/Payload aHrMCrboe/chspSxY Main OSrAfQWThe/SHLeanN hhIji/XQDODV a/dwrwbjyhllzu H enudwwlhl/wsshvntsenuwajehdujlchpms Main iACVKaBQCV/HhtBSGn GondadGondadExp Main enudwwlhl/yhfwcgjacjjauyvut
df.boxplot('class_name_length', 'label')
plt.ylabel('Class Name Length')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b7d9590>
df.boxplot('class_name_slash_count', 'label')
plt.ylabel('Class Name Slash Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x12b64ee10>
df.boxplot('class_name_lowercase_run_longest', 'label')
plt.ylabel('Max Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x11278b750>
df.boxplot('class_name_lowercase_run_avg', 'label')
plt.ylabel('Avg Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
<matplotlib.text.Text at 0x1127ce7d0>
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'interface_count', 'major version', 'methods_count',
'size', 'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X = df.as_matrix(all_features)
y = np.array(df['label'].tolist())
labels = ['good', 'bad']
scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.993 (+/- 0.018)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 97.06% (99/102) benign/malicious: 2.94% (3/102) malicious/benign: 0.00% (0/102) malicious/malicious: 100.00% (102/102)
# Probability of the second class (column 1) — presumably 'malicious', since
# sklearn orders classes_ sorted; verify via clf_all.classes_.
y_probs = clf_all.predict_proba(X_test)[:,1]
# A higher threshold demands more confidence before calling a sample
# malicious: fewer benign false positives, at the cost of malicious recall.
thres = .80 # This can be set to whatever you'd like
y_pred[y_probs>thres] = 'malicious'
y_pred[y_probs<=thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 99.02% (101/102) benign/malicious: 0.98% (1/102) malicious/benign: 2.94% (3/102) malicious/malicious: 97.06% (99/102)
#### We do the same, but set the threshold lower, to only 20%
y_probs = clf_all.predict_proba(X_test)[:,1]
thres = .20 # This can be set to whatever you'd like
y_pred[y_probs>thres] = 'malicious'
y_pred[y_probs<=thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 95.10% (97/102) benign/malicious: 4.90% (5/102) malicious/benign: 0.00% (0/102) malicious/malicious: 100.00% (102/102)
scores = clf_all.predict_proba(X_test)[:,1]
plt.hist(scores, bins=20)
plt.grid(True)
plt.show()
importances = zip(all_features, clf_all.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
sum = 0
for idx, im in enumerate(importances):
sum += round(im[1], 5)
print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), sum
1: class_name_slash_count 0.31448 0.31448 2: class_name_length 0.28798 0.60246 3: class_name_lowercase_run_longest 0.08185 0.68431 4: entropy 0.06346 0.74777 5: class_name_lowercase_run_avg 0.06043 0.8082 6: constant_pool_count 0.03532 0.84352 7: size 0.02862 0.87214 8: class_name_uppercase_run_longest 0.02839 0.90053 9: class_name_uppercase_run_avg 0.02537 0.9259 10: method_name_lowercase_run_avg 0.01137 0.93727 11: interface_count 0.01073 0.948 12: class_name_digit_run_avg 0.00947 0.95747 13: method_name_lowercase_run_longest 0.00751 0.96498 14: acc_public 0.00604 0.97102 15: methods_count 0.00509 0.97611 16: method_name_uppercase_run_longest 0.00459 0.9807 17: ap_count 0.0039 0.9846 18: class_name_digit_run_longest 0.00336 0.98796 19: major version 0.00286 0.99082 20: method_name_uppercase_run_avg 0.00277 0.99359 21: acc_abstract 0.00181 0.9954 22: method_name_digit_run_avg 0.00148 0.99688 23: acc_final 0.00105 0.99793 24: method_name_digit_run_longest 0.00103 0.99896 25: minor version 0.00054 0.9995 26: acc_interface 0.0005 1.0 27: acc_super 1e-05 1.00001 28: acc_annotation 0.0 1.00001 29: acc_enum 0.0 1.00001 30: acc_synthetic 0.0 1.00001
clf_er = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
X_er = df.as_matrix(all_features)
y_er = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.997 (+/- 0.013)
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_er, y_er, test_size=my_tsize, random_state=my_seed)
clf_er.fit(X_train, y_train)
y_pred = clf_er.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 100.00% (90/90) benign/malicious: 0.00% (0/90) malicious/benign: 0.00% (0/114) malicious/malicious: 100.00% (114/114)
import sklearn.svm
import sklearn.preprocessing
clf_svc = sklearn.svm.SVC()
X_svc = df.as_matrix(all_features)
X_svc = sklearn.preprocessing.scale(X_svc)
y_svc = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_svc, X_svc, y_svc, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.993 (+/- 0.015)
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_svc, y_svc, test_size=my_tsize, random_state=my_seed)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 98.89% (89/90) benign/malicious: 1.11% (1/90) malicious/benign: 0.00% (0/114) malicious/malicious: 100.00% (114/114)
# Now we can use scikit learn's cross validation to assess predictive performance.
# NOTE(review): X_all / y_all are defined in a *later* cell (the
# clf_everything cell below) — this notebook was executed out of order.
# Running this file top-to-bottom would raise NameError here.
scores = sklearn.cross_validation.cross_val_score(clf_all, X_all, y_all, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.992 (+/- 0.029)
# Now we can use scikit learn's cross validation to assess predictive performance.
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.995 (+/- 0.017)
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X_all = df.as_matrix(all_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0)
java_big_pile_df = pd.read_hdf('data/java_clean_df.hd5', 'table')
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code. self._handle = tables.openFile(self._path, self._mode, **kwargs) /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code. return self._handle.getNode(self.root, key)
# Score every row of the unlabeled "big pile" with the model trained on all
# labeled data, and bucket by malicious probability:
#   < 0.5 clean, 0.5-0.8 gray area, >= 0.8 bad.
clean = 0
gray = 0
bad = 0
for x in java_big_pile_df.as_matrix(all_features):
    try:
        score = clf_everything.predict_proba(x)[:, 1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except Exception:
        # Narrowed from a bare except:; still best-effort — report the
        # offending row and stop scoring rather than crash the notebook.
        print("Sad")
        print(x)
        break
print(java_big_pile_df.shape)
print(clean)
print(gray)
print(bad)
['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface', 'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count', 'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version', 'method_name_digit_run_avg', 'method_name_digit_run_longest', 'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest', 'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest', 'class_name_digit_run_avg', 'class_name_digit_run_longest', 'class_name_length', 'class_name_lowercase_run_avg', 'class_name_lowercase_run_longest', 'class_name_slash_count', 'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest'] (366341, 35) 339771 10971 15599
java_more_bad_df = pd.read_hdf('data/java_malicious_df.hd5', 'table')
/opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code. self._handle = tables.openFile(self._path, self._mode, **kwargs) /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code. return self._handle.getNode(self.root, key)
java_big_pile_df.head()
acc_abstract | acc_annotation | acc_enum | acc_final | acc_interface | acc_public | acc_super | acc_synthetic | ap_count | attributes count | class name | class_name_digit_run_avg | class_name_digit_run_longest | class_name_length | class_name_lowercase_run_avg | class_name_lowercase_run_longest | class_name_slash_count | class_name_uppercase_run_avg | class_name_uppercase_run_longest | constant_pool_count | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 1 | com/jidesoft/combobox/DateChooserPanel | 0 | 0 | 38 | 5.333333 | 8 | 3 | 1 | 1 | 1037 | ... |
1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 3 | 0 | org/jmol/modelset/BondIterator | 0 | 0 | 30 | 5.000000 | 8 | 3 | 1 | 1 | 11 | ... |
2 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 2 | org/hibernate/engine/query/ParameterParser | 0 | 0 | 42 | 6.000000 | 9 | 4 | 1 | 1 | 152 | ... |
3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 1 | com/intellij/updater/Utils | 0 | 0 | 26 | 5.500000 | 8 | 3 | 1 | 1 | 330 | ... |
4 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 2 | 1 | com/kiwisoft/db/driver/SybaseDriver | 0 | 0 | 35 | 4.833333 | 8 | 4 | 1 | 1 | 151 | ... |
5 rows × 35 columns
java_big_pile_df['class_name_length'].describe()
count 366341.000000 mean 48.081181 std 19.812234 min 1.000000 25% 36.000000 50% 47.000000 75% 61.000000 max 161.000000 Name: class_name_length, dtype: float64
Randomize the row order of the big pile before sampling a training subset
# Shuffle the big pile, take 2000 rows as presumed-benign training data and
# keep the remainder for scoring later.
java_random_df = java_big_pile_df.reindex(np.random.permutation(java_big_pile_df.index))
java_random_2k_df = java_random_df[0:2000]
java_random_the_rest_df = java_random_df[2000:]
# NOTE(review): assigning a column on a slice of another frame can trigger
# pandas' SettingWithCopyWarning; taking a .copy() first would be safer.
java_random_2k_df['label'] = 'benign'
java_more_bad_df['label'] = 'malicious'
java_4k_df = pd.concat([java_more_bad_df, java_random_2k_df], ignore_index=True)
java_4k_df.fillna(0, inplace=True)
clf_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest',
'constant_pool_count', 'entropy', 'interface_count', 'major version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'methods_count', 'minor version', 'size']
X = java_4k_df.as_matrix(all_features)
y = np.array(java_4k_df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_4k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.989 (+/- 0.008)
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_4k.fit(X_train, y_train)
y_pred = clf_4k.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats benign/benign: 99.00% (398/402) benign/malicious: 1.00% (4/402) malicious/benign: 0.79% (3/382) malicious/malicious: 99.21% (379/382)
# Feature Selection
# Which features best deferentiated the two classes?
# Here we're going to grab the feature_importances from the classifier itself,
importances = zip(all_features, clf_4k.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
sum = 0
for idx, im in enumerate(importances):
sum += round(im[1], 5)
print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), sum
1: class_name_slash_count 0.25082 0.25082 2: class_name_length 0.22822 0.47904 3: entropy 0.0733 0.55234 4: constant_pool_count 0.06575 0.61809 5: class_name_uppercase_run_avg 0.06179 0.67988 6: size 0.05436 0.73424 7: class_name_lowercase_run_longest 0.05158 0.78582 8: class_name_uppercase_run_longest 0.04584 0.83166 9: method_name_lowercase_run_longest 0.03077 0.86243 10: method_name_lowercase_run_avg 0.02461 0.88704 11: class_name_lowercase_run_avg 0.02118 0.90822 12: interface_count 0.01524 0.92346 13: major version 0.01305 0.93651 14: method_name_uppercase_run_longest 0.01267 0.94918 15: method_name_uppercase_run_avg 0.01198 0.96116 16: methods_count 0.01062 0.97178 17: ap_count 0.00603 0.97781 18: class_name_digit_run_avg 0.00561 0.98342 19: minor version 0.00547 0.98889 20: acc_public 0.00262 0.99151 21: acc_abstract 0.00221 0.99372 22: class_name_digit_run_longest 0.00133 0.99505 23: acc_super 0.00132 0.99637 24: acc_final 0.00128 0.99765 25: method_name_digit_run_avg 0.00089 0.99854 26: method_name_digit_run_longest 0.0008 0.99934 27: acc_interface 0.00065 0.99999 28: acc_annotation 0.0 0.99999 29: acc_enum 0.0 0.99999 30: acc_synthetic 0.0 0.99999
clf_everything_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all = java_4k_df.as_matrix(all_features)
y_all = np.array(java_4k_df['label'].tolist())
clf_everything_4k.fit(X_all, y_all)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50, n_jobs=1, oob_score=False, random_state=None, verbose=0)
# Score all remaining unlabeled samples with the 4k-trained model and bucket
# by malicious probability (< 0.5 clean, 0.5-0.8 gray, >= 0.8 bad).
# One vectorized predict_proba call replaces the original one-call-per-row
# loop; the resulting counts are identical.
X_rest = java_random_the_rest_df.as_matrix(all_features)
probs = clf_everything_4k.predict_proba(X_rest)[:, 1]
clean = int((probs < 0.5).sum())
gray = int(((probs >= 0.5) & (probs < 0.8)).sum())
bad = int((probs >= 0.8).sum())
print(java_random_the_rest_df.shape[0])
print(clean)
print(gray)
print(bad)
364341 359198 3766 1377