import pandas as pd
# Load the recent-upgrade-failure study dataset and count issues per application.
url = "https://raw.githubusercontent.com/jwjwyoung/RecentUpgradeFailureStudy/main/recent_upgrade_failures.csv"
df = pd.read_csv(url)
df = df.fillna('')  # blank out missing cells so the string operations below are safe
issues = list(df['Issue key'])
app = ['cassandra', 'hbase', 'hdfs', 'kafka', 'mapreduce', 'mesos', 'yarn', 'zookeeper']
# Issue keys embed the project name (e.g. "CASSANDRA-123"), so a substring
# match on the lower-cased key attributes each issue to its application.
# (The original loop body had lost its indentation; a dict comprehension
# replaces it and builds the counts in one step.)
dic = {a: sum(a in i.lower() for i in issues) for a in app}
# Table 1: row 0 = application names, row 1 = issue counts.
# dict_values must be materialized as a list before handing it to DataFrame.
table1 = pd.DataFrame([list(dic), list(dic.values())])
print(table1)
0 1 2 3 4 5 6 7 0 cassandra hbase hdfs kafka mapreduce mesos yarn zookeeper 1 44 13 38 7 1 8 8 4
# set output format: show percentages as whole numbers with a "%" suffix
pd.options.display.float_format = '{:,.0f} %'.format
df['priority'] = df['priority'].str.lower()
# Partition the dataset on whether the issue belongs to Cassandra.
is_cassandra = df['Issue key'].str.contains("CASSANDRA")
ca_issues = df[is_cassandra]
non_ca_issues = df[~is_cassandra]

def _priority_percentages(frame):
    # Share (in %) of each priority level within `frame`.
    counts = frame.groupby('priority')['Issue key'].count()
    return (100 * counts / len(frame)).to_frame()

ca_perc = _priority_percentages(ca_issues)
non_ca_perc = _priority_percentages(non_ca_issues)
non_ca_perc
Issue key | |
---|---|
priority | |
blocker | 38 % |
critical | 15 % |
major | 44 % |
minor | 3 % |
ca_perc  # display the Cassandra priority breakdown (percentages)
Issue key | |
---|---|
priority | |
low | 7 % |
normal | 73 % |
urgent | 20 % |
# Symptom keywords; an issue whose symptom matches none of them falls into
# the default "whole cluster down" bucket (whole_df).
keys = ['rolling', 'data', 'performance', 'part', 'incorrect', 'unknown']
cs = []    # all issues per symptom category
ccs = []   # catastrophic issues per category
cps = []   # catastrophic issues that escaped into production, per category
# One alternation regex replaces six chained contains() calls; the keywords
# contain no regex metacharacters, so the match is identical.
whole_df = df.loc[~df.symptom.str.lower().str.contains('|'.join(keys))]
# One sub-frame per symptom category, starting with the "none of the
# keywords" bucket built above.
severity_dfs = [whole_df]
for k in keys:
    severity_dfs.append(df[df['symptom'].str.lower().str.contains(k)])
for sub in severity_dfs:
    sub = sub.fillna('')
    # "catastropic" follows the CSV's own (misspelled) column header.
    catastrophic = sub[sub['catastropic'].str.lower().str.contains('yes')]
    caught = catastrophic['caught after release? '].str.lower()
    in_production = catastrophic[caught.str.contains('yes') | caught.str.contains("but it's found ")]
    cs.append(len(sub))
    ccs.append(len(catastrophic))
    cps.append(len(in_production))
# The last category is "unknown" symptoms, which cannot be counted as
# catastrophic — force that bucket to zero.
ccs[-1] = 0
# Human-readable labels for Table 2, in the same order as severity_dfs.
table2_keys = [
    'Whole cluster down (all nodes crash, master node crash',
    'Severe service quality degradation during rolling upgrade',
    'Data loss and data corruption',
    'Performance degradation (increased latency, wasted computation, etc.)',
    'Part of cluster down (part of worker nodes down, secondary master down)',
    'Incorrect service result (failed read/write requests, UI error, etc.)',
    'Unknown',
]
# Build rows then transpose so each symptom becomes one table row.
table2 = pd.DataFrame([table2_keys, cs, ccs, cps]).transpose()
table2.columns = ['Symptom', 'All', 'Catastrophic', 'Catastrophic in Production']
table2
Symptom | All | Catastrophic | Catastrophic in Production | |
---|---|---|---|---|
0 | Whole cluster down (all nodes crash, master no... | 34 | 34 | 18 |
1 | Severe service quality degradation during roll... | 16 | 16 | 10 |
2 | Data loss and data corruption | 20 | 15 | 12 |
3 | Performance degradation (increased latency, wa... | 10 | 4 | 4 |
4 | Part of cluster down (part of worker nodes dow... | 12 | 7 | 3 |
5 | Incorrect service result (failed read/write re... | 24 | 6 | 4 |
6 | Unknown | 7 | 0 | 0 |
# --- Table 3: root causes of data syntax vs. data semantics failures ---
# regex=False: "1.1"/"1.2"/"1.3" are literal category codes; with the default
# regex matching the unescaped "." would match any character.
syntax_df = df[df['data syntax or semantics'].str.contains('data syntax')
               | df['root cause category'].str.contains("1.1", regex=False)]
semantics_df = df[df['data syntax or semantics'].str.contains('data semantics')
                  | df['root cause category'].str.contains("1.2", regex=False)
                  | df['root cause category'].str.contains("1.3", regex=False)]
# Syntax breakdown: serialization-library data, enums, and the remainder.
proto_df = syntax_df[syntax_df['root cause'].str.contains("serialization library")
                     | (syntax_df['root cause'] == 'proto')]
enum_df = syntax_df[syntax_df['root cause category'].str.contains("enum")
                    | syntax_df['root cause'].str.contains('enum')]
# Keep the count in its own name instead of shadowing the DataFrame with an int.
num_enum = len(enum_df)
system_specific = len(syntax_df) - len(proto_df) - num_enum
# Semantics breakdown: version handling, serialization mishandling, other.
version_df = semantics_df[semantics_df['root cause'].str.contains('version')]
mishandling_df = semantics_df[semantics_df['root cause'].str.contains('proto logic')
                              | semantics_df['root cause'].str.contains('serialization')]
other = len(semantics_df) - len(version_df) - len(mishandling_df)
keys = ['data defined using serialization lib.',
        'enum',
        'system-specific-data',
        'mishandling of serialization lib.',
        'incomplete version handling',
        'other semantics issue',
        ]
table3 = pd.DataFrame([keys,
                       [len(proto_df), num_enum, system_specific,
                        len(mishandling_df), len(version_df), other]]).transpose()
table3.columns = ['type', 'number']
table3
type | number | |
---|---|---|
0 | data defined using serialization lib. | 7 |
1 | enum | 2 |
2 | system-specific-data | 41 |
3 | mishandling of serialization lib. | 6 |
4 | incomplete version handling | 16 |
5 | other semantics issue | 5 |
# --- Table 4: distribution of version gaps across upgrade failures ---
two_major_df = df[df['major'] == '2 major']
df = pd.read_csv(url)  # reload: the NaN pattern in 'major'/'minor' matters below
one_major_df = df[df['major'].notnull()]
one_major_df = one_major_df[one_major_df['major'] != '2 major']
df = df.fillna('')
greater_two_df = df[df['minor'].str.contains("3")]
two_minor_df = df[df['minor'].str.contains("2")]
smaller_one_minor_df = df[df['minor'].str.contains("< 1")]
# Rows labeled "1" or "1 minor". isin() selects the union directly, replacing
# the fragile DataFrame-addition trick (frame + frame) whose only valid use
# was that len() of the index union happened to equal the combined count.
one_minor_df = df[df['minor'].isin(["1", "1 minor"])]
df = pd.read_csv(url)  # reload once more so notnull() sees the real NaNs
any_df = df[~df['minor'].notnull() & ~df['major'].notnull()]
c1 = [2, 1, 0, 0, 0, 0, 'any']                    # major-version gap per column
c2 = ['any', 'any', '>2', '2', 1, '< 1', 'any']   # minor-version gap per column
num = [len(two_major_df),
       len(one_major_df),
       len(greater_two_df),
       len(two_minor_df),
       len(one_minor_df),
       len(smaller_one_minor_df),
       len(any_df)
       ]
# Percentages over the full dataset size, not a hard-coded 120.
table4 = pd.DataFrame([c1, c2, num, [i / len(df) * 100 for i in num]])
table4.columns = [' ', ' ', ' ', ' ', ' ', ' ', '']
table4.index = ['major gap', 'minor gap', '# of upgrade failure', 'percentage']
table4
major gap | 2 | 1 | 0 | 0 | 0 % | 0 | any |
---|---|---|---|---|---|---|---|
minor gap | any | any | >2 | 2 | 1 % | < 1 | any |
# of upgrade failure | 3 | 37 | 3 | 8 | 31 % | 6 | 32 |
percentage | 2 % | 31 % | 2 % | 7 % | 26 % | 5 % | 27 % |
df = df.fillna('')
# Failures whose reproduction required more than three nodes.
# NOTE(review): the pattern keeps surrounding spaces (" > 3 ") — the recorded
# output below is 0, so confirm the exact cell format against the CSV.
nodes = df[df['min # of nodes'].str.contains(" > 3 ")]
len(nodes)
0
# An issue counts as deterministic unless its 'deterministic? ' cell
# (note the trailing space in the CSV header) explicitly says "no".
deterministic = df[~df['deterministic? '].str.contains("no")]  # fixed typo: was 'derterministic'
percentage = int(len(deterministic) / len(df) * 100)
print("{}% of the upgrade failures are deterministic, not requiring any special timing to trigger".format(percentage))
89% of the upgrade failures are deterministic, not requiring any special timing to trigger
# Columns are addressed positionally from the right because their long
# names are unwieldy; re-check these offsets if the CSV schema changes.
c1 = list(df.columns)[-14]
c2 = list(df.columns)[-12]
c3 = list(df.columns)[-3]
c4 = list(df.columns)[-2]
df = df.fillna('')
# Failures triggered under the NEW version's default configuration: both
# flag columns say "no".
new_df = df[df[c1].astype(str) == "no"]
new_df = new_df[new_df[c2].str.contains("no")]
# Failures under the OLD version's defaults. Raw strings keep "\)" a literal
# ")" in the regex — "1\)"/"2\)" as plain strings are invalid escape
# sequences (SyntaxWarning in modern Python).
old_df = df[(df[c3].str.contains(r"1\)") & df[c4].str.contains("yes")) | df[c3].str.contains(r"2\)")]
total_default_df = pd.concat([new_df, old_df])
print(len(total_default_df))
61