#!/usr/bin/env python
# coding: utf-8
# # 2021
# ---
# In[1]:
# Import necessary libraries
from IPython.core.magic import register_cell_magic
from IPython.display import Markdown
import datetime
from datetime import date
import glob
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from itables import init_notebook_mode, show
import itables.options as opt
# Initialize itables options
# Layout string for the rendered tables.
# NOTE(review): presumably the DataTables `dom` codes (t=table, p=pagination,
# i=info, r=processing), i.e. no search box or length menu — confirm.
opt.dom = "tpir"
# Inline CSS for rendered tables: automatic table layout and width.
opt.style = "table-layout:auto;width:auto"
# Enable itables for the notebook session.
# NOTE(review): all_interactive=True presumably makes every DataFrame render
# as an interactive table, and connected=True presumably loads the JS assets
# from the network instead of embedding them — confirm against itables docs.
init_notebook_mode(all_interactive=True, connected=True)
# Register a custom cell magic for markdown
@register_cell_magic
def markdown(line, cell):
    """Render the cell body as Markdown after substituting global variables.

    The cell text is run through ``str.format`` with this module's globals
    as named fields, so ``{name}`` placeholders are replaced before the
    result is wrapped in ``Markdown``. ``line`` is accepted because the
    cell-magic signature requires it, but it is not used.
    """
    rendered = cell.format(**globals())
    return Markdown(rendered)
# Quiet the report output: disable matplotlib's font-manager logger and
# ignore every Python warning.
font_manager_logger = logging.getLogger('matplotlib.font_manager')
font_manager_logger.disabled = True
warnings.filterwarnings("ignore")
# Pandas display limits, applied one option at a time.
for option_name, option_value in (
    ('display.width', 500),
    ('display.max_rows', 50),
    ('display.max_columns', 10),
):
    pd.set_option(option_name, option_value)
# In[2]:
def get_nested_value(entry, keys, default='Missing_Data'):
    """Follow ``keys`` through nested containers and return the value found.

    Args:
        entry: The outermost container (dicts and lists may be mixed).
        keys: Sequence of dict keys / list indexes applied in order.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of the path, or ``default`` when any step is
        missing (``KeyError``), out of range (``IndexError``), or lands on
        something that cannot be indexed that way (``TypeError`` — e.g. a
        ``None`` or scalar at an intermediate level, or a list indexed with
        a string key).
    """
    current = entry
    try:
        for key in keys:
            current = current[key]
    except (KeyError, IndexError, TypeError):
        # One malformed record must not abort the whole load.
        return default
    return current
# Shared path prefixes into a CVE record's first CVSS v3.1 metric.
_CVSS_V31 = ['cve', 'metrics', 'cvssMetricV31', 0]
_CVSS_V31_DATA = _CVSS_V31 + ['cvssData']

# One spec per output column, in column order: (column, key path[, fallback]).
# When no fallback is listed, get_nested_value's own default is used.
_FIELD_SPECS = (
    ('CVE', ['cve', 'id']),
    ('Published', ['cve', 'published']),
    ('AttackVector', _CVSS_V31_DATA + ['attackVector']),
    ('AttackComplexity', _CVSS_V31_DATA + ['attackComplexity']),
    ('PrivilegesRequired', _CVSS_V31_DATA + ['privilegesRequired']),
    ('UserInteraction', _CVSS_V31_DATA + ['userInteraction']),
    ('Scope', _CVSS_V31_DATA + ['scope']),
    ('ConfidentialityImpact', _CVSS_V31_DATA + ['confidentialityImpact']),
    ('IntegrityImpact', _CVSS_V31_DATA + ['integrityImpact']),
    ('AvailabilityImpact', _CVSS_V31_DATA + ['availabilityImpact']),
    ('BaseScore', _CVSS_V31_DATA + ['baseScore'], '0.0'),
    ('BaseSeverity', _CVSS_V31_DATA + ['baseSeverity']),
    ('ExploitabilityScore', _CVSS_V31 + ['exploitabilityScore']),
    ('ImpactScore', _CVSS_V31 + ['impactScore']),
    ('CWE', ['cve', 'weaknesses', 0, 'description', 0, 'value']),
    ('Description', ['cve', 'descriptions', 0, 'value'], ''),
    ('Assigner', ['cve', 'sourceIdentifier']),
    ('Tag', ['cve', 'cveTags', 0, 'tags'], np.nan),
    ('Status', ['cve', 'vulnStatus'], ''),
)

# Flatten every record of each matching file into one dict per CVE.
row_accumulator = []
for filename in glob.glob('nvd.jsonl'):
    with open(filename, 'r', encoding='utf-8') as f:
        nvd_data = json.load(f)
    for entry in nvd_data:
        row_accumulator.append({
            column: get_nested_value(entry, path, *fallback)
            for column, path, *fallback in _FIELD_SPECS
        })
# Build the working frame and drop records whose status contains 'Rejected'.
nvd = pd.DataFrame(row_accumulator)
nvd = nvd[~nvd.Status.str.contains('Rejected')]
nvd['Published'] = pd.to_datetime(nvd['Published'])

# Keep only CVEs published in 2021, using a half-open window
# [2021-01-01, 2022-01-01) so a record stamped exactly at midnight on
# January 1st is included rather than dropped.
thisyear = ((nvd['Published'] >= '2021-01-01') & (nvd['Published'] < '2022-01-01'))
nvd = nvd.loc[thisyear]
# A single sort after filtering is enough.
nvd = nvd.sort_values(by=['Published'])
nvd = nvd.reset_index(drop=True)

# Scores become numeric; 0 is replaced by NaN so those rows are skipped
# by mean() instead of dragging the average down.
nvd['BaseScore'] = pd.to_numeric(nvd['BaseScore'])
nvd['BaseScore'] = nvd['BaseScore'].replace(0, np.nan)

# Headline numbers used by the summary cell.
nvdcount = nvd['Published'].count()
nvdunique = nvd['Published'].nunique()
startdate = date(2021, 1, 1)
enddate = date(2022, 1, 1)
numberofdays = enddate - startdate
per_day = nvdcount/numberofdays.days
# In[3]:
# Headline statistics for the year. The message is built from adjacent
# f-string pieces because a single-quoted string literal cannot span several
# physical lines. The pieces are separated with <br /> so each statistic
# renders on its own line (a bare newline is not a line break in Markdown).
Markdown(
    f"Total Number of CVEs: **{nvd['CVE'].count()}**<br />"
    f"Average CVEs Per Day: **{per_day.round(2)}**<br />"
    f"Average CVSS Score: **{nvd['BaseScore'].mean().round(2)}**"
)
#
#
# ## CVE Graphs
# In[4]:
def _published_counts_by(period_code):
    """Count CVEs per publication period (``period_code`` is passed to ``to_period``)."""
    period_key = nvd.Published.dt.to_period(period_code)
    return nvd['Published'].groupby(period_key).agg('count')


# Publication counts at the four granularities used by the charts.
Month_Graph = _published_counts_by("M")
Year_Graph = _published_counts_by("Y")
Week_Graph = _published_counts_by("W")
Day_Graph = _published_counts_by("D")
# ### CVE Per Month Graph
# In[5]:
# Area chart of the monthly publication counts.
cg = Month_Graph.plot.area(
    title='Number of CVEs Published Per Month',
    figsize=(16, 8),
    colormap='cividis',
)
cg.grid(True, linestyle='--', linewidth=0.5)
cg.set_xlabel("Month")
cg.set_ylabel("CVEs")
# Watermark in the lower-left corner, placed in axes coordinates.
cg.text(0.01, 0.01, 'cve.icu', transform=cg.transAxes, fontsize=12, color='gray', alpha=0.5)
# Tighten the layout and display the chart (nothing is written to disk).
plt.tight_layout()
plt.show()
#
#
# ### CVE Per Week Graph
# In[6]:
# Area chart of the weekly publication counts.
cg = Week_Graph.plot.area(
    title='Number of CVEs Published Per Week',
    figsize=(16, 8),
    colormap='cividis',
)
cg.grid(True, linestyle='--', linewidth=0.5)
cg.set_xlabel("Week")
cg.set_ylabel("CVEs")
# Watermark in the lower-left corner, placed in axes coordinates.
cg.text(0.01, 0.01, 'cve.icu', transform=cg.transAxes, fontsize=12, color='gray', alpha=0.5)
# Tighten the layout and display the chart (nothing is written to disk).
plt.tight_layout()
plt.show()
#
#
# ### CVE Per Day Graph
# In[7]:
# Area chart of the daily publication counts.
cg = Day_Graph.plot.area(
    title='Number of CVEs Published Per Day',
    figsize=(16, 8),
    colormap='cividis',
)
cg.grid(True, linestyle='--', linewidth=0.5)
cg.set_xlabel("Day")
cg.set_ylabel("CVEs")
# Watermark in the lower-left corner, placed in axes coordinates.
cg.text(0.01, 0.01, 'cve.icu', transform=cg.transAxes, fontsize=12, color='gray', alpha=0.5)
# Tighten the layout and display the chart.
plt.tight_layout()
plt.show()
#
# ## CVSS Data
# In[8]:
# Make sure scores are numeric and replace 0 with NaN so those rows are left
# out of the statistics below (mean/mode/value_counts all skip NaN).
nvd['BaseScore'] = pd.to_numeric(nvd['BaseScore'])
nvd['BaseScore'] = nvd['BaseScore'].replace(0, np.nan)
# Plot the CVSS breakdown with 100 bins
cg = nvd['BaseScore'].plot(kind="hist", bins=100, title='CVSS Breakdown', colormap='cividis', figsize=(16, 8))
plt.grid(True, linestyle='--', linewidth=0.5)
cg.set_ylabel("CVEs")
cg.set_xlabel("CVSS Score")
# Find the most common, least common, and average CVSS scores.
# The per-score tally is computed once and reused for every statistic.
score_counts = nvd['BaseScore'].value_counts()
most_common_score = nvd['BaseScore'].mode()[0]
least_common_score = score_counts.idxmin()
most_common_count = score_counts.max()
least_common_count = score_counts.min()
average_score = nvd['BaseScore'].mean().round(2)
# Add annotation for the most common, least common, and average CVSS scores
annotation_text = (f'Most Common: {most_common_score} ({most_common_count} CVEs)\n'
                   f'Least Common: {least_common_score} ({least_common_count} CVEs)\n'
                   f'Average Score: {average_score}')
plt.text(0.01, 0.98, annotation_text, transform=cg.transAxes, fontsize=10, color='black',
         verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'))
# Adjust layout and show the chart (nothing is written to disk)
plt.tight_layout()
plt.show()
# ## CNA Data
#
# ### CNA Assigner Graph
# In[9]:
# Replace specific assigner ID with email.
# The result is assigned back rather than using inplace=True on the column
# selection: that chained in-place call does not update `nvd` when pandas
# Copy-on-Write is active.
nvd['Assigner'] = nvd['Assigner'].replace('416baaa9-dc9f-4396-8d5f-8c081fb06d67', 'cve@kernel.org')
# Extract domain names and check for uniqueness
nvd['Domain'] = nvd['Assigner'].apply(lambda x: x.split('@')[-1])
domain_counts = nvd['Domain'].value_counts()
# Modify Assigner column based on domain uniqueness: a domain with a single
# assigner is shown as the bare domain, otherwise as "domain (local-part)".
unique_domains = nvd.groupby('Domain')['Assigner'].nunique()
nvd['Assigner'] = nvd.apply(lambda x: x['Domain'] if unique_domains[x['Domain']] == 1 else f"{x['Domain']} ({x['Assigner'].split('@')[0]})", axis=1)
# Calculate frequency of assigners (top 50 only)
nvd_frequency = nvd['Assigner'].value_counts().reset_index()
nvd_frequency.columns = ['Assigner', 'counts']
nvd_frequency = nvd_frequency.head(50)
# Literal substring match: with the default regex matching, the '.' in
# 'mitre.org' would match any character.
is_mitre = nvd_frequency['Assigner'].str.contains('mitre.org', regex=False)
# Calculate the number of CVEs published by mitre.org
mitre_cves = nvd_frequency.loc[is_mitre, 'counts'].sum()
# Remove mitre.org from the frequency list
nvd_frequency_no_mitre = nvd_frequency[~is_mitre]
nvd_frequency_no_mitre = nvd_frequency_no_mitre[nvd_frequency_no_mitre.counts > 100].head(20)
# Plot the top 20 CNAs
plt.figure(figsize=(16, 8))
plt.barh("Assigner", "counts", data=nvd_frequency_no_mitre, color="#001d82")
plt.xlabel("CVEs")
plt.ylabel("Assigner")
plt.title("Top 20 CNAs")
plt.grid(True, linestyle='--', linewidth=0.5)
# Add a text box indicating mitre.org has been removed and the number of CVEs they published
textstr = f'{mitre_cves:,} CVEs published by MITRE not shown'
plt.text(0.99, 0.98, textstr, transform=plt.gca().transAxes, fontsize=10,
         verticalalignment='top', horizontalalignment='right',
         bbox=dict(boxstyle="round,pad=0.3", edgecolor='black', facecolor='white'))
# Adjust layout and show the chart
plt.tight_layout()
plt.show()
#
#
# ## CWE Data
# In[10]:
# Tally CWE values, discard the 'Missing_' placeholder rows, and keep at most
# 25 entries that occur more than 100 times.
cwe_counts = nvd['CWE'].value_counts().reset_index()
cwe_counts.columns = ['CWE', 'counts']
has_real_cwe = ~cwe_counts['CWE'].str.contains('Missing_')
nvd_cwe = cwe_counts[has_real_cwe]
nvd_cwe = nvd_cwe[nvd_cwe['counts'] > 100].head(25)
# Horizontal bar chart of the most common CWEs
fig, ax = plt.subplots(figsize=(16, 8))
ax.barh("CWE", "counts", data=nvd_cwe, color="#001d82")
ax.set_xlabel("Count")
ax.set_ylabel("CWE")
ax.set_title("Most Common CWE in CVE Records")
ax.grid(True, linestyle='--', linewidth=0.5)
# Tighten the layout and display the chart
fig.tight_layout()
plt.show()
#
#
# ## More CVE Data
#
#
#
# ### Top CNA Assigner
# In[11]:
# Renumber the rows from 0 and render the assigner table with paging.
nvd_frequency = nvd_frequency.reset_index(drop=True)
show(nvd_frequency, paging=True, scrollCollapse=True)
# ### Top CWEs
# In[12]:
# Renumber the rows from 0 and render the CWE table with paging.
nvd_cwe = nvd_cwe.reset_index(drop=True)
show(nvd_cwe, paging=True, scrollCollapse=True)
#
#
#
# ### CVEs By Identifier
# In[13]:
# Count CVEs by the year embedded in their identifier (the "CVE-YYYY" part of
# "CVE-YYYY-NNNN"). The years come from the data itself, so no identifier
# year is silently left out by a fixed range, and the column is scanned once
# instead of once per candidate year.
identifier_counts = (
    nvd['CVE']
    .str.extract(r'(CVE-\d{4})-', expand=False)  # NaN when the pattern is absent
    .value_counts()                              # NaN rows are not counted
    .sort_index()                                # ascending by identifier year
)
# Same shape as before: 'Identifier' / 'Count' columns, fresh 0-based index,
# and only identifiers that actually occur.
cve_df = identifier_counts.rename_axis('Identifier').reset_index(name='Count')
show(cve_df, scrollCollapse=True, paging=False)
# In[14]:
# Footer: stamp the report with the moment this cell ran (datetime.now()).
generated_at = datetime.datetime.now()
Markdown(f"This report is updated automatically every day, last generated on: **{generated_at}**")