#!/usr/bin/env python
# coding: utf-8
# ### ANALYSING LATENCY IN AFRICA
# USING RIPE NCC RECURRING *TRACEROUTE* MEASUREMENTS FROM PROBES IN AFRICA TO `eu.thethings.network`
# ___
# ### TABLE OF CONTENT
#
# * **[I. Data Loading, preprocessing and cleaning](#data-loading)**
# * [I.1 Measurements](#measurements)
# * [I.2 Probes description](#probes-description)
# * [I.3 Merging measurements and probes](#merge-measurements-probes)
# * [I.4 Validation of Rtts vs. speed of light](#rtt-speed-of-light)
# * **[II. Preliminary Exploratory Data Analysis (EDA)](#eda)**
# * [II.1 Round-Trip-Time (RTT) histograms](#rtts-histogram)
# * [II.2 Round-Trip-Time (RTT) boxplots by country](#boxplots-country)
# * [II.3 Probes locations](#probes-location)
# * [II.4 Packet loss](#packet-loss)
# * **[III. Analysing National Education Research Networks probes latency (NREN)](#nren)**
# * [II.1 NRENs identification](#nrens-identification)
# * [II.2 NREN probes location](#nren-probes-loc)
# * [II.3 Comparing RTTs of NREN probes vs. others](#rtt-nren-vs-others)
# * [II.4 Comparing RTTs of NREN probes vs. others per country](#rtt-nren-vs-others-country)
# * [III.5 Comparing Packet Loss of NREN probes vs. others](#packet-loss-nren-vs-others)
# * [III.6 Comparing Packet Loss of NREN probes vs. others per country](#packet-loss-nren-vs-others-per-country)
# * [III.7 Is there enough evidence at this stage that a NREN make a difference?](#evidence)
# In[1]:
from datetime import datetime
import os.path
import time
import requests
import numpy as np
import pandas as pd
import altair as alt
from vega_datasets import data as alt_data
# for the notebook only (not for JupyterLab) run this command once per session
alt.renderers.enable('notebook')
alt.data_transformers.disable_max_rows()
import utilities as utils
import warnings
warnings.filterwarnings('ignore')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[2]:
# Globar vars
WIDTH = 860
AFRICA_TOPOJSON_URL = 'https://raw.githubusercontent.com/deldersveld/topojson/master/continents/africa.json'
SAMPLE_SIZE = 10000
# ## I. Data Loading, preprocessing and cleaning
#
# ### I.1 Measurements
#
# In[5]:
json_data = utils.get_measurements(measurement_id=17468431, start='01/12/2018', stop='31/12/2018')
# In[4]:
# Number of records
print(str(len(json_data)) + ' results')
# In[5]:
# An exemple of result
json_data[0]
# * **Measurement results data structure**
#
# *Note that according to probe's firmware version, returned fields might differ*
#
# From: https://atlas.ripe.net/docs/data_struct/
#
# * `af`: adress familly with possible values: {4: IPv4, 6: IPv6}
# * `dst_addr`: IP address of the destination (string)
# * `dst_name`: name of the destination (string)
# * `from`: IP address of the probe as known by controller (string)
# * `fw`: firmware version of the probe
# * `group_id`: if the measurement belongs to a group of measurements, the identifier of the group (int)
# * `lts`: last time synchronised. How long ago (in seconds) the clock of the probe was found to be in sync with that of a controller. The value -1 is used to indicate that the probe does not know whether it is in sync (int)
# * `msm_id`: measurement identifier (int)
# * `msm_name`: measurement type "Ping" (string)
# * `paris_id`: variation for the Paris mode of traceroute (int)
# * `prb_id`: source probe ID (int)
# * `proto`: "UDP", "ICMP", or "TCP" (string)
# * `result`: list of hop elements (array of objects). Objects have the following fields:
# * `hop`: hop number (int)
# * `error`: [optional] when an error occurs trying to send a packet. In that case there will not be a result structure. (string)
# * `result`: variable content, depending on type of response (array of objects)
# objects have the following fields:
# * `flags`: "flags" -- (optional) TCP flags in the reply packet, for TCP traceroute, concatenated, in the order 'F' (FIN), 'S' (SYN), 'R' (RST), 'P' (PSH), 'A' (ACK), 'U' (URG) (fw >= 4600) (string)
# * `from`: IPv4 or IPv6 source address in reply (string)
# * `rtt`: round-trip-time of reply, not present when the response is late in ms (float)
# * `size`: size of reply (int)
# * `ttl`: time-to-live in reply (int)
# * `size`: packet size (int)
# * `src_addr`: source address used by probe (string)
# * `timestamp`: Unix timestamp for start of measurement (int)
# * `type`: "traceroute" (string)
#
#
#
# In[6]:
# To check a specific record by attribute
# list(filter(lambda m: m['endtime'] == 1543619009, json_data))
# * **Checking destination IP address and fetching lon, lat for later use**
# In[ ]:
# Let's check quickly all probes trace to the same IP address
ip_dest = set(map(lambda x: x['dst_addr'], json_data))
print('Destination IP address: ', ip_dest)
# Building the url as a long string in one go
url_ipinfo = 'http://ipinfo.io/{}?token=your-token'.format(list(ip_dest)[0])
url_ipinfo
r = requests.get(url_ipinfo)
json_ipinfo = r.json()
print(json_ipinfo)
lat_dest, lon_dest = map(lambda x: float(x), json_ipinfo['loc'].split(','))
# In[ ]:
json_ipinfo
# In[106]:
# Utilities functions and lambdas
# Error codes: https://atlas.ripe.net/docs/data_struct
# Filtering/mapping predicates
has_result = lambda row: not('error' in row['result'][0])
is_late = lambda pack: 'late' in pack # number of packets a reply is late, in this case rtt is not present (int)
is_timed_out = lambda pack: 'x' in pack # {'x': '*'} - Timed out
is_error = lambda pack: 'error' in pack # Network, destination,... unreachable
is_err = lambda pack: 'err' in pack # Network, destination,... unreachable
is_empty = lambda pack: not(bool(pack))
get_result = lambda row: row['result'][0]['result']
get_nb_packets = lambda row: len(get_result(row))
get_max_nb_hops = lambda row: row['result'][0]['hop']
get_rtts = lambda row: list(filter(lambda pack: not(is_late(pack) or
is_timed_out(pack) or
is_error(pack) or
is_err(pack) or
is_empty(pack)), get_result(row)))
get_nb_late_pack = lambda row: len(list(filter(lambda pack: is_late(pack), get_result(row))))
get_nb_timed_out_pack = lambda row: len(list(filter(lambda pack: is_timed_out(pack), get_result(row))))
get_nb_error_pack = lambda row: len(list(filter(lambda pack: (is_error(pack) or
is_err(pack)), get_result(row))))
get_nb_empty_pack = lambda row: len(list(filter(lambda pack: is_empty(pack), get_result(row))))
get_nb_rtts = lambda row: len(get_rtts(row))
get_rtts_mean = lambda row: np.mean(list(map(lambda pack: pack['rtt'], get_rtts(row))))
get_ttl = lambda row: get_rtts(row)[0]['ttl']
to_datetime = lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
# In[131]:
# Postprocess json data
measurements = []
for row in json_data:
#is_valid = has_result(row) and get_nb_rtts(row)
is_valid = has_result(row)
measurements.append(
{'prb_id': row['prb_id'],
'ip_from': row['from'],
'paris_id': row['paris_id'],
'datetime': to_datetime(row['timestamp']),
'start_time': row['timestamp'],
'end_time': row['endtime'],
'last_time_sync': row['lts'],
'firmware_version': row['fw'],
'nb_packets': get_nb_packets(row) if is_valid else np.NaN,
'nb_rtts': get_nb_rtts(row) if is_valid else np.NaN,
'rtt': get_rtts_mean(row) if is_valid else np.NaN,
'measurement_failed': not(is_valid),
'nb_late_pack': get_nb_late_pack(row) if is_valid else np.NaN,
'nb_timed_out_pack': get_nb_timed_out_pack(row) if is_valid else np.NaN,
'nb_error_pack': get_nb_error_pack(row) if is_valid else np.NaN,
'nb_empty_pack': get_nb_empty_pack(row) if is_valid else np.NaN,
#'nb_hops': get_max_nb_hops(row) - get_ttl(row) if is_valid else np.NaN
})
# In[132]:
# Convert to Pandas dataframe
columns = ['datetime', 'prb_id', 'ip_from', 'paris_id', 'start_time',
'end_time', 'last_time_sync', 'firmware_version', 'nb_packets', 'nb_rtts',
'rtt', 'measurement_failed', 'nb_late_pack', 'nb_timed_out_pack',
'nb_error_pack','nb_empty_pack']
df = pd.DataFrame(measurements, columns=columns)
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
df.sort_index(inplace=True)
# In[692]:
df.head()
# ### I.2 Probes description
#
# In[693]:
# Get list of probes involved in measurements
probes_id = list(df['prb_id'].unique())
# In[694]:
json_data_probe = utils.get_probes(probes_id)
# In[695]:
# An example of probe's description.
json_data_probe['results'][0]
# In[19]:
# Postprocess json data
probes = []
for i, row in enumerate(json_data_probe['results']):
probes.append(
{'prb_id': row['id'],
'ip_v4': row['address_v4'],
'asn': str(row['asn_v4']),
'country_code': row['country_code'],
'description': row['description'],
'lon': row['geometry']['coordinates'][0],
'lat': row['geometry']['coordinates'][1]})
# In[20]:
# Convert to Pandas dataframe
columns = ['prb_id','ip_v4', 'asn', 'country_code', 'description', 'lon', 'lat']
df_probes = pd.DataFrame(probes, columns=columns)
# In[21]:
print('Number of countries: ', len(df_probes['country_code'].unique()))
print(df_probes['country_code'].unique())
# In[22]:
# Loading country codes
country_codes = pd.read_csv('https://raw.githubusercontent.com/franckalbinet/' +
'latency-internet-africa/master/data/country_codes.csv')
country_codes.head()
# In[23]:
# Joining/merging with country code to get full country name
df_probes = pd.merge(df_probes, country_codes[['name', 'alpha-2']], left_on='country_code',
right_on='alpha-2', how='left')
df_probes = df_probes.drop(['country_code'], axis=1)
df_probes.rename(columns={'alpha-2':'country_code', 'name': 'country_name'}, inplace=True)
df_probes.head()
# * **It looks like we don't get the IP address from all probes. Let's use the ip address provided in measurement results instead.**
# In[24]:
# Note that each probe can have several IP addresses. We keep only one.
ip_from_measurement = df[['prb_id','ip_from']].reset_index().drop(columns=['datetime']).drop_duplicates(subset='prb_id')
# In[25]:
df_probes = pd.merge(df_probes, ip_from_measurement, left_on='prb_id', right_on='prb_id', how='left')
# In[26]:
df_probes.head()
# In[96]:
ipinfo_lookup = []
for ip in df_probes['ip_from'].values:
r = requests.get('http://ipinfo.io/{}?token=your-token'.format(ip))
ipinfo_lookup.append(r.json())
# In[97]:
df_ip_info = pd.DataFrame(ipinfo_lookup)
df_ip_info.head()
# In[98]:
df_probes = pd.merge(df_probes, df_ip_info, left_on='ip_from', right_on='ip', how='left')
df_probes.head()
# In[99]:
df_probes = df_probes[['prb_id', 'ip', 'asn', 'hostname', 'org', 'description', 'country_name', 'country_code', 'region', 'city', 'lon', 'lat']]
# In[100]:
df_probes.head()
# In[101]:
# Saving probes description as csv
utils.df_to_csv(df_probes, prefix_name='probes')
# ### I.3 Merging measurements and probes
#
# In[19]:
df_probes = pd.read_csv('data/probes-2019-08-23.csv')
# In[697]:
# Joining/merging with country code to get full country name
data = pd.merge(df.reset_index(), df_probes, left_on='prb_id', right_on='prb_id', how='left')
# ### I.4 Validation of Rtts vs. speed of light
#
# In[700]:
SPEED_OF_LIGHT = 299792.458 # km/s
DIST = 3620 # km (more or less shortest African point from Dublin)
print("Light takes: ", int(1000*DIST / SPEED_OF_LIGHT), "ms to travel") # in ms
# In[7]:
# To be defined
THRESHOLD_RTT = 30 # in ms
# * **Which probes yield rtts below 30 ms??**
# In[702]:
data[data['rtt'] < 30]['prb_id'].unique()
# * **In which countries??**
# In[703]:
data[data['rtt'] < 30]['country_name'].unique()
# ___
# In[704]:
# Remove missing country names - though can be retrieved easily but those countries are
# not relevant to ous NRENs vs. others analysis as no identified NRENs in those countries.
data.dropna(subset=['country_name'], inplace=True)
# In[705]:
# Dumping data for further use
utils.df_to_csv(data, prefix_name='measurements')
# ## II. Preliminary Exploratory Data Analysis (EDA)
#
# In[3]:
data = pd.read_csv('./data/measurements-2019-09-20.csv')
# In[4]:
# How many measurements and columns
data.shape
# In[5]:
# Quick descriptive statistics
data['rtt'].describe()
# In[97]:
# Time range fetched
print("Measurements from: {} to {}".format(min(data['datetime']),max(data['datetime'])))
# ### II.1 Round-Trip-Time histograms
#
# * **All Rtts Range**
# In[8]:
alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE))\
.mark_bar()\
.encode(
alt.X("rtt:Q", bin=alt.Bin(step=100), title='Round-Trip-Time (ms)'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/all-rtts.png')
# In[9]:
utils.show_chart('./img/all-rtts.png')
# * **Focusing on [0, 600] ms range**
# In[712]:
alt.Chart(data.sample(SAMPLE_SIZE))\
.mark_bar()\
.encode(
alt.X("rtt:Q", bin=alt.Bin(extent=[0, 600], step=20), title='Round-Trip-Time (ms)'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/rtts-focus.png')
# In[10]:
utils.show_chart('./img/rtts-focus.png')
# ### II.2 Round-Trip-Time (RTT) boxplots by country
#
# * **With peaks and outliers**
# In[714]:
alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_boxplot().encode(
alt.X('rtt:Q',title='Round-Trip-Time (ms)'),
alt.Y(field = 'country_name', type = 'nominal',
sort=alt.EncodingSortField(field='rtt', op='max', order='ascending'),
title='Country'))\
.properties(
width = 0.8*WIDTH,
height = 550).save('./img/rtts-boxplot-by-country.png')
# In[11]:
utils.show_chart('./img/rtts-boxplot-by-country.png')
# * **Focusing on Interquartile range and median**
#
# Let's forget outliers and peaks for now and focus on Q1, Q2, Q3.
# In[716]:
points = alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'country_name',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='Country'
)
).properties(
width=0.8*WIDTH,
height=600
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars).save('./img/rtts-iqr-by-country.png')
# In[12]:
utils.show_chart('./img/rtts-iqr-by-country.png')
# ### II.1 Probes locations
#
# In[718]:
african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits')
# US states background
background = alt.Chart(african_countries).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
width=WIDTH,
height=500
).project('equirectangular')
# probes positions on background
points = alt.Chart(df_probes).mark_circle(
size=50,
color='steelblue'
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
(background + points).configure_view(stroke="transparent").save('./img/probes-locations.png')
# In[13]:
utils.show_chart('./img/probes-locations.png')
# ### II.4 Packet loss
#
#
# Below is considered a lost packet when:
# * no result;
# * packet late;
# * timed out.
# In[91]:
data_valid = data[data['measurement_failed'] == False]
data_valid['packets_lost'] = data_valid['nb_packets'] - data_valid['nb_rtts']
# In[92]:
def perc_loss(df):
return 100 * sum(df['packets_lost']) / sum(df['nb_packets'])
# In[93]:
print("Percentage of packets lost: {:.2f}".format(perc_loss(data_valid)))
# In[723]:
packet_loss_country = data_valid.groupby(['country_name'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
.sort_values(by='% of packet loss', ascending=False)
print(packet_loss_country)
# In[724]:
alt.Chart(packet_loss_country.reset_index()).mark_bar().encode(
x=alt.X('% of packet loss', title='Packet loss (%)'),
y=alt.Y(
'country_name:N',
sort=alt.EncodingSortField(
field='% of packet loss',
order='descending'
),
title='Country'
)
).properties(
width=0.8*WIDTH,
height=600
).save('./img/packet-loss-per-country.png')
# In[14]:
utils.show_chart('./img/packet-loss-per-country.png')
# ## III. Analysing National Education Research Networks (NREN) probes latency
#
#
# For further information on NRENS: https://en.wikipedia.org/wiki/National_research_and_education_network
# ### III.1 NRENs identification process
#
# To identify the African NRENs, the following resources have been used:
# * https://en.wikipedia.org/wiki/National_research_and_education_network
# * https://www.africaconnect2.net/Partners/African_NRENs/Pages/Home.aspx
# * https://bgpview.io
# * https://bgpview.io/asn/36944#downstreams-v4 (UbuntuNet NRENs)
# In[20]:
nren_probes = [15328, 30177, 13114, 13218, 12344, 14712, 19592, 6369, 33838, 33284, 3461, 13711, 21673]
print(str(len(nren_probes)) + ' NRENs probes identified.')
# * **Adding a new column testing probe's NREN membership**
# In[21]:
data['is_nren'] = data['prb_id'].isin(nren_probes)
data.head(2)
# ### III.2 NREN probes location
#
# In[23]:
df_probes['is_nren'] = df_probes['prb_id'].isin(nren_probes)
# In[731]:
african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits')
# US states background
background = alt.Chart(african_countries).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
width=WIDTH,
height=500
).project('equirectangular')
# probes positions on background
points_not_nren = alt.Chart(df_probes[df_probes['is_nren'] == False]).mark_circle(
size=30, fill='steelblue', stroke='white', strokeWidth=0.5, opacity=0.8
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
points_nren = alt.Chart(df_probes[df_probes['is_nren'] == True]).mark_point(
size=300, fill='#c44e51', opacity=1, shape='triangle-up'
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
# NREN probes in red and non NREN probes in blue
(background + points_nren + points_not_nren)\
.configure_view(stroke="transparent")\
.save('./img/probes-locations-nrens.png')
# In[25]:
utils.show_chart('./img/probes-locations-nrens.png')
# * **Select only countries where NREN probes have been identified**
# In[27]:
countries_nren = list(data[data['is_nren'] == True]['country_code'].unique())
# In[28]:
data_nren = data[data['country_code'].isin(countries_nren)]
# * **Nb. of records per country per type of probes (belonging to nren or not)**
# In[29]:
data_nren[data_nren['rtt'] > THRESHOLD_RTT]\
.groupby(['country_name', 'is_nren'])\
.size()
# ### III.3 Comparing RTTs of NREN probes vs. others
#
# * **What percentage of NRENs measurements?**
# In[94]:
print("Percentage of NRENs measurements: {:.2f}".format(100 * sum(data_nren['is_nren'] == True) / len(data_nren)))
# In[736]:
points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'is_nren',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='NREN membership'
)
).properties(
width=0.8*WIDTH,
height=100
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars).save('./img/rtt-nrens-vs-others.png')
# In[31]:
utils.show_chart('./img/rtt-nrens-vs-others.png')
# ### III.4 Comparing RTTs of NREN probes vs. others per country
#
# In[181]:
points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'is_nren',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='',
axis=None
),
color=alt.Color('is_nren', legend=None, scale=alt.Scale(
domain=['true', 'false'],
range=['#c44e51','steelblue']))
).properties(
width=0.8*WIDTH,
height=80
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars)\
.facet(
row='country_name:N')\
.configure_axis(
labelFontSize=15,\
titleFontSize=15,\
gridDash=[4,4]\
)\
.configure_header(
titleFontSize=0,\
labelFontSize=15,\
labelLimit=80,\
title=None
)\
.save('./img/rtt-nrens-vs-others-by-country.png')
# In[182]:
utils.show_chart('./img/rtt-nrens-vs-others-by-country.png')
# ### III.5 Comparing Packet Loss of NREN probes vs. others
#
# * **Comparing mean Packet Loss**
# In[34]:
data_nren_valid = data_nren[data_nren['measurement_failed'] == False]
data_nren_valid['packets_lost'] = data_nren_valid['nb_packets'] - data_nren_valid['nb_rtts']
# In[37]:
packet_loss_nren_country = data_nren_valid.groupby(['is_nren'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
.sort_values(by='% of packet loss', ascending=False)
print(packet_loss_nren_country)
# ### III.6 Comparing Packet Loss of NREN probes vs. others per country
#
# * **Comparing mean Packet Loss per country**
# In[302]:
packet_loss_per_nren_per_country = data_nren_valid.groupby(['country_name', 'is_nren'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
packet_loss_per_nren_per_country
# In[305]:
alt.Chart(packet_loss_per_nren_per_country.reset_index())\
.mark_bar(height=20)\
.encode(
x=alt.X('% of packet loss', title='Packet loss (%)'),
y=alt.Y('is_nren:N', title='', axis=None),
color=alt.Color('is_nren', legend=None, scale=alt.Scale(domain=['true', 'false'], range=['#c44e51','steelblue'])),
row=alt.Row('country_name:N', sort=['Kenya', 'Algeria', 'Sudan', 'Morocco', 'South Africa',\
'Zambia', 'Madagascar', 'Tanzania, United Republic of']))\
.properties(
width=0.8*WIDTH,
height=70)\
.configure_axis(
labelFontSize=15,\
titleFontSize=15,\
gridDash=[4,4]\
)\
.configure_header(
titleFontSize=0,\
labelFontSize=15,\
labelLimit=80,\
title=None
).save('./img/packet-loss-nrens-vs-others-by-country.png')
# In[306]:
utils.show_chart('./img/packet-loss-nrens-vs-others-by-country.png')
# ### III.7 Is there enough evidence at this stage that a NREN make a difference?
#
# * **Hypothesis testing**
#
# H0: there is no difference of mean of % of packet loss between NRENs vs. non-NRENS measurements.
# In[39]:
# Observed difference of % of packet loss between NRENs and not-NRENs
observed_diff = packet_loss_nren_country.diff().values[1][0]
print(observed_diff)
# Let's first simulate that HO Null hypothesis:
# In[62]:
nren_idx = data_nren_valid[data_nren_valid['is_nren'] == True].index.values
# Sample non NREN measurements to get the same number of values as NRENs one as 10x more
not_nren_idx = data_nren_valid[data_nren_valid['is_nren'] == False].sample(len(nren_idx)).index.values
both_idx = np.concatenate((nren_idx, not_nren_idx))
# In[68]:
nb_replicates = 10000
replicates = np.zeros(nb_replicates)
for i in range(nb_replicates):
both_perm_idx = np.random.permutation(both_idx)
perm_sample_nren_idx = both_perm_idx[:len(nren_idx)]
perm_sample_not_nren_idx = both_perm_idx[len(not_nren_idx):]
replicates[i] = perc_loss(data_nren_valid.loc[perm_sample_nren_idx,])-\
perc_loss(data_nren_valid.loc[perm_sample_not_nren_idx,])
# In[69]:
alt.Chart(pd.DataFrame({'x': list(replicates)}))\
.mark_bar()\
.encode(
alt.X("x:Q", title='difference of mean of packet loss between NRENS and not-NRENS under simulated H0'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/estimate-diff-percentage-packet-loss.png')
# In[70]:
utils.show_chart('./img/estimate-diff-percentage-packet-loss.png')
# What's the probability of obtaining a value of our test statistic (difference of mean of packet loss) that is at least as extreme as what was observed, under the assumption the null hypothesis is true?
# In[71]:
print(100 * sum(replicates <= observed_diff) / len(replicates), '%')