#!/usr/bin/env python # coding: utf-8 # ### ANALYSING LATENCY IN AFRICA # USING RIPE NCC RECURRING *TRACEROUTE* MEASUREMENTS FROM PROBES IN AFRICA TO `eu.thethings.network` # ___ # ### TABLE OF CONTENT # # * **[I. Data Loading, preprocessing and cleaning](#data-loading)** # * [I.1 Measurements](#measurements) # * [I.2 Probes description](#probes-description) # * [I.3 Merging measurements and probes](#merge-measurements-probes) # * [I.4 Validation of Rtts vs. speed of light](#rtt-speed-of-light) # * **[II. Preliminary Exploratory Data Analysis (EDA)](#eda)** # * [II.1 Round-Trip-Time (RTT) histograms](#rtts-histogram) # * [II.2 Round-Trip-Time (RTT) boxplots by country](#boxplots-country) # * [II.3 Probes locations](#probes-location) # * [II.4 Packet loss](#packet-loss) # * **[III. Analysing National Education Research Networks probes latency (NREN)](#nren)** # * [II.1 NRENs identification](#nrens-identification) # * [II.2 NREN probes location](#nren-probes-loc) # * [II.3 Comparing RTTs of NREN probes vs. others](#rtt-nren-vs-others) # * [II.4 Comparing RTTs of NREN probes vs. others per country](#rtt-nren-vs-others-country) # * [III.5 Comparing Packet Loss of NREN probes vs. others](#packet-loss-nren-vs-others) # * [III.6 Comparing Packet Loss of NREN probes vs. others per country](#packet-loss-nren-vs-others-per-country) # * [III.7 Is there enough evidence at this stage that a NREN make a difference?](#evidence) # In[1]: from datetime import datetime import os.path import time import requests import numpy as np import pandas as pd import altair as alt from vega_datasets import data as alt_data # for the notebook only (not for JupyterLab) run this command once per session alt.renderers.enable('notebook') alt.data_transformers.disable_max_rows() import utilities as utils import warnings warnings.filterwarnings('ignore') get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # In[2]: # Globar vars WIDTH = 860 AFRICA_TOPOJSON_URL = 'https://raw.githubusercontent.com/deldersveld/topojson/master/continents/africa.json' SAMPLE_SIZE = 10000 # ## I. Data Loading, preprocessing and cleaning # # ### I.1 Measurements # # In[5]: json_data = utils.get_measurements(measurement_id=17468431, start='01/12/2018', stop='31/12/2018') # In[4]: # Number of records print(str(len(json_data)) + ' results') # In[5]: # An exemple of result json_data[0] # * **Measurement results data structure** # # *Note that according to probe's firmware version, returned fields might differ* # # From: https://atlas.ripe.net/docs/data_struct/ # # * `af`: adress familly with possible values: {4: IPv4, 6: IPv6} # * `dst_addr`: IP address of the destination (string) # * `dst_name`: name of the destination (string) # * `from`: IP address of the probe as known by controller (string) # * `fw`: firmware version of the probe # * `group_id`: if the measurement belongs to a group of measurements, the identifier of the group (int) # * `lts`: last time synchronised. How long ago (in seconds) the clock of the probe was found to be in sync with that of a controller. The value -1 is used to indicate that the probe does not know whether it is in sync (int) # * `msm_id`: measurement identifier (int) # * `msm_name`: measurement type "Ping" (string) # * `paris_id`: variation for the Paris mode of traceroute (int) # * `prb_id`: source probe ID (int) # * `proto`: "UDP", "ICMP", or "TCP" (string) # * `result`: list of hop elements (array of objects). Objects have the following fields: # * `hop`: hop number (int) # * `error`: [optional] when an error occurs trying to send a packet. In that case there will not be a result structure. (string) # * `result`: variable content, depending on type of response (array of objects) # objects have the following fields: # * `flags`: "flags" -- (optional) TCP flags in the reply packet, for TCP traceroute, concatenated, in the order 'F' (FIN), 'S' (SYN), 'R' (RST), 'P' (PSH), 'A' (ACK), 'U' (URG) (fw >= 4600) (string) # * `from`: IPv4 or IPv6 source address in reply (string) # * `rtt`: round-trip-time of reply, not present when the response is late in ms (float) # * `size`: size of reply (int) # * `ttl`: time-to-live in reply (int) # * `size`: packet size (int) # * `src_addr`: source address used by probe (string) # * `timestamp`: Unix timestamp for start of measurement (int) # * `type`: "traceroute" (string) # # # # In[6]: # To check a specific record by attribute # list(filter(lambda m: m['endtime'] == 1543619009, json_data)) # * **Checking destination IP address and fetching lon, lat for later use** # In[ ]: # Let's check quickly all probes trace to the same IP address ip_dest = set(map(lambda x: x['dst_addr'], json_data)) print('Destination IP address: ', ip_dest) # Building the url as a long string in one go url_ipinfo = 'http://ipinfo.io/{}?token=your-token'.format(list(ip_dest)[0]) url_ipinfo r = requests.get(url_ipinfo) json_ipinfo = r.json() print(json_ipinfo) lat_dest, lon_dest = map(lambda x: float(x), json_ipinfo['loc'].split(',')) # In[ ]: json_ipinfo # In[106]: # Utilities functions and lambdas # Error codes: https://atlas.ripe.net/docs/data_struct # Filtering/mapping predicates has_result = lambda row: not('error' in row['result'][0]) is_late = lambda pack: 'late' in pack # number of packets a reply is late, in this case rtt is not present (int) is_timed_out = lambda pack: 'x' in pack # {'x': '*'} - Timed out is_error = lambda pack: 'error' in pack # Network, destination,... unreachable is_err = lambda pack: 'err' in pack # Network, destination,... unreachable is_empty = lambda pack: not(bool(pack)) get_result = lambda row: row['result'][0]['result'] get_nb_packets = lambda row: len(get_result(row)) get_max_nb_hops = lambda row: row['result'][0]['hop'] get_rtts = lambda row: list(filter(lambda pack: not(is_late(pack) or is_timed_out(pack) or is_error(pack) or is_err(pack) or is_empty(pack)), get_result(row))) get_nb_late_pack = lambda row: len(list(filter(lambda pack: is_late(pack), get_result(row)))) get_nb_timed_out_pack = lambda row: len(list(filter(lambda pack: is_timed_out(pack), get_result(row)))) get_nb_error_pack = lambda row: len(list(filter(lambda pack: (is_error(pack) or is_err(pack)), get_result(row)))) get_nb_empty_pack = lambda row: len(list(filter(lambda pack: is_empty(pack), get_result(row)))) get_nb_rtts = lambda row: len(get_rtts(row)) get_rtts_mean = lambda row: np.mean(list(map(lambda pack: pack['rtt'], get_rtts(row)))) get_ttl = lambda row: get_rtts(row)[0]['ttl'] to_datetime = lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') # In[131]: # Postprocess json data measurements = [] for row in json_data: #is_valid = has_result(row) and get_nb_rtts(row) is_valid = has_result(row) measurements.append( {'prb_id': row['prb_id'], 'ip_from': row['from'], 'paris_id': row['paris_id'], 'datetime': to_datetime(row['timestamp']), 'start_time': row['timestamp'], 'end_time': row['endtime'], 'last_time_sync': row['lts'], 'firmware_version': row['fw'], 'nb_packets': get_nb_packets(row) if is_valid else np.NaN, 'nb_rtts': get_nb_rtts(row) if is_valid else np.NaN, 'rtt': get_rtts_mean(row) if is_valid else np.NaN, 'measurement_failed': not(is_valid), 'nb_late_pack': get_nb_late_pack(row) if is_valid else np.NaN, 'nb_timed_out_pack': get_nb_timed_out_pack(row) if is_valid else np.NaN, 'nb_error_pack': get_nb_error_pack(row) if is_valid else np.NaN, 'nb_empty_pack': get_nb_empty_pack(row) if is_valid else np.NaN, #'nb_hops': get_max_nb_hops(row) - get_ttl(row) if is_valid else np.NaN }) # In[132]: # Convert to Pandas dataframe columns = ['datetime', 'prb_id', 'ip_from', 'paris_id', 'start_time', 'end_time', 'last_time_sync', 'firmware_version', 'nb_packets', 'nb_rtts', 'rtt', 'measurement_failed', 'nb_late_pack', 'nb_timed_out_pack', 'nb_error_pack','nb_empty_pack'] df = pd.DataFrame(measurements, columns=columns) df['datetime'] = pd.to_datetime(df['datetime']) df.set_index('datetime', inplace=True) df.sort_index(inplace=True) # In[692]: df.head() # ### I.2 Probes description # # In[693]: # Get list of probes involved in measurements probes_id = list(df['prb_id'].unique()) # In[694]: json_data_probe = utils.get_probes(probes_id) # In[695]: # An example of probe's description. json_data_probe['results'][0] # In[19]: # Postprocess json data probes = [] for i, row in enumerate(json_data_probe['results']): probes.append( {'prb_id': row['id'], 'ip_v4': row['address_v4'], 'asn': str(row['asn_v4']), 'country_code': row['country_code'], 'description': row['description'], 'lon': row['geometry']['coordinates'][0], 'lat': row['geometry']['coordinates'][1]}) # In[20]: # Convert to Pandas dataframe columns = ['prb_id','ip_v4', 'asn', 'country_code', 'description', 'lon', 'lat'] df_probes = pd.DataFrame(probes, columns=columns) # In[21]: print('Number of countries: ', len(df_probes['country_code'].unique())) print(df_probes['country_code'].unique()) # In[22]: # Loading country codes country_codes = pd.read_csv('https://raw.githubusercontent.com/franckalbinet/' + 'latency-internet-africa/master/data/country_codes.csv') country_codes.head() # In[23]: # Joining/merging with country code to get full country name df_probes = pd.merge(df_probes, country_codes[['name', 'alpha-2']], left_on='country_code', right_on='alpha-2', how='left') df_probes = df_probes.drop(['country_code'], axis=1) df_probes.rename(columns={'alpha-2':'country_code', 'name': 'country_name'}, inplace=True) df_probes.head() # * **It looks like we don't get the IP address from all probes. Let's use the ip address provided in measurement results instead.** # In[24]: # Note that each probe can have several IP addresses. We keep only one. ip_from_measurement = df[['prb_id','ip_from']].reset_index().drop(columns=['datetime']).drop_duplicates(subset='prb_id') # In[25]: df_probes = pd.merge(df_probes, ip_from_measurement, left_on='prb_id', right_on='prb_id', how='left') # In[26]: df_probes.head() # In[96]: ipinfo_lookup = [] for ip in df_probes['ip_from'].values: r = requests.get('http://ipinfo.io/{}?token=your-token'.format(ip)) ipinfo_lookup.append(r.json()) # In[97]: df_ip_info = pd.DataFrame(ipinfo_lookup) df_ip_info.head() # In[98]: df_probes = pd.merge(df_probes, df_ip_info, left_on='ip_from', right_on='ip', how='left') df_probes.head() # In[99]: df_probes = df_probes[['prb_id', 'ip', 'asn', 'hostname', 'org', 'description', 'country_name', 'country_code', 'region', 'city', 'lon', 'lat']] # In[100]: df_probes.head() # In[101]: # Saving probes description as csv utils.df_to_csv(df_probes, prefix_name='probes') # ### I.3 Merging measurements and probes # # In[19]: df_probes = pd.read_csv('data/probes-2019-08-23.csv') # In[697]: # Joining/merging with country code to get full country name data = pd.merge(df.reset_index(), df_probes, left_on='prb_id', right_on='prb_id', how='left') # ### I.4 Validation of Rtts vs. speed of light # # In[700]: SPEED_OF_LIGHT = 299792.458 # km/s DIST = 3620 # km (more or less shortest African point from Dublin) print("Light takes: ", int(1000*DIST / SPEED_OF_LIGHT), "ms to travel") # in ms # In[7]: # To be defined THRESHOLD_RTT = 30 # in ms # * **Which probes yield rtts below 30 ms??** # In[702]: data[data['rtt'] < 30]['prb_id'].unique() # * **In which countries??** # In[703]: data[data['rtt'] < 30]['country_name'].unique() # ___ # In[704]: # Remove missing country names - though can be retrieved easily but those countries are # not relevant to ous NRENs vs. others analysis as no identified NRENs in those countries. data.dropna(subset=['country_name'], inplace=True) # In[705]: # Dumping data for further use utils.df_to_csv(data, prefix_name='measurements') # ## II. Preliminary Exploratory Data Analysis (EDA) # # In[3]: data = pd.read_csv('./data/measurements-2019-09-20.csv') # In[4]: # How many measurements and columns data.shape # In[5]: # Quick descriptive statistics data['rtt'].describe() # In[97]: # Time range fetched print("Measurements from: {} to {}".format(min(data['datetime']),max(data['datetime']))) # ### II.1 Round-Trip-Time histograms # # * **All Rtts Range** # In[8]: alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE))\ .mark_bar()\ .encode( alt.X("rtt:Q", bin=alt.Bin(step=100), title='Round-Trip-Time (ms)'), y='count()')\ .properties( width = WIDTH, height = 200)\ .save('./img/all-rtts.png') # In[9]: utils.show_chart('./img/all-rtts.png') # * **Focusing on [0, 600] ms range** # In[712]: alt.Chart(data.sample(SAMPLE_SIZE))\ .mark_bar()\ .encode( alt.X("rtt:Q", bin=alt.Bin(extent=[0, 600], step=20), title='Round-Trip-Time (ms)'), y='count()')\ .properties( width = WIDTH, height = 200)\ .save('./img/rtts-focus.png') # In[10]: utils.show_chart('./img/rtts-focus.png') # ### II.2 Round-Trip-Time (RTT) boxplots by country # # * **With peaks and outliers** # In[714]: alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_boxplot().encode( alt.X('rtt:Q',title='Round-Trip-Time (ms)'), alt.Y(field = 'country_name', type = 'nominal', sort=alt.EncodingSortField(field='rtt', op='max', order='ascending'), title='Country'))\ .properties( width = 0.8*WIDTH, height = 550).save('./img/rtts-boxplot-by-country.png') # In[11]: utils.show_chart('./img/rtts-boxplot-by-country.png') # * **Focusing on Interquartile range and median** # # Let's forget outliers and peaks for now and focus on Q1, Q2, Q3. # In[716]: points = alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point( filled=True, color='steelblue', size=80 ).encode( x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'), y=alt.Y( 'country_name', sort=alt.EncodingSortField( field='rtt', op='median', order='ascending' ), title='Country' ) ).properties( width=0.8*WIDTH, height=600 ) error_bars = points.mark_rule(size=1.5, color='steelblue').encode( x='q1(rtt)', x2='q3(rtt)', ) (points + error_bars).save('./img/rtts-iqr-by-country.png') # In[12]: utils.show_chart('./img/rtts-iqr-by-country.png') # ### II.1 Probes locations # # In[718]: african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits') # US states background background = alt.Chart(african_countries).mark_geoshape( fill='lightgray', stroke='white' ).properties( width=WIDTH, height=500 ).project('equirectangular') # probes positions on background points = alt.Chart(df_probes).mark_circle( size=50, color='steelblue' ).encode( longitude='lon:Q', latitude='lat:Q' ) (background + points).configure_view(stroke="transparent").save('./img/probes-locations.png') # In[13]: utils.show_chart('./img/probes-locations.png') # ### II.4 Packet loss # # # Below is considered a lost packet when: # * no result; # * packet late; # * timed out. # In[91]: data_valid = data[data['measurement_failed'] == False] data_valid['packets_lost'] = data_valid['nb_packets'] - data_valid['nb_rtts'] # In[92]: def perc_loss(df): return 100 * sum(df['packets_lost']) / sum(df['nb_packets']) # In[93]: print("Percentage of packets lost: {:.2f}".format(perc_loss(data_valid))) # In[723]: packet_loss_country = data_valid.groupby(['country_name'])\ .apply(perc_loss)\ .to_frame(name='% of packet loss')\ .sort_values(by='% of packet loss', ascending=False) print(packet_loss_country) # In[724]: alt.Chart(packet_loss_country.reset_index()).mark_bar().encode( x=alt.X('% of packet loss', title='Packet loss (%)'), y=alt.Y( 'country_name:N', sort=alt.EncodingSortField( field='% of packet loss', order='descending' ), title='Country' ) ).properties( width=0.8*WIDTH, height=600 ).save('./img/packet-loss-per-country.png') # In[14]: utils.show_chart('./img/packet-loss-per-country.png') # ## III. Analysing National Education Research Networks (NREN) probes latency # # # For further information on NRENS: https://en.wikipedia.org/wiki/National_research_and_education_network # ### III.1 NRENs identification process # # To identify the African NRENs, the following resources have been used: # * https://en.wikipedia.org/wiki/National_research_and_education_network # * https://www.africaconnect2.net/Partners/African_NRENs/Pages/Home.aspx # * https://bgpview.io # * https://bgpview.io/asn/36944#downstreams-v4 (UbuntuNet NRENs) # In[20]: nren_probes = [15328, 30177, 13114, 13218, 12344, 14712, 19592, 6369, 33838, 33284, 3461, 13711, 21673] print(str(len(nren_probes)) + ' NRENs probes identified.') # * **Adding a new column testing probe's NREN membership** # In[21]: data['is_nren'] = data['prb_id'].isin(nren_probes) data.head(2) # ### III.2 NREN probes location # # In[23]: df_probes['is_nren'] = df_probes['prb_id'].isin(nren_probes) # In[731]: african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits') # US states background background = alt.Chart(african_countries).mark_geoshape( fill='lightgray', stroke='white' ).properties( width=WIDTH, height=500 ).project('equirectangular') # probes positions on background points_not_nren = alt.Chart(df_probes[df_probes['is_nren'] == False]).mark_circle( size=30, fill='steelblue', stroke='white', strokeWidth=0.5, opacity=0.8 ).encode( longitude='lon:Q', latitude='lat:Q' ) points_nren = alt.Chart(df_probes[df_probes['is_nren'] == True]).mark_point( size=300, fill='#c44e51', opacity=1, shape='triangle-up' ).encode( longitude='lon:Q', latitude='lat:Q' ) # NREN probes in red and non NREN probes in blue (background + points_nren + points_not_nren)\ .configure_view(stroke="transparent")\ .save('./img/probes-locations-nrens.png') # In[25]: utils.show_chart('./img/probes-locations-nrens.png') # * **Select only countries where NREN probes have been identified** # In[27]: countries_nren = list(data[data['is_nren'] == True]['country_code'].unique()) # In[28]: data_nren = data[data['country_code'].isin(countries_nren)] # * **Nb. of records per country per type of probes (belonging to nren or not)** # In[29]: data_nren[data_nren['rtt'] > THRESHOLD_RTT]\ .groupby(['country_name', 'is_nren'])\ .size() # ### III.3 Comparing RTTs of NREN probes vs. others # # * **What percentage of NRENs measurements?** # In[94]: print("Percentage of NRENs measurements: {:.2f}".format(100 * sum(data_nren['is_nren'] == True) / len(data_nren))) # In[736]: points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point( filled=True, color='steelblue', size=80 ).encode( x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'), y=alt.Y( 'is_nren', sort=alt.EncodingSortField( field='rtt', op='median', order='ascending' ), title='NREN membership' ) ).properties( width=0.8*WIDTH, height=100 ) error_bars = points.mark_rule(size=1.5, color='steelblue').encode( x='q1(rtt)', x2='q3(rtt)', ) (points + error_bars).save('./img/rtt-nrens-vs-others.png') # In[31]: utils.show_chart('./img/rtt-nrens-vs-others.png') # ### III.4 Comparing RTTs of NREN probes vs. others per country # # In[181]: points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point( filled=True, color='steelblue', size=80 ).encode( x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'), y=alt.Y( 'is_nren', sort=alt.EncodingSortField( field='rtt', op='median', order='ascending' ), title='', axis=None ), color=alt.Color('is_nren', legend=None, scale=alt.Scale( domain=['true', 'false'], range=['#c44e51','steelblue'])) ).properties( width=0.8*WIDTH, height=80 ) error_bars = points.mark_rule(size=1.5, color='steelblue').encode( x='q1(rtt)', x2='q3(rtt)', ) (points + error_bars)\ .facet( row='country_name:N')\ .configure_axis( labelFontSize=15,\ titleFontSize=15,\ gridDash=[4,4]\ )\ .configure_header( titleFontSize=0,\ labelFontSize=15,\ labelLimit=80,\ title=None )\ .save('./img/rtt-nrens-vs-others-by-country.png') # In[182]: utils.show_chart('./img/rtt-nrens-vs-others-by-country.png') # ### III.5 Comparing Packet Loss of NREN probes vs. others # # * **Comparing mean Packet Loss** # In[34]: data_nren_valid = data_nren[data_nren['measurement_failed'] == False] data_nren_valid['packets_lost'] = data_nren_valid['nb_packets'] - data_nren_valid['nb_rtts'] # In[37]: packet_loss_nren_country = data_nren_valid.groupby(['is_nren'])\ .apply(perc_loss)\ .to_frame(name='% of packet loss')\ .sort_values(by='% of packet loss', ascending=False) print(packet_loss_nren_country) # ### III.6 Comparing Packet Loss of NREN probes vs. others per country # # * **Comparing mean Packet Loss per country** # In[302]: packet_loss_per_nren_per_country = data_nren_valid.groupby(['country_name', 'is_nren'])\ .apply(perc_loss)\ .to_frame(name='% of packet loss')\ packet_loss_per_nren_per_country # In[305]: alt.Chart(packet_loss_per_nren_per_country.reset_index())\ .mark_bar(height=20)\ .encode( x=alt.X('% of packet loss', title='Packet loss (%)'), y=alt.Y('is_nren:N', title='', axis=None), color=alt.Color('is_nren', legend=None, scale=alt.Scale(domain=['true', 'false'], range=['#c44e51','steelblue'])), row=alt.Row('country_name:N', sort=['Kenya', 'Algeria', 'Sudan', 'Morocco', 'South Africa',\ 'Zambia', 'Madagascar', 'Tanzania, United Republic of']))\ .properties( width=0.8*WIDTH, height=70)\ .configure_axis( labelFontSize=15,\ titleFontSize=15,\ gridDash=[4,4]\ )\ .configure_header( titleFontSize=0,\ labelFontSize=15,\ labelLimit=80,\ title=None ).save('./img/packet-loss-nrens-vs-others-by-country.png') # In[306]: utils.show_chart('./img/packet-loss-nrens-vs-others-by-country.png') # ### III.7 Is there enough evidence at this stage that a NREN make a difference? # # * **Hypothesis testing** # # H0: there is no difference of mean of % of packet loss between NRENs vs. non-NRENS measurements. # In[39]: # Observed difference of % of packet loss between NRENs and not-NRENs observed_diff = packet_loss_nren_country.diff().values[1][0] print(observed_diff) # Let's first simulate that HO Null hypothesis: # In[62]: nren_idx = data_nren_valid[data_nren_valid['is_nren'] == True].index.values # Sample non NREN measurements to get the same number of values as NRENs one as 10x more not_nren_idx = data_nren_valid[data_nren_valid['is_nren'] == False].sample(len(nren_idx)).index.values both_idx = np.concatenate((nren_idx, not_nren_idx)) # In[68]: nb_replicates = 10000 replicates = np.zeros(nb_replicates) for i in range(nb_replicates): both_perm_idx = np.random.permutation(both_idx) perm_sample_nren_idx = both_perm_idx[:len(nren_idx)] perm_sample_not_nren_idx = both_perm_idx[len(not_nren_idx):] replicates[i] = perc_loss(data_nren_valid.loc[perm_sample_nren_idx,])-\ perc_loss(data_nren_valid.loc[perm_sample_not_nren_idx,]) # In[69]: alt.Chart(pd.DataFrame({'x': list(replicates)}))\ .mark_bar()\ .encode( alt.X("x:Q", title='difference of mean of packet loss between NRENS and not-NRENS under simulated H0'), y='count()')\ .properties( width = WIDTH, height = 200)\ .save('./img/estimate-diff-percentage-packet-loss.png') # In[70]: utils.show_chart('./img/estimate-diff-percentage-packet-loss.png') # What's the probability of obtaining a value of our test statistic (difference of mean of packet loss) that is at least as extreme as what was observed, under the assumption the null hypothesis is true? # In[71]: print(100 * sum(replicates <= observed_diff) / len(replicates), '%')