USING RIPE NCC RECURRING TRACEROUTE MEASUREMENTS FROM PROBES IN AFRICA TO eu.thethings.network
___
from datetime import datetime
import os.path
import time
import requests
import numpy as np
import pandas as pd
import altair as alt
from vega_datasets import data as alt_data
# for the notebook only (not for JupyterLab) run this command once per session
alt.renderers.enable('notebook')
alt.data_transformers.disable_max_rows()
import utilities as utils
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2
# Globar vars
WIDTH = 860
AFRICA_TOPOJSON_URL = 'https://raw.githubusercontent.com/deldersveld/topojson/master/continents/africa.json'
SAMPLE_SIZE = 10000
json_data = utils.get_measurements(measurement_id=17468431, start='01/12/2018', stop='31/12/2018')
Fetching data from: https://atlas.ripe.net/api/v2/measurements/17468431/results/?start=1543618800&stop=1546210800&probe_ids=None
# Number of records
print(str(len(json_data)) + ' results')
607324 results
# An exemple of result
json_data[0]
{'af': 4, 'dst_addr': '52.169.76.255', 'dst_name': '52.169.76.255', 'endtime': 1543618976, 'from': '169.0.103.214', 'fw': 4940, 'group_id': 17468431, 'lts': 8, 'msm_id': 17468431, 'msm_name': 'Traceroute', 'paris_id': 3, 'prb_id': 21682, 'proto': 'TCP', 'result': [{'hop': 64, 'result': [{'flags': 'SA', 'from': '52.169.76.255', 'hdropts': [{'mss': 1440}], 'rtt': 176.065, 'size': 4, 'ttl': 50}, {'flags': 'SA', 'from': '52.169.76.255', 'hdropts': [{'mss': 1440}], 'rtt': 176.794, 'size': 4, 'ttl': 50}, {'flags': 'SA', 'from': '52.169.76.255', 'hdropts': [{'mss': 1440}], 'rtt': 178.843, 'size': 4, 'ttl': 50}]}], 'size': 0, 'src_addr': '192.168.3.14', 'stored_timestamp': 1543619072, 'timestamp': 1543618975, 'type': 'traceroute'}
Note that according to probe's firmware version, returned fields might differ
From: https://atlas.ripe.net/docs/data_struct/
af
: adress familly with possible values: {4: IPv4, 6: IPv6}dst_addr
: IP address of the destination (string)dst_name
: name of the destination (string)from
: IP address of the probe as known by controller (string)fw
: firmware version of the probegroup_id
: if the measurement belongs to a group of measurements, the identifier of the group (int)lts
: last time synchronised. How long ago (in seconds) the clock of the probe was found to be in sync with that of a controller. The value -1 is used to indicate that the probe does not know whether it is in sync (int)msm_id
: measurement identifier (int)msm_name
: measurement type "Ping" (string)paris_id
: variation for the Paris mode of traceroute (int)prb_id
: source probe ID (int)proto
: "UDP", "ICMP", or "TCP" (string)result
: list of hop elements (array of objects). Objects have the following fields:hop
: hop number (int)error
: [optional] when an error occurs trying to send a packet. In that case there will not be a result structure. (string)result
: variable content, depending on type of response (array of objects)objects have the following fields:
* flags
: "flags" -- (optional) TCP flags in the reply packet, for TCP traceroute, concatenated, in the order 'F' (FIN), 'S' (SYN), 'R' (RST), 'P' (PSH), 'A' (ACK), 'U' (URG) (fw >= 4600) (string)
* from
: IPv4 or IPv6 source address in reply (string)
* rtt
: round-trip-time of reply, not present when the response is late in ms (float)
* size
: size of reply (int)
* ttl
: time-to-live in reply (int)
* size
: packet size (int)
* src_addr
: source address used by probe (string)
* timestamp
: Unix timestamp for start of measurement (int)
* type
: "traceroute" (string)
# To check a specific record by attribute
# list(filter(lambda m: m['endtime'] == 1543619009, json_data))
# Let's check quickly all probes trace to the same IP address
ip_dest = set(map(lambda x: x['dst_addr'], json_data))
print('Destination IP address: ', ip_dest)
# Building the url as a long string in one go
url_ipinfo = 'http://ipinfo.io/{}?token=your-token'.format(list(ip_dest)[0])
url_ipinfo
r = requests.get(url_ipinfo)
json_ipinfo = r.json()
print(json_ipinfo)
lat_dest, lon_dest = map(lambda x: float(x), json_ipinfo['loc'].split(','))
Destination IP address: {'52.169.76.255'} {'ip': '52.169.76.255', 'city': 'Dublin', 'region': 'Leinster', 'country': 'IE', 'loc': '53.3331,-6.2489', 'org': 'AS8075 Microsoft Corporation', 'postal': 'D02'}
json_ipinfo
{'city': 'Dublin', 'country': 'IE', 'ip': '52.169.76.255', 'loc': '53.3331,-6.2489', 'org': 'AS8075 Microsoft Corporation', 'postal': 'D02', 'region': 'Leinster'}
# Utilities functions and lambdas
# Error codes: https://atlas.ripe.net/docs/data_struct
# Filtering/mapping predicates
has_result = lambda row: not('error' in row['result'][0])
is_late = lambda pack: 'late' in pack # number of packets a reply is late, in this case rtt is not present (int)
is_timed_out = lambda pack: 'x' in pack # {'x': '*'} - Timed out
is_error = lambda pack: 'error' in pack # Network, destination,... unreachable
is_err = lambda pack: 'err' in pack # Network, destination,... unreachable
is_empty = lambda pack: not(bool(pack))
get_result = lambda row: row['result'][0]['result']
get_nb_packets = lambda row: len(get_result(row))
get_max_nb_hops = lambda row: row['result'][0]['hop']
get_rtts = lambda row: list(filter(lambda pack: not(is_late(pack) or
is_timed_out(pack) or
is_error(pack) or
is_err(pack) or
is_empty(pack)), get_result(row)))
get_nb_late_pack = lambda row: len(list(filter(lambda pack: is_late(pack), get_result(row))))
get_nb_timed_out_pack = lambda row: len(list(filter(lambda pack: is_timed_out(pack), get_result(row))))
get_nb_error_pack = lambda row: len(list(filter(lambda pack: (is_error(pack) or
is_err(pack)), get_result(row))))
get_nb_empty_pack = lambda row: len(list(filter(lambda pack: is_empty(pack), get_result(row))))
get_nb_rtts = lambda row: len(get_rtts(row))
get_rtts_mean = lambda row: np.mean(list(map(lambda pack: pack['rtt'], get_rtts(row))))
get_ttl = lambda row: get_rtts(row)[0]['ttl']
to_datetime = lambda x: datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
# Postprocess json data
measurements = []
for row in json_data:
#is_valid = has_result(row) and get_nb_rtts(row)
is_valid = has_result(row)
measurements.append(
{'prb_id': row['prb_id'],
'ip_from': row['from'],
'paris_id': row['paris_id'],
'datetime': to_datetime(row['timestamp']),
'start_time': row['timestamp'],
'end_time': row['endtime'],
'last_time_sync': row['lts'],
'firmware_version': row['fw'],
'nb_packets': get_nb_packets(row) if is_valid else np.NaN,
'nb_rtts': get_nb_rtts(row) if is_valid else np.NaN,
'rtt': get_rtts_mean(row) if is_valid else np.NaN,
'measurement_failed': not(is_valid),
'nb_late_pack': get_nb_late_pack(row) if is_valid else np.NaN,
'nb_timed_out_pack': get_nb_timed_out_pack(row) if is_valid else np.NaN,
'nb_error_pack': get_nb_error_pack(row) if is_valid else np.NaN,
'nb_empty_pack': get_nb_empty_pack(row) if is_valid else np.NaN,
#'nb_hops': get_max_nb_hops(row) - get_ttl(row) if is_valid else np.NaN
})
# Convert to Pandas dataframe
columns = ['datetime', 'prb_id', 'ip_from', 'paris_id', 'start_time',
'end_time', 'last_time_sync', 'firmware_version', 'nb_packets', 'nb_rtts',
'rtt', 'measurement_failed', 'nb_late_pack', 'nb_timed_out_pack',
'nb_error_pack','nb_empty_pack']
df = pd.DataFrame(measurements, columns=columns)
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
df.sort_index(inplace=True)
df.head()
prb_id | ip_from | paris_id | start_time | end_time | last_time_sync | firmware_version | nb_packets | nb_rtts | rtt | measurement_failed | nb_late_pack | nb_timed_out_pack | nb_error_pack | nb_empty_pack | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
datetime | |||||||||||||||
2018-11-30 23:02:49 | 35067 | 41.248.208.162 | 8 | 1543618969 | 1543618970 | 204 | 4780 | 3.0 | 3.0 | 94.480667 | False | 0.0 | 0.0 | 0.0 | 0.0 |
2018-11-30 23:02:55 | 21682 | 169.0.103.214 | 3 | 1543618975 | 1543618976 | 8 | 4940 | 3.0 | 3.0 | 177.234000 | False | 0.0 | 0.0 | 0.0 | 0.0 |
2018-11-30 23:03:00 | 33073 | 196.15.205.17 | 3 | 1543618980 | 1543618980 | 48 | 4940 | 3.0 | 3.0 | 172.731000 | False | 0.0 | 0.0 | 0.0 | 0.0 |
2018-11-30 23:03:02 | 50355 | 155.93.192.173 | 4 | 1543618982 | 1543618982 | 49 | 4940 | 3.0 | 3.0 | 178.559000 | False | 0.0 | 0.0 | 0.0 | 0.0 |
2018-11-30 23:03:03 | 11383 | 196.1.95.103 | 3 | 1543618983 | 1543618984 | 91 | 4940 | 3.0 | 3.0 | 96.144667 | False | 0.0 | 0.0 | 0.0 | 0.0 |
# Get list of probes involved in measurements
probes_id = list(df['prb_id'].unique())
json_data_probe = utils.get_probes(probes_id)
# An example of probe's description.
json_data_probe['results'][0]
{'address_v4': '196.1.95.16', 'address_v6': '2001:4278:1000:1::16', 'asn_v4': 8346, 'asn_v6': 8346, 'country_code': 'SN', 'description': 'UCAD Probe', 'first_connected': 1291147153, 'geometry': {'type': 'Point', 'coordinates': [-17.4415, 14.6715]}, 'id': 239, 'is_anchor': False, 'is_public': True, 'last_connected': 1568990096, 'prefix_v4': '196.1.92.0/22', 'prefix_v6': '2001:4278::/32', 'status': {'id': 1, 'name': 'Connected', 'since': '2019-09-19T22:17:28Z'}, 'status_since': 1568931448, 'tags': [{'name': 'Office', 'slug': 'office'}, {'name': 'No NAT', 'slug': 'no-nat'}, {'name': 'system: V1', 'slug': 'system-v1'}, {'name': 'system: Resolves A Correctly', 'slug': 'system-resolves-a-correctly'}, {'name': 'system: Resolves AAAA Correctly', 'slug': 'system-resolves-aaaa-correctly'}, {'name': 'system: IPv4 Works', 'slug': 'system-ipv4-works'}, {'name': "system: IPv6 Doesn't Work", 'slug': 'system-ipv6-doesnt-work'}, {'name': 'system: IPv4 Capable', 'slug': 'system-ipv4-capable'}, {'name': 'system: IPv6 Capable', 'slug': 'system-ipv6-capable'}], 'total_uptime': 236903509, 'type': 'Probe'}
# Postprocess json data
probes = []
for i, row in enumerate(json_data_probe['results']):
probes.append(
{'prb_id': row['id'],
'ip_v4': row['address_v4'],
'asn': str(row['asn_v4']),
'country_code': row['country_code'],
'description': row['description'],
'lon': row['geometry']['coordinates'][0],
'lat': row['geometry']['coordinates'][1]})
# Convert to Pandas dataframe
columns = ['prb_id','ip_v4', 'asn', 'country_code', 'description', 'lon', 'lat']
df_probes = pd.DataFrame(probes, columns=columns)
print('Number of countries: ', len(df_probes['country_code'].unique()))
print(df_probes['country_code'].unique())
Number of countries: 36 ['SN' 'ZA' 'MU' 'TN' 'BW' 'CM' 'GH' 'TZ' 'KE' 'BF' 'UG' 'RW' 'NG' 'CG' 'BJ' 'MA' 'BI' 'MZ' 'ET' 'ZM' 'SZ' 'AO' 'MW' 'LS' 'NA' 'MG' 'SD' 'SS' 'SC' 'CD' 'DZ' 'ZW' 'GM' 'TG' 'EG' 'DJ']
# Loading country codes
country_codes = pd.read_csv('https://raw.githubusercontent.com/franckalbinet/' +
'latency-internet-africa/master/data/country_codes.csv')
country_codes.head()
name | alpha-2 | alpha-3 | country-code | iso_3166-2 | region | sub-region | intermediate-region | region-code | sub-region-code | intermediate-region-code | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | NaN | 142.0 | 34.0 | NaN |
1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | NaN | 150.0 | 154.0 | NaN |
2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | NaN | 150.0 | 39.0 | NaN |
3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | NaN | 2.0 | 15.0 | NaN |
4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | NaN | 9.0 | 61.0 | NaN |
# Joining/merging with country code to get full country name
df_probes = pd.merge(df_probes, country_codes[['name', 'alpha-2']], left_on='country_code',
right_on='alpha-2', how='left')
df_probes = df_probes.drop(['country_code'], axis=1)
df_probes.rename(columns={'alpha-2':'country_code', 'name': 'country_name'}, inplace=True)
df_probes.head()
prb_id | ip_v4 | asn | description | lon | lat | country_name | country_code | |
---|---|---|---|---|---|---|---|---|
0 | 239 | 196.1.95.16 | 8346 | UCAD Probe | -17.4415 | 14.6715 | Senegal | SN |
1 | 242 | None | 328453 | 1 Belvedere | 18.4895 | -33.9815 | South Africa | ZA |
2 | 446 | 196.192.112.229 | 37708 | AFRINIC Mauritius | 57.4995 | -20.2395 | Mauritius | MU |
3 | 473 | 196.4.161.3 | 3741 | Internet Solutions [http://www.is.co.za] - Ros... | 28.0405 | -26.1515 | South Africa | ZA |
4 | 504 | 193.95.97.132 | 2609 | ATI, KASBAH, 1G/s | 10.1675 | 36.7995 | Tunisia | TN |
# Note that each probe can have several IP addresses. We keep only one.
ip_from_measurement = df[['prb_id','ip_from']].reset_index().drop(columns=['datetime']).drop_duplicates(subset='prb_id')
df_probes = pd.merge(df_probes, ip_from_measurement, left_on='prb_id', right_on='prb_id', how='left')
df_probes.head()
prb_id | ip_v4 | asn | description | lon | lat | country_name | country_code | ip_from | |
---|---|---|---|---|---|---|---|---|---|
0 | 239 | 196.1.95.16 | 8346 | UCAD Probe | -17.4415 | 14.6715 | Senegal | SN | 196.1.95.16 |
1 | 242 | None | 328453 | 1 Belvedere | 18.4895 | -33.9815 | South Africa | ZA | 196.210.11.182 |
2 | 446 | 196.192.112.229 | 37708 | AFRINIC Mauritius | 57.4995 | -20.2395 | Mauritius | MU | 196.192.112.229 |
3 | 473 | 196.4.161.3 | 3741 | Internet Solutions [http://www.is.co.za] - Ros... | 28.0405 | -26.1515 | South Africa | ZA | 196.4.161.3 |
4 | 504 | 193.95.97.132 | 2609 | ATI, KASBAH, 1G/s | 10.1675 | 36.7995 | Tunisia | TN | 193.95.97.132 |
ipinfo_lookup = []
for ip in df_probes['ip_from'].values:
r = requests.get('http://ipinfo.io/{}?token=your-token'.format(ip))
ipinfo_lookup.append(r.json())
df_ip_info = pd.DataFrame(ipinfo_lookup)
df_ip_info.head()
city | country | hostname | ip | loc | org | postal | region | |
---|---|---|---|---|---|---|---|---|
0 | SN | dkr-sn.probe.atlas.ucad.sn | 196.1.95.16 | 14.0000,-14.0000 | AS8346 SONATEL-AS Autonomous System | NaN | ||
1 | Cape Town | ZA | 196-210-11-182.dynamic.isadsl.co.za | 196.210.11.182 | -33.9258,18.4232 | AS3741 Internet Solutions | 7945 | Western Cape |
2 | MU | p446.probes.atlas.ripe.net | 196.192.112.229 | -20.2833,57.5500 | AS37708 African Network Information Center - (... | NaN | ||
3 | ZA | ripe-ncc.is.co.za | 196.4.161.3 | -29.0000,24.0000 | AS3741 Internet Solutions | NaN | ||
4 | Tunis | TN | NaN | 193.95.66.40 | 36.8190,10.1658 | AS2609 Tunisia BackBone AS | NaN | Tūnis |
df_probes = pd.merge(df_probes, df_ip_info, left_on='ip_from', right_on='ip', how='left')
df_probes.head()
prb_id | ip_v4 | asn | description | lon | lat | country_name | country_code | ip_from | city | country | hostname | ip | loc | org | postal | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 239 | 196.1.95.16 | 8346 | UCAD Probe | -17.4415 | 14.6715 | Senegal | SN | 196.1.95.16 | SN | dkr-sn.probe.atlas.ucad.sn | 196.1.95.16 | 14.0000,-14.0000 | AS8346 SONATEL-AS Autonomous System | NaN | ||
1 | 242 | None | 328453 | 1 Belvedere | 18.4895 | -33.9815 | South Africa | ZA | 196.210.11.182 | Cape Town | ZA | 196-210-11-182.dynamic.isadsl.co.za | 196.210.11.182 | -33.9258,18.4232 | AS3741 Internet Solutions | 7945 | Western Cape |
2 | 446 | 196.192.112.229 | 37708 | AFRINIC Mauritius | 57.4995 | -20.2395 | Mauritius | MU | 196.192.112.229 | MU | p446.probes.atlas.ripe.net | 196.192.112.229 | -20.2833,57.5500 | AS37708 African Network Information Center - (... | NaN | ||
3 | 473 | 196.4.161.3 | 3741 | Internet Solutions [http://www.is.co.za] - Ros... | 28.0405 | -26.1515 | South Africa | ZA | 196.4.161.3 | ZA | ripe-ncc.is.co.za | 196.4.161.3 | -29.0000,24.0000 | AS3741 Internet Solutions | NaN | ||
4 | 567 | 193.95.66.40 | 2609 | ATI, BELVEDERE, 1G/s | 10.1805 | 36.8205 | Tunisia | TN | 193.95.66.40 | Tunis | TN | NaN | 193.95.66.40 | 36.8190,10.1658 | AS2609 Tunisia BackBone AS | NaN | Tūnis |
df_probes = df_probes[['prb_id', 'ip', 'asn', 'hostname', 'org', 'description', 'country_name', 'country_code', 'region', 'city', 'lon', 'lat']]
df_probes.head()
prb_id | ip | asn | hostname | org | description | country_name | country_code | region | city | lon | lat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 239 | 196.1.95.16 | 8346 | dkr-sn.probe.atlas.ucad.sn | AS8346 SONATEL-AS Autonomous System | UCAD Probe | Senegal | SN | -17.4415 | 14.6715 | ||
1 | 242 | 196.210.11.182 | 328453 | 196-210-11-182.dynamic.isadsl.co.za | AS3741 Internet Solutions | 1 Belvedere | South Africa | ZA | Western Cape | Cape Town | 18.4895 | -33.9815 |
2 | 446 | 196.192.112.229 | 37708 | p446.probes.atlas.ripe.net | AS37708 African Network Information Center - (... | AFRINIC Mauritius | Mauritius | MU | 57.4995 | -20.2395 | ||
3 | 473 | 196.4.161.3 | 3741 | ripe-ncc.is.co.za | AS3741 Internet Solutions | Internet Solutions [http://www.is.co.za] - Ros... | South Africa | ZA | 28.0405 | -26.1515 | ||
4 | 567 | 193.95.66.40 | 2609 | NaN | AS2609 Tunisia BackBone AS | ATI, BELVEDERE, 1G/s | Tunisia | TN | Tūnis | Tunis | 10.1805 | 36.8205 |
# Saving probes description as csv
utils.df_to_csv(df_probes, prefix_name='probes')
df_probes = pd.read_csv('data/probes-2019-08-23.csv')
# Joining/merging with country code to get full country name
data = pd.merge(df.reset_index(), df_probes, left_on='prb_id', right_on='prb_id', how='left')
SPEED_OF_LIGHT = 299792.458 # km/s
DIST = 3620 # km (more or less shortest African point from Dublin)
print("Light takes: ", int(1000*DIST / SPEED_OF_LIGHT), "ms to travel") # in ms
Light takes: 12 ms to travel
# To be defined
THRESHOLD_RTT = 30 # in ms
data[data['rtt'] < 30]['prb_id'].unique()
array([29158, 35742, 32674, 18514, 33985, 29991, 33290, 14867, 23538, 21610, 14958, 33090, 4274, 596, 15883, 35743, 31505, 13299, 35074])
data[data['rtt'] < 30]['country_name'].unique()
array(['Kenya', 'South Africa', 'Togo', 'Gambia', 'Cameroon', 'Congo (Democratic Republic of the)', 'Sudan', 'Tanzania, United Republic of', nan, 'Seychelles', 'Ghana', 'Botswana', 'Tunisia', 'Egypt'], dtype=object)
# Remove missing country names - though can be retrieved easily but those countries are
# not relevant to ous NRENs vs. others analysis as no identified NRENs in those countries.
data.dropna(subset=['country_name'], inplace=True)
# Dumping data for further use
utils.df_to_csv(data, prefix_name='measurements')
data = pd.read_csv('./data/measurements-2019-09-20.csv')
# How many measurements and columns
data.shape
(581762, 28)
# Quick descriptive statistics
data['rtt'].describe()
count 576684.000000 mean 167.272890 std 123.419625 min 0.585333 25% 139.629500 50% 170.072167 75% 183.718333 max 6292.648333 Name: rtt, dtype: float64
# Time range fetched
print("Measurements from: {} to {}".format(min(data['datetime']),max(data['datetime'])))
Measurements from: 2018-11-30 23:02:49 to 2018-12-30 22:54:10
alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE))\
.mark_bar()\
.encode(
alt.X("rtt:Q", bin=alt.Bin(step=100), title='Round-Trip-Time (ms)'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/all-rtts.png')
utils.show_chart('./img/all-rtts.png')
alt.Chart(data.sample(SAMPLE_SIZE))\
.mark_bar()\
.encode(
alt.X("rtt:Q", bin=alt.Bin(extent=[0, 600], step=20), title='Round-Trip-Time (ms)'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/rtts-focus.png')
utils.show_chart('./img/rtts-focus.png')
alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_boxplot().encode(
alt.X('rtt:Q',title='Round-Trip-Time (ms)'),
alt.Y(field = 'country_name', type = 'nominal',
sort=alt.EncodingSortField(field='rtt', op='max', order='ascending'),
title='Country'))\
.properties(
width = 0.8*WIDTH,
height = 550).save('./img/rtts-boxplot-by-country.png')
utils.show_chart('./img/rtts-boxplot-by-country.png')
Focusing on Interquartile range and median
Let's forget outliers and peaks for now and focus on Q1, Q2, Q3.
points = alt.Chart(data[data['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'country_name',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='Country'
)
).properties(
width=0.8*WIDTH,
height=600
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars).save('./img/rtts-iqr-by-country.png')
utils.show_chart('./img/rtts-iqr-by-country.png')
african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits')
# US states background
background = alt.Chart(african_countries).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
width=WIDTH,
height=500
).project('equirectangular')
# probes positions on background
points = alt.Chart(df_probes).mark_circle(
size=50,
color='steelblue'
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
(background + points).configure_view(stroke="transparent").save('./img/probes-locations.png')
utils.show_chart('./img/probes-locations.png')
data_valid = data[data['measurement_failed'] == False]
data_valid['packets_lost'] = data_valid['nb_packets'] - data_valid['nb_rtts']
def perc_loss(df):
return 100 * sum(df['packets_lost']) / sum(df['nb_packets'])
print("Percentage of packets lost: {:.2f}".format(perc_loss(data_valid)))
Percentage of packets lost: 1.99
packet_loss_country = data_valid.groupby(['country_name'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
.sort_values(by='% of packet loss', ascending=False)
print(packet_loss_country)
% of packet loss country_name Congo (Democratic Republic of the) 27.323449 Egypt 12.523072 Kenya 4.948249 Ethiopia 3.988362 Algeria 2.344592 Morocco 1.936620 South Africa 1.932597 Sudan 1.902569 Cameroon 1.687548 Madagascar 1.372378 Senegal 1.276268 Zimbabwe 1.048065 Tunisia 0.942401 Zambia 0.883190 Togo 0.830149 Burundi 0.807505 Benin 0.537149 Uganda 0.508824 Tanzania, United Republic of 0.216664 Rwanda 0.147547 Gambia 0.146935 Congo 0.138889 Ghana 0.077187 Malawi 0.077160 Angola 0.058377 South Sudan 0.058092 Mauritius 0.035770 Nigeria 0.027006 Djibouti 0.025621 Mozambique 0.025077 Eswatini 0.023148 Botswana 0.011575 Seychelles 0.000000 Lesotho 0.000000
alt.Chart(packet_loss_country.reset_index()).mark_bar().encode(
x=alt.X('% of packet loss', title='Packet loss (%)'),
y=alt.Y(
'country_name:N',
sort=alt.EncodingSortField(
field='% of packet loss',
order='descending'
),
title='Country'
)
).properties(
width=0.8*WIDTH,
height=600
).save('./img/packet-loss-per-country.png')
utils.show_chart('./img/packet-loss-per-country.png')
For further information on NRENS: https://en.wikipedia.org/wiki/National_research_and_education_network
To identify the African NRENs, the following resources have been used:
nren_probes = [15328, 30177, 13114, 13218, 12344, 14712, 19592, 6369, 33838, 33284, 3461, 13711, 21673]
print(str(len(nren_probes)) + ' NRENs probes identified.')
13 NRENs probes identified.
data['is_nren'] = data['prb_id'].isin(nren_probes)
data.head(2)
datetime | prb_id | ip_from | paris_id | start_time | end_time | last_time_sync | firmware_version | nb_packets | nb_rtts | ... | hostname | org | description | country_name | country_code | region | city | lon | lat | is_nren | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-11-30 23:02:49 | 35067 | 41.248.208.162 | 8 | 1543618969 | 1543618970 | 204 | 4780 | 3.0 | 3.0 | ... | NaN | AS36903 Office National des Postes et Telecomm... | Rue Bruxelles | Morocco | MA | Rabat-Salé-Kénitra | Rabat | -6.2395 | 33.8205 | False |
1 | 2018-11-30 23:02:55 | 21682 | 169.0.103.214 | 3 | 1543618975 | 1543618976 | 8 | 4940 | 3.0 | 3.0 | ... | 169-0-103-214.ip.afrihost.co.za | AS37611 Afrihost (Pty) Ltd | Strand | South Africa | ZA | Western Cape | Cape Town | 18.8315 | -34.1015 | False |
2 rows × 28 columns
df_probes['is_nren'] = df_probes['prb_id'].isin(nren_probes)
african_countries = alt.topo_feature(AFRICA_TOPOJSON_URL, feature='continent_Africa_subunits')
# US states background
background = alt.Chart(african_countries).mark_geoshape(
fill='lightgray',
stroke='white'
).properties(
width=WIDTH,
height=500
).project('equirectangular')
# probes positions on background
points_not_nren = alt.Chart(df_probes[df_probes['is_nren'] == False]).mark_circle(
size=30, fill='steelblue', stroke='white', strokeWidth=0.5, opacity=0.8
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
points_nren = alt.Chart(df_probes[df_probes['is_nren'] == True]).mark_point(
size=300, fill='#c44e51', opacity=1, shape='triangle-up'
).encode(
longitude='lon:Q',
latitude='lat:Q'
)
# NREN probes in red and non NREN probes in blue
(background + points_nren + points_not_nren)\
.configure_view(stroke="transparent")\
.save('./img/probes-locations-nrens.png')
utils.show_chart('./img/probes-locations-nrens.png')
countries_nren = list(data[data['is_nren'] == True]['country_code'].unique())
data_nren = data[data['country_code'].isin(countries_nren)]
data_nren[data_nren['rtt'] > THRESHOLD_RTT]\
.groupby(['country_name', 'is_nren'])\
.size()
country_name is_nren Algeria False 5180 True 5748 Kenya False 25262 True 5756 Madagascar False 5748 True 5571 Morocco False 8540 True 2808 South Africa False 229449 True 4316 Sudan False 4121 True 2713 Tanzania, United Republic of False 11636 True 2837 Zambia False 10790 True 2826 dtype: int64
print("Percentage of NRENs measurements: {:.2f}".format(100 * sum(data_nren['is_nren'] == True) / len(data_nren)))
Percentage of NRENs measurements: 9.38
points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'is_nren',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='NREN membership'
)
).properties(
width=0.8*WIDTH,
height=100
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars).save('./img/rtt-nrens-vs-others.png')
utils.show_chart('./img/rtt-nrens-vs-others.png')
points = alt.Chart(data_nren[data_nren['rtt'] > THRESHOLD_RTT].sample(SAMPLE_SIZE)).mark_point(
filled=True,
color='steelblue',
size=80
).encode(
x=alt.X('median(rtt)', title='Round-Trip-Time (ms)'),
y=alt.Y(
'is_nren',
sort=alt.EncodingSortField(
field='rtt',
op='median',
order='ascending'
),
title='',
axis=None
),
color=alt.Color('is_nren', legend=None, scale=alt.Scale(
domain=['true', 'false'],
range=['#c44e51','steelblue']))
).properties(
width=0.8*WIDTH,
height=80
)
error_bars = points.mark_rule(size=1.5, color='steelblue').encode(
x='q1(rtt)',
x2='q3(rtt)',
)
(points + error_bars)\
.facet(
row='country_name:N')\
.configure_axis(
labelFontSize=15,\
titleFontSize=15,\
gridDash=[4,4]\
)\
.configure_header(
titleFontSize=0,\
labelFontSize=15,\
labelLimit=80,\
title=None
)\
.save('./img/rtt-nrens-vs-others-by-country.png')
utils.show_chart('./img/rtt-nrens-vs-others-by-country.png')
data_nren_valid = data_nren[data_nren['measurement_failed'] == False]
data_nren_valid['packets_lost'] = data_nren_valid['nb_packets'] - data_nren_valid['nb_rtts']
packet_loss_nren_country = data_nren_valid.groupby(['is_nren'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
.sort_values(by='% of packet loss', ascending=False)
print(packet_loss_nren_country)
% of packet loss is_nren False 2.231209 True 0.818880
packet_loss_per_nren_per_country = data_nren_valid.groupby(['country_name', 'is_nren'])\
.apply(perc_loss)\
.to_frame(name='% of packet loss')\
packet_loss_per_nren_per_country
% of packet loss | ||
---|---|---|
country_name | is_nren | |
Algeria | False | 4.107719 |
True | 0.700231 | |
Kenya | False | 5.917119 |
True | 0.109954 | |
Madagascar | False | 0.416667 |
True | 2.341549 | |
Morocco | False | 2.188217 |
True | 1.163611 | |
South Africa | False | 1.955328 |
True | 0.690290 | |
Sudan | False | 2.535381 |
True | 0.613422 | |
Tanzania, United Republic of | False | 0.259367 |
True | 0.000000 | |
Zambia | False | 1.030548 |
True | 0.317759 |
alt.Chart(packet_loss_per_nren_per_country.reset_index())\
.mark_bar(height=20)\
.encode(
x=alt.X('% of packet loss', title='Packet loss (%)'),
y=alt.Y('is_nren:N', title='', axis=None),
color=alt.Color('is_nren', legend=None, scale=alt.Scale(domain=['true', 'false'], range=['#c44e51','steelblue'])),
row=alt.Row('country_name:N', sort=['Kenya', 'Algeria', 'Sudan', 'Morocco', 'South Africa',\
'Zambia', 'Madagascar', 'Tanzania, United Republic of']))\
.properties(
width=0.8*WIDTH,
height=70)\
.configure_axis(
labelFontSize=15,\
titleFontSize=15,\
gridDash=[4,4]\
)\
.configure_header(
titleFontSize=0,\
labelFontSize=15,\
labelLimit=80,\
title=None
).save('./img/packet-loss-nrens-vs-others-by-country.png')
utils.show_chart('./img/packet-loss-nrens-vs-others-by-country.png')
H0: there is no difference of mean of % of packet loss between NRENs vs. non-NRENS measurements.
# Observed difference of % of packet loss between NRENs and not-NRENs
observed_diff = packet_loss_nren_country.diff().values[1][0]
print(observed_diff)
-1.4123289321193973
Let's first simulate that HO Null hypothesis:
nren_idx = data_nren_valid[data_nren_valid['is_nren'] == True].index.values
# Sample non NREN measurements to get the same number of values as NRENs one as 10x more
not_nren_idx = data_nren_valid[data_nren_valid['is_nren'] == False].sample(len(nren_idx)).index.values
both_idx = np.concatenate((nren_idx, not_nren_idx))
nb_replicates = 10000
replicates = np.zeros(nb_replicates)
for i in range(nb_replicates):
both_perm_idx = np.random.permutation(both_idx)
perm_sample_nren_idx = both_perm_idx[:len(nren_idx)]
perm_sample_not_nren_idx = both_perm_idx[len(not_nren_idx):]
replicates[i] = perc_loss(data_nren_valid.loc[perm_sample_nren_idx,])-\
perc_loss(data_nren_valid.loc[perm_sample_not_nren_idx,])
alt.Chart(pd.DataFrame({'x': list(replicates)}))\
.mark_bar()\
.encode(
alt.X("x:Q", title='difference of mean of packet loss between NRENS and not-NRENS under simulated H0'),
y='count()')\
.properties(
width = WIDTH,
height = 200)\
.save('./img/estimate-diff-percentage-packet-loss.png')
utils.show_chart('./img/estimate-diff-percentage-packet-loss.png')
What's the probability of obtaining a value of our test statistic (difference of mean of packet loss) that is at least as extreme as what was observed, under the assumption the null hypothesis is true?
print(100 * sum(replicates <= observed_diff) / len(replicates), '%')
0.0 %