Machine Learning for Time Series (Master MVA)
Import
import re
from math import asin, cos, radians, sin, sqrt
import contextily as cx
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from loadmydata.load_molene_meteo import load_molene_meteo_dataset
from matplotlib.dates import DateFormatter
from pygsp import graphs
from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
CRS = "EPSG:4326"
STATION_LIST = [
"ARZAL",
"AURAY",
"BELLE ILE-LE TALUT",
"BIGNAN",
"BREST-GUIPAVAS",
"BRIGNOGAN",
"DINARD",
"GUERANDE",
"ILE DE GROIX",
"ILE-DE-BREHAT",
"KERPERT",
"LANDIVISIAU",
"LANNAERO",
"LANVEOC",
"LORIENT-LANN BIHOUE",
"LOUARGAT",
"MERDRIGNAC",
"NOIRMOUTIER EN",
"OUESSANT-STIFF",
"PLEUCADEUC",
"PLEYBER-CHRIST SA",
"PLOERMEL",
"PLOUDALMEZEAU",
"PLOUGUENAST",
"PLOUMANAC'H",
"POMMERIT-JAUDY",
"PONTIVY",
"PTE DE CHEMOULIN",
"PTE DE PENMARCH",
"PTE DU RAZ",
"QUIMPER",
"QUINTENIC",
"ROSTRENEN",
"SAINT-CAST-LE-G",
"SARZEAU SA",
"SIBIRIL S A",
"SIZUN",
"SPEZET",
"ST BRIEUC",
"ST NAZAIRE-MONTOIR",
"ST-SEGAL S A",
"THEIX",
"VANNES-SENE",
]
Utility functions
def fig_ax(figsize=(15, 3)):
return plt.subplots(figsize=figsize)
def get_line_graph(n_nodes=10) -> graphs.Graph:
"""Return a line graph."""
adjacency_matrix = np.eye(n_nodes)
adjacency_matrix = np.c_[adjacency_matrix[:, -1], adjacency_matrix[:, :-1]]
adjacency_matrix += adjacency_matrix.T
line_graph = graphs.Graph(adjacency_matrix)
line_graph.set_coordinates(kind="ring2D")
line_graph.compute_laplacian("combinatorial")
return line_graph
def get_grid_graph(n_nodes_height=10, n_nodes_width=10) -> graphs.Graph:
"""Return a 2D grid graph."""
g = graphs.Grid2d(n_nodes_height, n_nodes_width)
xx, yy = np.meshgrid(np.arange(n_nodes_height), np.arange(n_nodes_width))
coords = np.array((xx.ravel(), yy.ravel())).T
g.set_coordinates(coords)
g.compute_laplacian("combinatorial")
return g
def dms2dd(s):
"""Convert longitude and latitude strings to float."""
# https://stackoverflow.com/a/50193328
# example: s = """48°51'18"""
degrees, minutes, seconds = re.split("[°'\"]+", s[:-1])
direction = s[-1]
dd = float(degrees) + float(minutes) / 60 + float(seconds) / (60 * 60)
if direction in ("S", "W"):
dd *= -1
return dd
def get_geodesic_distance(point_1, point_2) -> float:
"""
Calculate the great circle distance (in km) between two points
on the earth (specified in decimal degrees)
https://stackoverflow.com/a/4913653
"""
lon1, lat1 = point_1
lon2, lat2 = point_2
# convert decimal degrees to radians
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
# haversine formula
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
c = 2 * asin(sqrt(a))
r = 6371 # Radius of earth in kilometers. Use 3956 for miles
return c * r
def get_exponential_similarity(condensed_distance_matrix, bandwidth, threshold):
exp_similarity = np.exp(-(condensed_distance_matrix**2) / bandwidth / bandwidth)
res_arr = np.where(exp_similarity > threshold, exp_similarity, 0.0)
return res_arr
A graph $G$ is a set of $N$ nodes connected with edges. A graph signal is a $\mathbb{R}^N$ vector that is supported by the nodes of the graph $G$.
Graph Signal Processing (GSP) is the set of methods of methods to study such objects.
Let us illustrate the basic principles of GSP on two toy graphs: the line graph and the 2D grid graph.
line_graph = get_line_graph(50) # 50 nodes
line_graph.plot()
grid_graph = get_grid_graph(20, 20) # 20 by 20 grid
grid_graph.plot()
We can now generate noisy signals on those two graphs.
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# generate a noisy sinusoid
tt = np.linspace(0, 6 * np.pi, line_graph.N)
signal_line = 5 * np.sin(tt) + np.random.normal(size=line_graph.N)
# plot
line_graph.plot_signal(signal_line, ax=ax_0)
ax_1.plot(signal_line)
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# generate a noisy sinusoid
x = np.linspace(-2 * np.pi, 2 * np.pi, 20)
y = np.linspace(-2 * np.pi, 2 * np.pi, 20)
xx, yy = np.meshgrid(x, y, sparse=True)
z = 5 * np.sin(xx + yy)
z += np.random.normal(size=z.shape)
signal_grid = z.flatten()
# plot
ax_0.contourf(x, y, z)
grid_graph.plot_signal(signal_grid, ax=ax_1)
Question
Recall the definition of the Laplacian matrix.
Question
Compute the Laplacian matrix for both the line graph and the grid graph.
Verify your result with the Laplacian matrix provided by the Graph class (available in g.L.todense() for a graph g).
In the GSP setting, the Fourier transform derives from the Laplacian $L$ eigendecomposition:
$$ L = U D U^T $$where $U$ contains (orthonormal) eigenvectors $u_i$ and $D$ is a diagonal matrix containing the eigenvalues.
For a graph signal $f$, the associated Fourier transform $\hat{f}$ is given by:
$$ \hat{f}:=U^T f. $$To illustre this definition, we can compute the Fourier basis on the two graph examples.
On the line graph, we compute and display the eigenvalues and the first eigenvectors.
# Laplacian eigendecomposition
eigenvals_line, eigenvects_line = eigh(line_graph.L.todense())
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax_0.plot(range(1, eigenvals_line.size + 1), eigenvals_line, "-*")
ax_0.set_xlabel("Eigenvalue number")
ax_0.set_ylabel("Eigenvalue")
ax_0.set_title("Line graph")
for k in range(5):
ax_1.plot(eigenvects_line[:, 2 * k], label=f"Eigenvector {2*k+1}")
_ = plt.legend()
Question
What do you observe on the shape of the eigenvectors?
Question
For the grid graph, compute and display the first and last eigenvectors.
Question
Visually, which one the two eigenvectors is the smoothest?
Recall the definition of a graph signal smoothness.
Question
Compute and plot the smoothness of each eigenvector of the Laplacian (of the grid graph).
What do you observe? Is this expected?
Using the Fourier basis, we can now compute the Fourier transform of each signal.
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
# Fourier transform
signal_line_fourier = eigenvects_line.T.dot(signal_line)
signal_grid_fourier = eigenvects_grid.T.dot(signal_grid)
# plot
ax_0.plot(abs(signal_line_fourier), "*-")
ax_0.set_title("Fourier transform (signal on the line graph)")
ax_1.plot(abs(signal_grid_fourier), "*-")
_ = ax_1.set_title("Fourier transform (signal on the grid graph)")
Question
Given the Fourier representation of a signal, how can we recover the original signal?
Since there is a frequency representation, we can filter the signals, as in the classical signal processing setting. For instance, let us set to 0 all Fourier coefficients above a certain cut-off frequency.
# we keep only 20% of the Fourier coefficients
# filtering the line graph signal
fourier_transform_filtered = np.zeros(signal_line_fourier.size)
fourier_transform_filtered[: signal_line_fourier.size // 5] = signal_line_fourier[
: signal_line_fourier.size // 5
]
signal_line_filtered = eigenvects_line.dot(fourier_transform_filtered)
# plot
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax_0.plot(abs(signal_line_fourier), "*-", label="Original Fourier transform")
ax_0.plot(
abs(fourier_transform_filtered),
"*-",
label="Filtered Fourier transform",
alpha=0.5,
)
ax_0.set_title("Fourier transform (signal on the line graph)")
# filtering the grid graph signal
fourier_transform_filtered = np.zeros(signal_grid_fourier.size)
fourier_transform_filtered[: signal_grid_fourier.size // 5] = signal_grid_fourier[
: signal_grid_fourier.size // 5
]
signal_grid_filtered = eigenvects_grid.dot(fourier_transform_filtered)
# plot
ax_1.plot(abs(signal_grid_fourier), "*-", label="Original Fourier transform")
ax_1.plot(
abs(fourier_transform_filtered),
"*-",
label="Filtered Fourier transform",
alpha=0.5,
)
_ = ax_1.set_title("Fourier transform (signal on the grid graph)")
_ = plt.legend()
We then reconstruct the filtered Fourier transform.
fig, (ax_0, ax_1) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
line_graph.plot_signal(signal_line_filtered, ax=ax_0)
ax_0.set_title("Reconstruction")
grid_graph.plot_signal(signal_grid_filtered, ax=ax_1)
_ = ax_1.set_title("Reconstruction")
fig, ax = fig_ax()
ax.plot(signal_line, label="Original signal")
ax.plot(signal_line_filtered, label="Filtered signal")
_ = plt.legend()
We can use this procedure to compress signals that are supported on arbitrary graphs.
Let us illustrate a few GSP methods on a real-world data set.
data_df, stations_df, description = load_molene_meteo_dataset()
print(description)
# only keep a subset of stations
keep_cond = stations_df.Nom.isin(STATION_LIST)
stations_df = stations_df[keep_cond]
keep_cond = data_df.station_name.isin(STATION_LIST)
data_df = data_df[keep_cond].reset_index().drop("index", axis="columns")
# convert temperature from Kelvin to Celsius
data_df["temp"] = data_df.t - 273.15 # temperature in Celsius
# convert pandas df to geopandas df
stations_gdf = geopandas.GeoDataFrame(
stations_df,
geometry=geopandas.points_from_xy(stations_df.Longitude, stations_df.Latitude),
).set_crs(CRS)
stations_gdf.head()
data_df.head()
Pivot the table. We now have a multivariate time serie.
temperature_df = data_df.pivot(index="date", values="temp", columns="station_name")
temperature_df.head()
Plot the position of the grounds stations on a map.
ax = stations_gdf.geometry.plot(figsize=(10, 10))
cx.add_basemap(ax, crs=stations_gdf.crs.to_string(), zoom=8)
ax.set_axis_off()
We can start by checking for some malfunctions in the stations. To that end, we simply count the number of NaNs.
temperature_df.isna().sum(axis=0).sort_values(ascending=False).head()
After this, we can look at the (geodesic) distance between stations.
stations_np = stations_df[["Longitude", "Latitude"]].to_numpy()
dist_mat_condensed = pdist(stations_np, metric=get_geodesic_distance)
dist_mat_square = squareform(dist_mat_condensed)
fig, ax = fig_ax((10, 8))
_ = sns.heatmap(
dist_mat_square,
xticklabels=stations_df.Nom,
yticklabels=stations_df.Nom,
ax=ax,
)
Question
What are the two closest stations?
Question
What are the two most distant stations?
We can plot the temperature evolution for the two closest stations.
Question
Plot the temperature evolution for the two closest stations.
Question
Do the same for the two most distant stations.
This network of sensors can be modeled as a graph, and the temperature signal, as a serie of graph signals.
To that end, we need to define a graph.
Question
Give two ways to derive an adjacency matrix from a distance matrix?
Question
The number of connected components of a graph is the multiplicity of the eigenvalue 0 of the Laplacian matrix. Can you explain why? Write a function `is_connected` which returns True if the input graph is connected and False otherwise.
def is_connected(graph) -> bool:
...
threshold = 1 # km
adjacency_matrix = squareform((dist_mat_condensed < threshold).astype(int))
G = graphs.Graph(adjacency_matrix)
print(
f"The graph is {'not ' if not is_connected(G) else ''}connected, with {G.N} nodes, {G.Ne} edges"
)
ax = stations_gdf.geometry.plot(figsize=(10, 10))
cx.add_basemap(ax, crs=stations_gdf.crs.to_string(), zoom=8)
ax.set_axis_off()
G.set_coordinates(stations_np)
G.plot(ax=ax)
Question
Plot the number of edges with respect to the threshold.
What is approximatively the lowest threshold possible in order to have a connected graph?
Question
Plot the average degree with respect to the threshold.
Each vertex can be connected to other vertices by edges weighted by a Gaussian kernel: $$ W_{ij} = \exp\left(-\frac{\|c_i-c_j\|^2}{\sigma^2}\right) \quad\text{if}\quad \exp\left(-\frac{\|c_i-c_j\|^2}{\sigma^2}\right)>\lambda,\ 0\ \text{otherwise} $$ where the $c_i$ are the station positions, $\sigma$ is the so-called bandwidth parameter, and $\lambda>0$ is a threshold.
sigma = np.median(dist_mat_condensed) # median heuristic
threshold = 0.85
adjacency_matrix_gaussian = squareform(
get_exponential_similarity(dist_mat_condensed, sigma, threshold)
)
G_gaussian = graphs.Graph(adjacency_matrix_gaussian)
print(
f"The graph is {'not ' if not is_connected(G_gaussian) else ''}connected, with {G_gaussian.N} nodes, {G_gaussian.Ne} edges"
)
ax = stations_gdf.geometry.plot(figsize=(10, 10))
cx.add_basemap(ax, crs=stations_gdf.crs.to_string(), zoom=8)
ax.set_axis_off()
G_gaussian.set_coordinates(stations_np)
G_gaussian.plot(ax=ax)
Question
What is the influence of the threshold.
Choose an appropriate value for the threshold.
The correlation between the signals can also define a graph.
_ = sns.heatmap(temperature_df.corr())
Question
Describe how to create a graph using the signal correlation. Code it.
ax = stations_gdf.geometry.plot(figsize=(10, 10))
cx.add_basemap(ax, crs=stations_gdf.crs.to_string(), zoom=8)
ax.set_axis_off()
G_corr.set_coordinates(stations_np)
G_corr.plot(ax=ax)
Note that stations that are very far apart can be connected. This unveils a different structure.
In this section, we set the graph to the one that comes from the Gaussian kernel.
Let us study the signal smoothness, at each hour of measure.
# drop the NaNs
temperature_df_no_nan = temperature_df.dropna(axis=0, how="any")
Question
Compute the smoothness for a specific hour of measure.
# choose a specific hour
choosen_hour = pd.to_datetime("2014-01-01 01:00:00")
Question
Display the smoothess evolution with time.
This displays interesting patterns.
Question
Show the state of the network, when the signal is the smoothest.
Question
Do the same, when the signal is the least smooth.