1 Data Inspection

In [1]:
# Python libs
import warnings
import joblib
from pathlib import Path
from math import ceil
import numpy as np
import pandas as pd
from scipy import signal

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Signal processing
import librosa
import librosa.display
import pywt
from nnAudio.features.cqt import CQT1992v2
import torch

# Module settings
#mpl.rc("figure", dpi=144)
sns.set()
#pd.set_option('display.max_columns', None)  # show all columns
In [2]:
# Input locations, all relative paths rooted at DATA_INPUT_DIR.
DATA_INPUT_DIR = "data/"
TRAIN_DIR = f"{DATA_INPUT_DIR}train/"
TEST_DIR = f"{DATA_INPUT_DIR}test/"
TRAIN_LABELS_CSV = f"{DATA_INPUT_DIR}training_labels.csv"
In [3]:
# Read the training labels CSV into a DataFrame and show it as the cell output.
labels = pd.read_csv(TRAIN_LABELS_CSV)
labels
Out[3]:
id target
0 00000e74ad 1
1 00001f4945 0
2 0000661522 0
3 00007a006a 0
4 0000a38978 1
... ... ...
559995 ffff9a5645 1
559996 ffffab0c27 0
559997 ffffcf161a 1
559998 ffffd2c403 0
559999 fffff2180b 0

560000 rows × 2 columns

In [4]:
# Draw the class-balance bar chart first, then leave the counts as the cell's
# last expression so they are actually rendered as the cell output (a bare
# expression in the middle of a cell is evaluated and silently discarded).
_ = sns.countplot(x='target', data=labels)
labels['target'].value_counts()
Out[4]:
0    280070
1    279930
Name: target, dtype: int64
  • There are 560000 training time series.
  • There are 280070 non-GW series and 279930 GW series, almost equally distributed.
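  • Each sample holds three time series (LIGO Hanford, LIGO Livingston, Virgo) of 4096 points each, i.e. 2 s sampled at 2048 Hz.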
In [4]:
def plot_sample(X, sample_id: str, label: int, sample_rate: int = 2048):
    """Plot the three detector series of one sample with their value histograms.

    The first three rows show one detector each: the time trace on the left and
    a histogram (with KDE) of its values on the right. The last row overlays the
    three traces. Every per-detector panel is given the y-limits of the overlay
    so amplitudes can be compared across detectors.

    Parameters
    ----------
    X : array-like
        Indexable by detector: ``X[0]`` is plotted as LIGO Hanford, ``X[1]`` as
        LIGO Livingston and ``X[2]`` as Virgo. All rows are plotted against the
        same time axis, so they must have the same length.
    sample_id : str
        Identifier shown in the figure title.
    label : int
        Target value shown in the figure title.
    sample_rate : int, default 2048
        Samples per second, used to convert sample indices to seconds.
    """
    detector_names = ("LIGO Hanford", "LIGO Livingston", "Virgo")
    palette = sns.color_palette()

    fig = plt.figure(figsize=(15, 8))
    grid = mpl.gridspec.GridSpec(4, 2, width_ratios=[4, 1], hspace=0.5)
    # Derive the time axis from the data instead of assuming 4096 samples.
    time = np.arange(len(X[0])) / sample_rate

    detector_axes = []
    for row, name in enumerate(detector_names):
        color = palette[row]

        trace_ax = fig.add_subplot(grid[row, 0])
        trace_ax.set(xlabel='', title=name)
        trace_ax.plot(time, X[row], c=color)

        # Values go on the y axis so the histogram lines up with the trace.
        hist_ax = fig.add_subplot(grid[row, 1])
        hist_ax.set(xlabel='')
        sns.histplot(y=X[row], ax=hist_ax, kde=True, color=color)

        detector_axes += [trace_ax, hist_ax]

    overlay_ax = fig.add_subplot(grid[3, 0])
    overlay_ax.set(xlabel='Time (s)')
    for row, name in enumerate(detector_names):
        overlay_ax.plot(time, X[row], label=name)
    # Attach the legend to the overlay axes explicitly rather than relying on
    # pyplot's notion of the "current" axes.
    overlay_ax.legend(loc='right', bbox_to_anchor=(1.3, 0.5))

    # The overlay spans the range of all three series; reuse it everywhere.
    ylim = overlay_ax.get_ylim()
    for ax in detector_axes:
        ax.set(ylim=ylim)

    fig.suptitle(f"{sample_id}, label={label}", y=0.94)
In [5]:
def get_random_sample_ids(labels, n: int, rng=None):
    """Draw ``n`` random sample ids for every distinct ``target`` value.

    Ids are drawn without replacement, so a draw never contains the same id
    twice. Only when a target group has fewer than ``n`` rows does the draw
    fall back to sampling with replacement, so the result always has ``n``
    entries per target.

    Parameters
    ----------
    labels : pandas.DataFrame
        Table with an ``id`` column and a ``target`` column.
    n : int
        Number of ids to draw per target value.
    rng : numpy.random.Generator, optional
        Random generator to draw from, for reproducible results. When omitted,
        NumPy's global random state is used (so ``np.random.seed`` still applies).

    Returns
    -------
    dict
        Maps each target value to a NumPy array of ``n`` ids.
    """
    chooser = np.random if rng is None else rng
    sample_ids = {}
    for label, group in labels.groupby('target'):
        ids = group['id'].to_numpy()
        sample_ids[label] = chooser.choice(ids, n, replace=len(ids) < n)
    return sample_ids

def get_data_by_sample_id(sample_id: str, train_or_test: str='train'):
    """Load the ``.npy`` array stored for ``sample_id``.

    Files are nested three directories deep, one level per leading character
    of the id: ``<DATA_INPUT_DIR><train_or_test>/<c0>/<c1>/<c2>/<sample_id>.npy``.

    Parameters
    ----------
    sample_id : str
        Sample identifier; its first three characters select the sub-directories.
    train_or_test : str, default 'train'
        Name of the directory under ``DATA_INPUT_DIR`` to read from.

    Returns
    -------
    The array returned by ``np.load`` for that file.
    """
    first, second, third = sample_id[0], sample_id[1], sample_id[2]
    file_path = f"{DATA_INPUT_DIR}{train_or_test}/{first}/{second}/{third}/{sample_id}.npy"
    return np.load(file_path)

def get_label_by_sample_id(sample_id, labels):
    """Return the ``target`` value recorded for ``sample_id``.

    The lookup uses a boolean mask rather than a string-built ``query``
    expression, so ids containing quotes or other expression syntax are
    compared literally instead of being parsed as code.

    Parameters
    ----------
    sample_id : str
        Identifier to look up in the ``id`` column.
    labels : pandas.DataFrame
        Table with an ``id`` column and a ``target`` column.

    Returns
    -------
    The ``target`` value of the first row whose ``id`` equals ``sample_id``.

    Raises
    ------
    IndexError
        If no row has that id.
    """
    matches = labels.loc[labels['id'] == sample_id, 'target']
    if matches.empty:
        raise IndexError(f"sample id {sample_id!r} not found in labels")
    return matches.iloc[0]
In [60]:
# Draw 3 random sample ids per target value and plot every drawn sample.
sample_ids = get_random_sample_ids(labels, n=3)
for label in (0, 1):
    for sample_id in sample_ids[label]:
        X = get_data_by_sample_id(sample_id, 'train')
        # The label is looked up from the table again rather than reusing the loop variable.
        y = get_label_by_sample_id(sample_id, labels)
        plot_sample(X, sample_id, label=y)
In [8]:
# Draw a fresh set of 3 random ids per target value and show them as the cell output.
sample_ids = get_random_sample_ids(labels, n=3)
sample_ids
Out[8]:
{0: array(['d351acade4', 'dd3eaf0c8d', 'd64eae144c'], dtype=object),
 1: array(['653e8bb6c3', '11661508e9', '702cf16e0b'], dtype=object)}
In [20]:
# Compare the power spectral density of one label-0 and one label-1 sample,
# one panel per detector. `psd` expects a 1-D series: given the whole
# per-sample array (assumed here to be detectors x samples, as plot_sample
# indexes it) it sees a signal of length 3, zero-pads it up to NFFT and the
# resulting spectrum is meaningless. Each detector row is therefore passed on
# its own.
psd_samples = {
    '0': get_data_by_sample_id('d351acade4'),
    '1': get_data_by_sample_id('653e8bb6c3'),
}
psd_fig, psd_axes = plt.subplots(1, 3, figsize=(18, 4), sharey=True)
for psd_row, psd_name in enumerate(("LIGO Hanford", "LIGO Livingston", "Virgo")):
    psd_ax = psd_axes[psd_row]
    for psd_label, psd_data in psd_samples.items():
        _ = psd_ax.psd(psd_data[psd_row], NFFT=4096, Fs=2048, label=psd_label)
    _ = psd_ax.set(title=psd_name)
    _ = psd_ax.legend()

2.1 Spectrograms by STFT

In [45]:
def plot_spectrograms(sample_X, sample_y, sample_id,
                      sites=("LIGO Hanford", "LIGO Livingston", "Virgo"),
                      sample_rate: int = 2048):
    """Plot one STFT spectrogram (in dB) per detector for a single sample.

    Parameters
    ----------
    sample_X : array-like
        Indexable by detector; ``sample_X[i]`` is the 1-D series drawn in the
        panel titled ``sites[i]``.
    sample_y
        Target value shown in the figure title.
    sample_id
        Identifier shown in the figure title.
    sites : sequence of str, default the three detector names
        Panel titles, paired by position with the rows of ``sample_X``. Passed
        explicitly so the function no longer depends on a global defined in
        another cell.
    sample_rate : int, default 2048
        Sampling rate handed to ``specshow`` to label the time/frequency axes.
    """
    fig, axes = plt.subplots(1, len(sites), figsize=(18, 5))
    fig.suptitle(f"{sample_id}, label={sample_y}")
    # np.atleast_1d keeps the loop valid when a single site yields a bare Axes.
    for ax, site, X in zip(np.atleast_1d(axes), sites, sample_X):
        # Scale by the series maximum before the STFT; presumably so the fixed
        # vmin/vmax colour range below is comparable across samples.
        X = X / X.max()
        X_stft_db = librosa.amplitude_to_db(abs(librosa.stft(X)))
        # Draw onto this panel's axes explicitly and keep the returned mesh so
        # the colorbar is attached to the right panel.
        mesh = librosa.display.specshow(
            X_stft_db, sr=sample_rate, x_axis='time', y_axis='log',
            vmin=-30, vmax=50, ax=ax,
        )
        ax.set(title=site)
        fig.colorbar(mesh, ax=ax)
In [48]:
# Draw 3 random sample ids per target value and plot the spectrograms of each.
sample_ids = get_random_sample_ids(labels, n=3)

# Detector names, paired by position with the rows of each sample array.
sites = "LIGO Hanford", "LIGO Livingston", "Virgo"
for label in (0, 1):
    for sample_id in sample_ids[label]:
        sample_X = get_data_by_sample_id(sample_id, 'train')
        sample_y = get_label_by_sample_id(sample_id, labels)
        plot_spectrograms(sample_X, sample_y, sample_id)