This notebook was used to scrape all the files of interest from the SLF archive website. We first define some utility functions to extract the directory hierarchy from the HTML pages. We then filter the files we are interested in and dump their URLs into a file. Finally, we download each file using a small multithreaded Python script.
import requests
from bs4 import BeautifulSoup
import json
from typing import Dict, List
import pandas as pd
%matplotlib inline
base_url = 'https://www.slf.ch/'
archive_parent = 'fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'
def extract_folders(url: str) -> Dict[str, str]:
    """Return a mapping from folder name to folder URL for one archive page."""
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    folders = [folder.find('a') for folder in soup.findAll(class_='folder')]
    # the folder name is the link's third child node; keep only the text before the first comma
    return {str(folder.contents[2]).split(',')[0].strip(): folder['href'] for folder in folders}
def extract_folders_rec(url: str, max_level: int = 3, curr_level: int = 0):
full_url = base_url + url
subfolders = extract_folders(full_url)
if curr_level < max_level:
for subfolder in subfolders:
subfolder_url = subfolders[subfolder]
subfolders[subfolder] = extract_folders_rec(subfolder_url, max_level, curr_level + 1)
return subfolders
# show first level of archive
hierarchy = extract_folders_rec(archive_parent, max_level=1)  # extract_folders_rec already prepends base_url
hierarchy = [(folder, subfolder, subfolder_url) for folder, content in hierarchy.items() for subfolder, subfolder_url in content.items()]
hierarchy = pd.DataFrame(hierarchy, columns=['folder', 'subfolder', 'url'])
hierarchy = hierarchy.set_index(['folder', 'subfolder'])
hierarchy
The following functions filter which folders and files to extract, using these rules: when a folder offers several language versions, keep only one, preferring English over French over German; drop regional bulletins, icon folders, and snowpack stability (Schneedeckenstabilität) folders; and among files, keep only the non black-and-white ('bw') variants, except for the textual bw.txt bulletins. A small made-up example after the filter definitions below illustrates these rules.
def folders_filter(folders: Dict[str, str]) -> Dict[str, str]:
# language picking
if 'en' in folders:
return {'en': folders['en']}
if 'fr' in folders:
return {'fr': folders['fr']}
if 'de' in folders:
return {'de': folders['de']}
new_folders = folders.copy()
    for key in folders:
        # drop regional bulletins, icon folders and snowpack stability (Schneedeckenstabilität) folders
        if 'regional' in key.lower() or 'régional' in key.lower():
            new_folders.pop(key)
        elif 'icône' in key.lower() or 'icone' in key.lower():
            new_folders.pop(key)
        elif 'Schneedeckenstabilität' in key:
            new_folders.pop(key)
return new_folders
def files_filter(files_url: List[str]) -> List[str]:
    # keep only files that are not black-and-white ('bw') variants,
    # except for the textual 'bw.txt' bulletins
    new_files_url = []
    for fu in files_url:
        f = path.basename(fu)
        if 'bw' not in f or 'bw.txt' in f:
            new_files_url.append(fu)
    return new_files_url
from os import path
import os
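As a quick, hypothetical sanity check of the two filters (the folder names and file paths below are made up for illustration), they behave like this:
# hypothetical inputs, only to illustrate the filtering rules above
print(folders_filter({'en': 'link-en', 'fr': 'link-fr', 'de': 'link-de'}))
# -> {'en': 'link-en'}
print(folders_filter({'Bulletin régional': 'link-1', 'Icônes': 'link-2', 'Carte nationale': 'link-3'}))
# -> {'Carte nationale': 'link-3'}
print(files_filter(['2018/gif/map_bw.gif', '2018/gif/map.gif', '2018/txt/bw.txt']))
# -> ['2018/gif/map.gif', '2018/txt/bw.txt']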
def extract_files(url: str):
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
files = [folder.find('a')['href'] for folder in soup.findAll(class_='linkedListPoint')]
return files
# u = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F2018%2Fhstop%2Ffr%2Fgif%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=3a2d286c7428ec5abc465a7412ad6f65'
# extract_files(u)
def fetch_all(url: str, dest: str, curr_path: str = '', count: int = 0):
fs = extract_files(url)
fs = files_filter(fs)
for file_url in fs:
dest_file = path.join(dest, curr_path, path.basename(file_url))
if not os.path.exists(dest_file):
            content = requests.get(base_url + file_url).content  # hrefs are relative to the site root, as elsewhere in this notebook
with open(dest_file, 'wb') as f:
f.write(content)
count += 1
print('count {}\t{}'.format(count, path.join(curr_path, path.basename(file_url))), end='\r')
sub_directories = extract_folders(url)
sub_directories = folders_filter(sub_directories)
for name, sub_url in sub_directories.items():
new_path = path.join(curr_path, name)
create_dir(path.join(dest, new_path))
count = fetch_all(base_url + sub_url, dest, curr_path=new_path, count=count)
return count
def create_dir(d):
if not os.path.exists(d):
os.makedirs(d)
def files_generator(url: str):
fs = extract_files(url)
fs = files_filter(fs)
for file_url in fs:
yield file_url
sub_directories = extract_folders(url)
sub_directories = folders_filter(sub_directories)
for name, sub_url in sub_directories.items():
for f in files_generator(base_url + sub_url):
yield f
entry_point = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'
destination = '../data2/'
# no bw, en-fr-de in order, no profile, no regional, no icone
# fetch_all(entry_point, destination)
with open('files_to_download', 'w') as dest:
dest.writelines(map(lambda x: base_url + x + '\n', files_generator(entry_point)))
Now we can use the Python script ../src/download.py to fetch the ~30'000 files in the directory structure:
python3 src/download.py notebooks/files_to_download ./data/slf --prefix https://www.slf.ch/fileadmin/user_upload/import/lwdarchiv/public/ --nproc 4
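The download script itself is not included in this notebook. As a rough idea of how it works, here is a minimal sketch, assuming it reads one URL per line, strips the --prefix to rebuild the local directory structure under the destination, and fetches files with a small thread pool; the argument handling is inferred from the command line above and may differ from the actual ../src/download.py.
# minimal sketch of a download script -- not the actual ../src/download.py
import argparse
import os
from concurrent.futures import ThreadPoolExecutor

import requests


def download(url: str, dest_root: str, prefix: str):
    # the local path mirrors the URL path after the common prefix
    rel_path = url[len(prefix):] if prefix and url.startswith(prefix) else url.split('/')[-1]
    dest_file = os.path.join(dest_root, rel_path)
    if os.path.exists(dest_file):
        return
    os.makedirs(os.path.dirname(dest_file), exist_ok=True)
    with open(dest_file, 'wb') as f:
        f.write(requests.get(url).content)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('url_file')
    parser.add_argument('destination')
    parser.add_argument('--prefix', default='')
    parser.add_argument('--nproc', type=int, default=4)
    args = parser.parse_args()

    with open(args.url_file) as f:
        urls = [line.strip() for line in f if line.strip()]

    # download concurrently; list() forces completion and surfaces exceptions
    with ThreadPoolExecutor(max_workers=args.nproc) as pool:
        list(pool.map(lambda u: download(u, args.destination, args.prefix), urls))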