This notebook was used to scrape all the files of interest from the SLF archive website. We first define some utility functions to extract the directory hierarchy from the HTML pages. We then filter the files we are interested in and dump their URLs into a file. Finally, we download each file using a small multithreaded Python script.
import requests
from bs4 import BeautifulSoup
import json
from typing import Dict, List
import pandas as pd
%matplotlib inline
base_url = 'https://www.slf.ch/'
archive_parent = 'fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'
def extract_folders(url: str) -> Dict[str, str]:
    """Return a mapping from folder name to folder URL for one archive page."""
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    folders = [folder.find('a') for folder in soup.findAll(class_='folder')]
    # the folder name is the link's third child node; keep only the text before the first comma
    return {str(folder.contents[2]).split(',')[0].strip(): folder['href'] for folder in folders}
def extract_folders_rec(url: str, max_level: int = 3, curr_level: int = 0):
full_url = base_url + url
subfolders = extract_folders(full_url)
if curr_level < max_level:
for subfolder in subfolders:
subfolder_url = subfolders[subfolder]
subfolders[subfolder] = extract_folders_rec(subfolder_url, max_level, curr_level + 1)
return subfolders
# show first level of archive
hierarchy = extract_folders_rec(archive_parent, max_level=1)  # extract_folders_rec already prepends base_url
hierarchy = [(folder, subfolder, subfolder_url) for folder, content in hierarchy.items() for subfolder, subfolder_url in content.items()]
hierarchy = pd.DataFrame(hierarchy, columns=['folder', 'subfolder', 'url'])
hierarchy = hierarchy.set_index(['folder', 'subfolder'])
hierarchy
The following functions filter which folders and files to extract, using these rules: when a folder offers several language versions, keep only one, preferring English over French over German; drop regional bulletins, icon folders, and snowpack stability (Schneedeckenstabilität) folders; and among files, keep only the non black-and-white ('bw') variants, except for the textual bw.txt bulletins. A small made-up example after the filter definitions below illustrates these rules.
def folders_filter(folders: Dict[str, str]) -> Dict[str, str]:
# language picking
if 'en' in folders:
return {'en': folders['en']}
if 'fr' in folders:
return {'fr': folders['fr']}
if 'de' in folders:
return {'de': folders['de']}
new_folders = folders.copy()
    for key in folders:
        # drop regional bulletins, icon folders and snowpack stability (Schneedeckenstabilität) folders
        if 'regional' in key.lower() or 'régional' in key.lower():
            new_folders.pop(key)
        elif 'icône' in key.lower() or 'icone' in key.lower():
            new_folders.pop(key)
        elif 'Schneedeckenstabilität' in key:
            new_folders.pop(key)
return new_folders
def files_filter(files_url: List[str]) -> List[str]:
    # keep only files that are not black-and-white ('bw') variants,
    # except for the textual 'bw.txt' bulletins
    new_files_url = []
    for fu in files_url:
        f = path.basename(fu)
        if 'bw' not in f or 'bw.txt' in f:
            new_files_url.append(fu)
    return new_files_url
from os import path
import os
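As a quick, hypothetical sanity check of the two filters (the folder names and file paths below are made up for illustration), they behave like this:
# hypothetical inputs, only to illustrate the filtering rules above
print(folders_filter({'en': 'link-en', 'fr': 'link-fr', 'de': 'link-de'}))
# -> {'en': 'link-en'}
print(folders_filter({'Bulletin régional': 'link-1', 'Icônes': 'link-2', 'Carte nationale': 'link-3'}))
# -> {'Carte nationale': 'link-3'}
print(files_filter(['2018/gif/map_bw.gif', '2018/gif/map.gif', '2018/txt/bw.txt']))
# -> ['2018/gif/map.gif', '2018/txt/bw.txt']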
def extract_files(url: str):
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
files = [folder.find('a')['href'] for folder in soup.findAll(class_='linkedListPoint')]
return files
# u = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F2018%2Fhstop%2Ffr%2Fgif%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=3a2d286c7428ec5abc465a7412ad6f65'
# extract_files(u)
def fetch_all(url: str, dest: str, curr_path: str = '', count: int = 0):
fs = extract_files(url)
fs = files_filter(fs)
for file_url in fs:
dest_file = path.join(dest, curr_path, path.basename(file_url))
if not os.path.exists(dest_file):
            content = requests.get(base_url + file_url).content  # hrefs are relative to the site root, as elsewhere in this notebook
with open(dest_file, 'wb') as f:
f.write(content)
count += 1
print('count {}\t{}'.format(count, path.join(curr_path, path.basename(file_url))), end='\r')
sub_directories = extract_folders(url)
sub_directories = folders_filter(sub_directories)
for name, sub_url in sub_directories.items():
new_path = path.join(curr_path, name)
create_dir(path.join(dest, new_path))
count = fetch_all(base_url + sub_url, dest, curr_path=new_path, count=count)
return count
def create_dir(d):
if not os.path.exists(d):
os.makedirs(d)
def files_generator(url: str):
fs = extract_files(url)
fs = files_filter(fs)
for file_url in fs:
yield file_url
sub_directories = extract_folders(url)
sub_directories = folders_filter(sub_directories)
for name, sub_url in sub_directories.items():
for f in files_generator(base_url + sub_url):
yield f
entry_point = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'
destination = '../data2/'
# no bw, en-fr-de in order, no profile, no regional, no icone
# fetch_all(entry_point, destination)
with open('files_to_download', 'w') as dest:
dest.writelines(map(lambda x: base_url + x + '\n', files_generator(entry_point)))
Now we can use the Python script ../src/download.py to fetch the ~30'000 files in the directory structure:
python3 src/download.py notebooks/files_to_download ./data/slf --prefix https://www.slf.ch/fileadmin/user_upload/import/lwdarchiv/public/ --nproc 4
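The download script itself is not included in this notebook. As a rough idea of how it works, here is a minimal sketch, assuming it reads one URL per line, strips the --prefix to rebuild the local directory structure under the destination, and fetches files with a small thread pool; the argument handling is inferred from the command line above and may differ from the actual ../src/download.py.
# minimal sketch of a download script -- not the actual ../src/download.py
import argparse
import os
from concurrent.futures import ThreadPoolExecutor

import requests


def download(url: str, dest_root: str, prefix: str):
    # the local path mirrors the URL path after the common prefix
    rel_path = url[len(prefix):] if prefix and url.startswith(prefix) else url.split('/')[-1]
    dest_file = os.path.join(dest_root, rel_path)
    if os.path.exists(dest_file):
        return
    os.makedirs(os.path.dirname(dest_file), exist_ok=True)
    with open(dest_file, 'wb') as f:
        f.write(requests.get(url).content)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('url_file')
    parser.add_argument('destination')
    parser.add_argument('--prefix', default='')
    parser.add_argument('--nproc', type=int, default=4)
    args = parser.parse_args()

    with open(args.url_file) as f:
        urls = [line.strip() for line in f if line.strip()]

    # download concurrently; list() forces completion and surfaces exceptions
    with ThreadPoolExecutor(max_workers=args.nproc) as pool:
        list(pool.map(lambda u: download(u, args.destination, args.prefix), urls))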