Notebook
# Extract the 1999 total population of each district from census PDF reports
# and save the result as a CSV file.
#
# Every PDF matching PDF_GLOB is scanned from page 17 onwards with camelot's
# "stream" parser. Tables whose first cell is the section-1 title are kept;
# the district name and the total population are read from fixed cells.
import glob

import pandas as pd

PDF_GLOB = "../../Downloads/A_*.pdf"
OUTPUT_CSV = "datasets/paris-population.csv"

# Fixed layout of the table of interest, as produced by the stream parser.
_TABLE_TITLE = "1. SUPERFICIES ET DENSITÉS EN 1999"
_POPULATION_LABEL = 'POPULATION TOTALE EN 1999'
_DISTRICT_ROW = 2
_POPULATION_ROW = 10


def extract_district_population(table_df):
    """Return ``(district_name, population)`` read from one extracted table.

    Args:
        table_df: DataFrame of a single extracted table (``Table.df``).
            Cells are addressed by position.

    Returns:
        A ``(district_name, population)`` tuple of raw cell values, or
        ``None`` when the table is empty or its first cell is not the
        section-1 title.

    Raises:
        ValueError: if the table carries the expected title but is too small
            or does not have the population label in the expected row.
    """
    if table_df.empty or table_df.iat[0, 0] != _TABLE_TITLE:
        return None

    n_rows, n_cols = table_df.shape
    if n_rows <= _POPULATION_ROW or n_cols < 2:
        raise ValueError(
            f"table {_TABLE_TITLE!r} has unexpected shape {table_df.shape}"
        )

    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O`` and a shifted layout would then be read silently.
    label = table_df.iat[_POPULATION_ROW, 0]
    if label != _POPULATION_LABEL:
        raise ValueError(
            f"expected {_POPULATION_LABEL!r} in row {_POPULATION_ROW}, "
            f"found {label!r}"
        )

    return table_df.iat[_DISTRICT_ROW, 0], table_df.iat[_POPULATION_ROW, 1]


def build_population_frame(rows):
    """Build the result DataFrame and convert the population to ``int64``.

    Args:
        rows: iterable of ``[district_name, population]`` pairs where
            ``population`` is a string such as ``"16 888"``.

    Returns:
        DataFrame with columns ``district_name`` and ``population``.
    """
    frame = pd.DataFrame(rows, columns=['district_name', 'population'])
    # ``str.split()`` without arguments splits on any Unicode whitespace, so
    # this also removes no-break (U+00A0) and narrow no-break (U+202F) spaces
    # used as thousands separators, not just the ASCII space.
    frame['population'] = (
        frame['population']
        .map(lambda raw: int("".join(raw.split())))
        .astype('int64')
    )
    return frame


# Runs when executed as a script or as a notebook cell (both use "__main__");
# importing this file only defines the helpers above.
if __name__ == "__main__":
    # Third-party PDF table extractor; imported here so the helpers stay
    # importable on machines without the PDF toolchain.
    import camelot

    # Sorted so the CSV row order is reproducible across runs and platforms.
    files = sorted(glob.glob(PDF_GLOB))
    results = []
    for fname in files:
        print(fname)
        tables = camelot.read_pdf(fname, pages='17-end', flavor='stream')
        for t in tables:
            row = extract_district_population(t.df)
            if row is None:
                continue
            district_name, population = row
            print(t.page)
            print(district_name)
            results.append([district_name, population])

    df = build_population_frame(results)
    df.to_csv(OUTPUT_CSV, index=False)