Python/pdfquery: Scraping the FIFA World Player of the Year votes (here: compare with pdftotext
and text processing)
import csv
import pathlib
import re
import subprocess
import urllib.request
URL = 'https://raw.githubusercontent.com/mneedham/fifa/master/fboaward_menplayer2014_neutral.pdf'
PDF = pathlib.Path(URL.rpartition('/')[2])
TXT = PDF.with_suffix('.txt')
CSV = PDF.with_suffix('.csv')
def load_pages(filepath=TXT, encoding='utf-8', delimiter='\f'):
text = filepath.read_text(encoding=encoding)
return text.split(delimiter)
if not PDF.exists():
urllib.request.urlretrieve(URL, PDF)
if not TXT.exists(): # requires one of popper-utils, miktex-poppler-bin, xpdf
subprocess.run(['pdftotext', '-layout', PDF, TXT], check=True)
pages = load_pages()
len(pages)
18
def iterrows(pages):
first = None
for p in pages:
if not p:
continue
title, _, rest = p.partition('\n\n\n')
header, _, rest = rest.partition('\n')
header = re.split(r' {2,}', header)
if first is None:
yield header
first = title, header
else:
assert title, header == first
table, _, footer = rest.partition('\n ')
table, footer = (s.strip() for s in (table, footer))
page, total = map(int, re.match(r'(\d+) / (\d+)$', footer).groups())
for line in table.splitlines():
if not line:
continue
if line.startswith(' '):
assert line.endswith(' Grenadines')
continue
row = re.split(r' {2,}', line)
assert len(row) == len(header)
if row[1] == 'St. Vincent and the':
row[1] += ' Grenadines'
yield row
rows = list(iterrows(pages))
rows[:2]
[['Vote', 'Country', 'Name', 'First (5 points)', 'Second (3 points)', 'Third (1 point)'], ['Captain', 'Afghanistan', 'Amiri Islam', 'Messi Lionel', 'Cristiano Ronaldo', 'Ibrahimovic Zlatan']]
rows[163]
['Captain', 'Sweden', 'Ibrahimovic Zlatan', 'Messi Lionel', 'Neuer Manuel', 'Cristiano Ronaldo']
def write_csv(rows, filepath=CSV, encoding='utf-8', dialect=csv.excel, has_header=True):
with filepath.open('w', encoding=encoding, newline='') as f:
writer = csv.writer(f, dialect=dialect)
for ndata, r in enumerate(rows, 0 if has_header else 1):
writer.writerow(r)
return ndata
write_csv(rows)
544
import pandas as pd
df = pd.read_csv(CSV, encoding='utf-8', dialect=csv.excel)
df.info()
assert df.set_index(['Vote', 'Country']).index.is_unique
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 544 entries, 0 to 543 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Vote 544 non-null object 1 Country 544 non-null object 2 Name 544 non-null object 3 First (5 points) 544 non-null object 4 Second (3 points) 544 non-null object 5 Third (1 point) 544 non-null object dtypes: object(6) memory usage: 25.6+ KB
Vote | Country | Name | First (5 points) | Second (3 points) | Third (1 point) | |
---|---|---|---|---|---|---|
0 | Captain | Afghanistan | Amiri Islam | Messi Lionel | Cristiano Ronaldo | Ibrahimovic Zlatan |
1 | Captain | Albania | Cana Lorik | Cristiano Ronaldo | Robben Arjen | Mueller Thomas |
2 | Captain | Algeria | Bougherra Madjid | Cristiano Ronaldo | Robben Arjen | Benzema Karim |
3 | Captain | American Samoa | Luvu Rafe Talalelei | Neymar | Robben Arjen | Cristiano Ronaldo |
4 | Captain | Andorra | Sonejee Masand Oscar | Cristiano Ronaldo | Mueller Thomas | Kroos Toni |
df['Vote'].value_counts().to_frame('Votes').sort_index()
Votes | |
---|---|
Vote | |
Captain | 182 |
Coach | 181 |
Media | 181 |
pd.set_option('display.max_rows', 250)
df.groupby('Country')['Vote'].value_counts().unstack(fill_value=0)
Vote | Captain | Coach | Media |
---|---|---|---|
Country | |||
Afghanistan | 1 | 1 | 0 |
Albania | 1 | 1 | 1 |
Algeria | 1 | 1 | 1 |
American Samoa | 1 | 1 | 0 |
Andorra | 1 | 1 | 1 |
Angola | 1 | 1 | 1 |
Anguilla | 1 | 1 | 0 |
Antigua and Barbuda | 1 | 1 | 1 |
Argentina | 1 | 1 | 1 |
Armenia | 1 | 1 | 1 |
Aruba | 1 | 1 | 1 |
Australia | 1 | 1 | 1 |
Austria | 1 | 1 | 1 |
Azerbaijan | 1 | 1 | 1 |
Bahamas | 1 | 1 | 0 |
Bahrain | 1 | 1 | 1 |
Bangladesh | 1 | 1 | 1 |
Barbados | 1 | 1 | 0 |
Belarus | 1 | 1 | 1 |
Belgium | 1 | 1 | 1 |
Belize | 1 | 1 | 1 |
Benin | 0 | 0 | 1 |
Bermuda | 1 | 1 | 1 |
Bhutan | 1 | 1 | 1 |
Bolivia | 1 | 1 | 1 |
Bosnia and Herzegovina | 1 | 1 | 0 |
Bosnia-Herzegovina | 0 | 0 | 1 |
Botswana | 0 | 0 | 1 |
Brazil | 1 | 1 | 1 |
British Virgin Islands | 1 | 1 | 0 |
Brunei Darussalam | 1 | 1 | 1 |
Bulgaria | 1 | 1 | 1 |
Burkina Faso | 1 | 1 | 1 |
Burma | 0 | 0 | 1 |
Burundi | 1 | 1 | 0 |
Cambodia | 1 | 1 | 1 |
Cameroon | 1 | 1 | 1 |
Canada | 1 | 1 | 1 |
Cape Verde Islands | 1 | 1 | 1 |
Cayman Islands | 1 | 0 | 1 |
Chad | 1 | 1 | 1 |
Chile | 1 | 1 | 1 |
China PR | 1 | 1 | 1 |
Chinese Taipei | 1 | 1 | 0 |
Colombia | 1 | 1 | 1 |
Comoros | 1 | 1 | 1 |
Congo | 1 | 1 | 1 |
Congo DR | 1 | 1 | 1 |
Cook Islands | 1 | 1 | 0 |
Costa Rica | 1 | 1 | 1 |
Croatia | 1 | 1 | 1 |
Cuba | 1 | 1 | 1 |
Curaçao | 1 | 1 | 1 |
Cyprus | 1 | 1 | 1 |
Czech Republic | 1 | 1 | 1 |
Côte d'Ivoire | 0 | 1 | 1 |
Denmark | 1 | 1 | 1 |
Djibouti | 1 | 1 | 1 |
Dominica | 1 | 1 | 0 |
Dominican Republic | 1 | 1 | 1 |
Ecuador | 1 | 1 | 1 |
Egypt | 1 | 1 | 1 |
El Salvador | 1 | 1 | 1 |
England | 1 | 1 | 1 |
Eritrea | 0 | 0 | 1 |
Estonia | 1 | 1 | 1 |
Ethiopia | 1 | 1 | 1 |
FYR Macedonia | 1 | 1 | 1 |
Faroe Islands | 1 | 1 | 1 |
Fiji | 1 | 1 | 1 |
Finland | 1 | 1 | 1 |
France | 1 | 1 | 1 |
Gabon | 1 | 1 | 1 |
Gambia | 0 | 0 | 1 |
Georgia | 1 | 1 | 1 |
Germany | 1 | 1 | 1 |
Ghana | 1 | 1 | 1 |
Greece | 1 | 1 | 1 |
Grenada | 1 | 1 | 1 |
Guam | 1 | 1 | 0 |
Guatemala | 1 | 1 | 1 |
Guinea | 0 | 0 | 1 |
Guinea-Bissau | 0 | 0 | 1 |
Guyana | 1 | 1 | 1 |
Haiti | 0 | 0 | 1 |
Honduras | 1 | 1 | 1 |
Hong Kong | 1 | 1 | 1 |
Hungary | 1 | 1 | 1 |
Iceland | 1 | 1 | 1 |
India | 1 | 1 | 1 |
Indonesia | 1 | 1 | 1 |
Iran | 1 | 1 | 1 |
Iraq | 0 | 0 | 1 |
Ireland | 0 | 0 | 1 |
Israel | 1 | 1 | 1 |
Italy | 1 | 1 | 1 |
Jamaica | 1 | 1 | 0 |
Japan | 1 | 1 | 1 |
Jordan | 1 | 1 | 0 |
Kazakhstan | 1 | 1 | 1 |
Kenya | 1 | 1 | 1 |
Korea Republic | 1 | 1 | 0 |
Kuwait | 1 | 1 | 1 |
Kyrgyzstan | 1 | 1 | 1 |
Laos | 1 | 1 | 0 |
Latvia | 1 | 1 | 1 |
Lebanon | 1 | 1 | 1 |
Lesotho | 1 | 1 | 1 |
Liberia | 1 | 1 | 1 |
Libya | 0 | 0 | 1 |
Liechtenstein | 1 | 1 | 1 |
Lithuania | 1 | 1 | 1 |
Luxembourg | 1 | 1 | 1 |
Macau | 1 | 1 | 1 |
Madagascar | 1 | 1 | 1 |
Malawi | 1 | 1 | 1 |
Malaysia | 0 | 0 | 1 |
Maldives | 1 | 1 | 1 |
Mali | 1 | 1 | 1 |
Malta | 1 | 1 | 1 |
Mauritania | 1 | 1 | 1 |
Mauritius | 1 | 1 | 1 |
Mexico | 1 | 1 | 1 |
Moldova | 1 | 1 | 1 |
Mongolia | 1 | 1 | 1 |
Montenegro | 1 | 1 | 1 |
Montserrat | 1 | 1 | 0 |
Morocco | 1 | 1 | 1 |
Mozambique | 1 | 1 | 1 |
Myanmar | 1 | 1 | 0 |
Namibia | 1 | 1 | 1 |
Nepal | 1 | 1 | 1 |
Netherlands | 1 | 1 | 1 |
New Caledonia | 1 | 1 | 1 |
New Zealand | 1 | 1 | 1 |
Nicaragua | 1 | 1 | 1 |
Niger | 1 | 1 | 1 |
Nigeria | 1 | 1 | 1 |
North Korea | 0 | 0 | 1 |
Northern Ireland | 0 | 0 | 1 |
Norway | 1 | 1 | 1 |
Oman | 1 | 1 | 1 |
Pakistan | 1 | 0 | 1 |
Palestine | 1 | 1 | 1 |
Panama | 1 | 1 | 1 |
Paraguay | 1 | 1 | 1 |
Peru | 1 | 1 | 1 |
Philippines | 1 | 1 | 1 |
Poland | 1 | 1 | 1 |
Portugal | 1 | 1 | 1 |
Puerto Rico | 1 | 1 | 1 |
Qatar | 1 | 1 | 1 |
Republic of Ireland | 1 | 1 | 0 |
Romania | 1 | 1 | 1 |
Russia | 1 | 1 | 1 |
Rwanda | 1 | 1 | 1 |
Samoa | 1 | 1 | 0 |
San Marino | 1 | 1 | 1 |
Saudi Arabia | 1 | 1 | 1 |
Scotland | 1 | 1 | 1 |
Senegal | 1 | 1 | 1 |
Serbia | 1 | 1 | 1 |
Seychelles | 1 | 1 | 1 |
Sierra Leone | 1 | 1 | 1 |
Singapore | 1 | 1 | 1 |
Slovakia | 1 | 1 | 1 |
Slovenia | 1 | 1 | 1 |
Solomon Islands | 1 | 1 | 0 |
South Africa | 1 | 1 | 1 |
South Korea | 0 | 0 | 1 |
South Sudan | 1 | 1 | 0 |
Spain | 1 | 1 | 1 |
Sri Lanka | 1 | 1 | 0 |
St Kitts & Nevis | 0 | 0 | 1 |
St. Kitts and Nevis | 1 | 1 | 0 |
St. Lucia | 1 | 1 | 1 |
St. Vincent and the Grenadines | 1 | 1 | 0 |
Sudan | 1 | 1 | 1 |
Suriname | 1 | 1 | 1 |
Swaziland | 1 | 1 | 1 |
Sweden | 1 | 1 | 1 |
Switzerland | 1 | 1 | 1 |
Syria | 1 | 1 | 1 |
Tahiti | 1 | 1 | 1 |
Tajikistan | 1 | 1 | 1 |
Tanzania | 1 | 1 | 1 |
Thailand | 1 | 1 | 1 |
Togo | 0 | 0 | 1 |
Trinidad and Tobago | 0 | 0 | 1 |
Tunisia | 0 | 0 | 1 |
Turkey | 1 | 1 | 1 |
Turkmenistan | 0 | 0 | 1 |
Turks and Caicos Islands | 1 | 1 | 1 |
US Virgin Islands | 1 | 1 | 0 |
USA | 1 | 1 | 1 |
Uganda | 0 | 0 | 1 |
Ukraine | 1 | 1 | 1 |
United Arab Emirates | 1 | 1 | 1 |
Uruguay | 1 | 1 | 1 |
Uzbekistan | 1 | 1 | 1 |
Vanuatu | 0 | 0 | 1 |
Venezuela | 1 | 1 | 1 |
Vietnam | 1 | 1 | 1 |
Wales | 1 | 1 | 1 |
Yemen | 0 | 0 | 1 |
Zambia | 1 | 1 | 1 |
Zimbabwe | 1 | 1 | 1 |
df.head()
Vote | Country | Name | First (5 points) | Second (3 points) | Third (1 point) | |
---|---|---|---|---|---|---|
0 | Captain | Afghanistan | Amiri Islam | Messi Lionel | Cristiano Ronaldo | Ibrahimovic Zlatan |
1 | Captain | Albania | Cana Lorik | Cristiano Ronaldo | Robben Arjen | Mueller Thomas |
2 | Captain | Algeria | Bougherra Madjid | Cristiano Ronaldo | Robben Arjen | Benzema Karim |
3 | Captain | American Samoa | Luvu Rafe Talalelei | Neymar | Robben Arjen | Cristiano Ronaldo |
4 | Captain | Andorra | Sonejee Masand Oscar | Cristiano Ronaldo | Mueller Thomas | Kroos Toni |
pf = (df.melt('Vote', df.columns[-3:].tolist(), var_name='Rank', value_name='_Name')
.rename(columns={'_Name': 'Name'})
.assign(Points=lambda x: x['Rank'].str.extract(r'(\d+)', expand=False).astype(int))
.pivot_table('Points', ['Vote', 'Name'], 'Rank', aggfunc=[len, 'sum'], fill_value=0)
.assign(Total=lambda x: x['sum'].sum(axis=1)))
pf.groupby(level='Name').sum().sort_values(by='Total', ascending=False)
len | sum | Total | |||||
---|---|---|---|---|---|---|---|
Rank | First (5 points) | Second (3 points) | Third (1 point) | First (5 points) | Second (3 points) | Third (1 point) | |
Name | |||||||
Cristiano Ronaldo | 303 | 96 | 41 | 1515 | 288 | 41 | 1844 |
Messi Lionel | 55 | 132 | 101 | 275 | 396 | 101 | 772 |
Neuer Manuel | 85 | 89 | 77 | 425 | 267 | 77 | 769 |
Robben Arjen | 17 | 69 | 59 | 85 | 207 | 59 | 351 |
Mueller Thomas | 22 | 35 | 50 | 110 | 105 | 50 | 265 |
Lahm Philipp | 12 | 17 | 31 | 60 | 51 | 31 | 142 |
Neymar | 6 | 13 | 39 | 30 | 39 | 39 | 108 |
Rodriguez James | 6 | 8 | 18 | 30 | 24 | 18 | 72 |
Kroos Toni | 5 | 11 | 12 | 25 | 33 | 12 | 70 |
Di Maria Angel | 4 | 8 | 19 | 20 | 24 | 19 | 63 |
Costa Diego | 2 | 9 | 13 | 10 | 27 | 13 | 50 |
Bale Gareth | 5 | 6 | 6 | 25 | 18 | 6 | 49 |
Ibrahimovic Zlatan | 3 | 8 | 10 | 15 | 24 | 10 | 49 |
Toure Yaya | 0 | 9 | 15 | 0 | 27 | 15 | 42 |
Goetze Mario | 5 | 5 | 1 | 25 | 15 | 1 | 41 |
Benzema Karim | 1 | 7 | 11 | 5 | 21 | 11 | 37 |
Iniesta Andres | 3 | 4 | 6 | 15 | 12 | 6 | 33 |
Schweinsteiger Bastian | 2 | 3 | 9 | 10 | 9 | 9 | 28 |
Mascherano Javier | 2 | 4 | 5 | 10 | 12 | 5 | 27 |
Courtois Thibaut | 4 | 1 | 2 | 20 | 3 | 2 | 25 |
Hazard Eden | 0 | 4 | 11 | 0 | 12 | 11 | 23 |
Pogba Paul | 1 | 4 | 3 | 5 | 12 | 3 | 20 |
Ramos Sergio | 1 | 2 | 5 | 5 | 6 | 5 | 16 |
pf.loc['Captain'].sort_values(by='Total', ascending=False).head()
len | sum | Total | |||||
---|---|---|---|---|---|---|---|
Rank | First (5 points) | Second (3 points) | Third (1 point) | First (5 points) | Second (3 points) | Third (1 point) | |
Name | |||||||
Cristiano Ronaldo | 99 | 37 | 13 | 495 | 111 | 13 | 619 |
Messi Lionel | 31 | 44 | 25 | 155 | 132 | 25 | 312 |
Neuer Manuel | 22 | 13 | 27 | 110 | 39 | 27 | 176 |
Robben Arjen | 3 | 25 | 22 | 15 | 75 | 22 | 112 |
Mueller Thomas | 3 | 11 | 16 | 15 | 33 | 16 | 64 |
pf.loc['Coach'].sort_values(by='Total', ascending=False).head()
len | sum | Total | |||||
---|---|---|---|---|---|---|---|
Rank | First (5 points) | Second (3 points) | Third (1 point) | First (5 points) | Second (3 points) | Third (1 point) | |
Name | |||||||
Cristiano Ronaldo | 89 | 31 | 14 | 445 | 93 | 14 | 552 |
Messi Lionel | 20 | 49 | 35 | 100 | 147 | 35 | 282 |
Neuer Manuel | 32 | 31 | 25 | 160 | 93 | 25 | 278 |
Robben Arjen | 10 | 26 | 20 | 50 | 78 | 20 | 148 |
Mueller Thomas | 8 | 8 | 17 | 40 | 24 | 17 | 81 |
pf.loc['Media'].sort_values(by='Total', ascending=False).head()
len | sum | Total | |||||
---|---|---|---|---|---|---|---|
Rank | First (5 points) | Second (3 points) | Third (1 point) | First (5 points) | Second (3 points) | Third (1 point) | |
Name | |||||||
Cristiano Ronaldo | 115 | 28 | 14 | 575 | 84 | 14 | 673 |
Neuer Manuel | 31 | 45 | 25 | 155 | 135 | 25 | 315 |
Messi Lionel | 4 | 39 | 41 | 20 | 117 | 41 | 178 |
Mueller Thomas | 11 | 16 | 17 | 55 | 48 | 17 | 120 |
Robben Arjen | 4 | 18 | 17 | 20 | 54 | 17 | 91 |