# Third-party dependencies; install them from a shell before running this file:
#   pip install bs4
#   pip install html5lib
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import requests
# Browser-like User-Agent passed as `headers=` to the requests.get calls
# further down.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"}

# Selenium session setup, currently disabled.
# NOTE(review): the click below needs a live `driver`; within this file only
# these commented-out lines create one -- confirm where it is meant to come from.
#driver = webdriver.Chrome()
#driver.get(URL)
#time.sleep(3)
#html = driver.page_source
#soup = BeautifulSoup(html, 'html.parser')
#r = requests.get(url=URL, headers=headers)
#driver.switch_to.frame("sp_message_iframe_764224")
#element = driver.find_element(By.XPATH, '/html/body/div/div[2]/div[3]/div[2]/button').click()

# Click the element whose title attribute is "ACCEPT ALL" (presumably the
# consent dialog -- confirm). In XPath an attribute test is written with `@`,
# and the quoted value has to be closed: [@title="ACCEPT ALL"].
element = driver.find_element(by=By.XPATH, value='//*[@title="ACCEPT ALL"]')
element.click()
# Accumulators that the scraping loops further down append to.
players_list = []
age_list = []
position_list = []
value_list = []
badge_list = []
# The collection loop also appends to nationality_list, so it has to exist
# up front like the other accumulators; otherwise the first append raises
# NameError.
nationality_list = []
# Request pages 1-4 of the market-value listing and select four groups of
# <td> cells from each.
# NOTE(review): every name assigned in the loop is rebound on each pass, so
# once the loop finishes only the selections from the last page remain.
for pagenum in range(1, 5):
    URL = "https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/" + str(pagenum)
    r = requests.get(URL, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    # One find_all call per name, issued in the same order as the targets.
    players, ages, nationality, values = (
        soup.find_all("td", class_=css_class)
        for css_class in ("hauptlink", "zentriert", "zentriert", "rechts hauptlink")
    )
# Walk pages 1-4 of the market-value listing and append the raw cell contents
# of each page to the module-level accumulator lists.
for pagenum in range(1, 5):
    URL = "https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/" + str(pagenum)
    r = requests.get(url=URL, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Text of every td.hauptlink cell.
    players = soup.find_all("td", class_="hauptlink")
    players_list.extend(player.text for player in players)

    # Text of every td.zentriert cell.
    ages = soup.find_all("td", class_="zentriert")
    age_list.extend(age.text for age in ages)

    # Title of the first <img> in each td.zentriert cell that has one.
    nationalities = soup.find_all("td", class_="zentriert")
    # Iterate the cells selected from THIS page (`nationalities`), not a
    # `nationality` selection left over from an earlier cell.
    for nation in nationalities:
        flags = nation.find_all('img')
        if flags:
            first_flag = flags[0]
            first_nationality = first_flag['title']
            nationality_list.append(first_nationality)

    # Text of every cell whose class attribute is "rechts hauptlink".
    values = soup.find_all("td", class_="rechts hauptlink")
    value_list.extend(value.text for value in values)

    # Same image-title extraction again, collected into badge_list.
    badges = soup.find_all("td", class_="zentriert")
    for badge in badges:
        flags = badge.find_all('img')
        if flags:
            first_flag = flags[0]
            first_badge = first_flag['title']
            badge_list.append(first_badge)
            # NOTE(review): this prints first_nationality rather than
            # first_badge -- confirm which one was intended.
            print(first_nationality)
nationality_list
# Echo the current `players` selection (a bare expression only displays
# something in an interactive session).
players
# Append the text of each selected cell to the running list.
players_list.extend(player.text for player in players)
players_list
# Remove every second entry in place, keeping positions 0, 2, 4, ...
del players_list[1::2]
print(players_list)
# Select the td.zentriert cells of the current `soup` and add their text to
# age_list.
ages = soup.find_all("td", class_="zentriert")
ages
age_list.extend(age.text for age in ages)
age_list
# Keep every fourth text, starting with the one at index 2.
new_age_list = age_list[2::4]
print(new_age_list)
new_age_list
# The same cell selection again, kept under the name `nationality`.
nationality = soup.find_all("td", class_="zentriert")
nationality
# For pages 1-4, look only inside the <tbody> of the table with class "items"
# and record the title of the first <img> in each td.zentriert cell.
for pagenum in range(1, 5):
    URL = "https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/" + str(pagenum)
    r = requests.get(URL, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find("table", class_='items')
    tbody = table.find("tbody")
    badges = tbody.find_all("td", class_="zentriert")
    for badge in badges:
        flags = badge.find_all('img')
        if not flags:
            # Cell without an image: nothing to record.
            continue
        first_flag = flags[0]
        first_badge = first_flag['title']
        badge_list.append(first_badge)
        print(first_badge)
badge_list
# Discard the final five entries.
del badge_list[-5:]
# Four runs of five consecutive positions, listed highest first so that
# removing one entry never shifts a position that is still to be removed.
indexes_to_delete = [
    219, 218, 217, 216, 215,
    164, 163, 162, 161, 160,
    109, 108, 107, 106, 105,
    54, 53, 52, 51, 50,
]
for index in indexes_to_delete:
    badge_list.pop(index)
badge_list
# Entries at even positions become nationality_list, odd positions club_list.
nationality_list = badge_list[::2]
club_list = badge_list[1::2]
nationality_list
club_list
# Append the title of the first <img> found in each `nationality` cell.
for nation in nationality:
    # find() returns a single Tag (or None), not a list, so the title is read
    # from it directly; subscripting it with '0' would look up an HTML
    # attribute named "0" and raise KeyError.
    first_flag = nation.find('img')
    if first_flag is not None:
        first_nationality = first_flag['title']
        nationality_list.append(first_nationality)
        print(first_nationality)
# Remove every second entry in place, keeping positions 0, 2, 4, ...
del nationality_list[1::2]
print(nationality_list)
# Select the cells whose class attribute is "rechts hauptlink" from the
# current `soup` and add their text to value_list.
values = soup.find_all("td", class_="rechts hauptlink")
values
value_list.extend(value.text for value in values)
value_list
# Build one list of stripped cell texts per <tr> found in the "items" table
# body of pages 1-4.
all_players = []
for pagenum in range(1, 5):
    URL = "https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/" + str(pagenum)
    r = requests.get(URL, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    table = soup.find("table", class_='items')
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for row in rows:
        cols = [cell.text.strip() for cell in row.find_all('td')]
        all_players.append(cols)
print(all_players)
print(len(all_players))
all_players
# Keep rows 0, 3, 6, ... only.
final_players = all_players[::3]
print(final_players)
final_players[2]
import pandas as pd
# Column labels for the nine cell texts held in each final_players row.
column_names = ['Rank', 'null', 'null1', 'Player', 'Position', 'null3', 'Age', 'null4', 'Value']
df = pd.DataFrame(final_players, columns=column_names)
df.head()
# Attach the two lists split out of badge_list as extra columns.
df['Nationality'] = nationality_list
df['Club'] = club_list
# Browser-style User-Agent string placed in the request headers below.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
# Listing URL with a single `{}` placeholder that is filled via str.format.
URL_TEMPLATE = 'https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/{}'
# Header mapping that scrape_webpage hands to requests.get.
headers = {'User-Agent': USER_AGENT}
def scrape_webpage(url, timeout=30):
    """Download *url* and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response body using the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail here, with the status code in the message, instead of handing an
    # error page to the parser and failing later on a missing element.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
# Build one list of stripped cell texts per <tr> in the "items" table body of
# pages 1-4, fetching each page through scrape_webpage.
all_players = []
for pagenum in range(1, 5):
    url = URL_TEMPLATE.format(pagenum)
    soup = scrape_webpage(url)
    table = soup.find("table", class_='items')
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for row in rows:
        cols = [cell.text.strip() for cell in row.find_all('td')]
        all_players.append(cols)
# Keep rows 0, 3, 6, ... only.
final_players = all_players[::3]
print(final_players)
# Rebuild all_players from pages 1-4: one list of stripped cell texts per
# <tr> in the "items" table body.
# The page address comes from URL_TEMPLATE and the download/parse step from
# scrape_webpage, both defined earlier in this file, so the URL literal and
# request code are not repeated here.
all_players = []
for pagenum in range(1, 5):
    URL = URL_TEMPLATE.format(pagenum)
    soup = scrape_webpage(URL)
    table = soup.find("table", class_='items')
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")
    for row in rows:
        cols = [col.text.strip() for col in row.find_all('td')]
        all_players.append(cols)
# Keep rows 0, 3, 6, ... only.
final_players = all_players[::3]