#!/usr/bin/env python
# coding: utf-8
"""Scrape the Premier League market-value table from Transfermarkt.

This is a consolidated version of an exploratory notebook export.  The
notebook fetched result pages 1-4, read every ``<tr>`` of the ``items``
table, kept every third row, and assembled a pandas ``DataFrame`` with the
row's cell texts plus a ``Nationality`` and a ``Club`` column taken from the
``title`` attribute of the images in the ``zentriert`` cells.

Third-party requirements: ``requests``, ``bs4`` (the notebook installed it
with ``pip install bs4``; it also installed ``html5lib``, although the
parser actually used is the built-in ``html.parser``) and ``pandas``.
``selenium`` is only needed for :func:`accept_cookie_banner`.

The scraping packages are imported inside the functions that use them so
that the pure helpers below can be imported without the scraping stack
installed.
"""

import pandas as pd

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
)
URL_TEMPLATE = 'https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/{}'
headers = {'User-Agent': USER_AGENT}

# The notebook looped over ``range(1, 5)``, i.e. result pages 1 to 4.
DEFAULT_PAGES = range(1, 5)

# The notebook kept rows 0, 3, 6, ... of ``tbody.find_all("tr")``.
# NOTE(review): presumably each player row contains a nested two-row table
# whose <tr> elements are also returned by the recursive search -- confirm
# against the live page.
ROW_STRIDE = 3

# Column labels the notebook assigned to the cells of a kept row.  The
# ``null*`` columns are kept so the resulting frame has the same shape.
COLUMNS = ['Rank', 'null', 'null1', 'Player', 'Position', 'null3', 'Age', 'null4', 'Value']

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def scrape_webpage(url):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The request is sent with the module-level ``headers``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``REQUEST_TIMEOUT``.
    """
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly here rather than later with an AttributeError on a missing
    # table when an error page was returned.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def accept_cookie_banner(driver):
    """Click the element titled "ACCEPT ALL" in a Selenium-driven browser.

    The requests-based scraper below does not need this; it is kept for the
    browser-driven variant the notebook experimented with.  The notebook's
    XPath (``//*[title="ACCEPT ALL]``) had an unterminated string and tested
    a child element instead of the ``title`` attribute; both are fixed here.

    Args:
        driver: a Selenium WebDriver that has already loaded the page.

    Returns:
        True if the element was found and clicked, False if it was absent.
    """
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    try:
        element = driver.find_element(by=By.XPATH, value='//*[@title="ACCEPT ALL"]')
    except NoSuchElementException:
        return False
    element.click()
    return True


def select_player_rows(rows, stride=ROW_STRIDE):
    """Return every *stride*-th element of *rows*, starting with the first."""
    return list(rows)[::stride]


def split_badges(titles):
    """Split the image titles of one row into ``(nationality, club)``.

    The notebook treated the image titles as alternating nationality / club,
    so the first title is the nationality and the second the club.  A
    missing entry is reported as ``None``.
    """
    nationality = titles[0] if titles else None
    club = titles[1] if len(titles) > 1 else None
    return nationality, club


def extract_cells(row):
    """Return the stripped text of every ``<td>`` inside *row*."""
    return [cell.text.strip() for cell in row.find_all('td')]


def extract_badge_titles(row):
    """Return the ``title`` of the first image in each ``zentriert`` cell.

    Cells without an image, and images without a ``title``, are skipped.
    """
    titles = []
    for cell in row.find_all('td', class_='zentriert'):
        image = cell.find('img')
        if image is None:
            continue
        title = image.get('title')
        if title:
            titles.append(title)
    return titles


def find_player_rows(soup):
    """Return the kept ``<tr>`` elements of the ``items`` table in *soup*.

    Raises:
        ValueError: if the page has no ``items`` table or the table has no
            ``<tbody>``.
    """
    table = soup.find("table", class_='items')
    if table is None:
        raise ValueError('no <table class="items"> found in the page')
    tbody = table.find("tbody")
    if tbody is None:
        raise ValueError('the items table has no <tbody>')
    return select_player_rows(tbody.find_all("tr"))


def build_dataframe(rows, nationalities, clubs):
    """Assemble the final ``DataFrame``.

    Args:
        rows: one list of cell texts per player, labelled with ``COLUMNS``.
        nationalities: one nationality per entry of *rows*.
        clubs: one club per entry of *rows*.

    Raises:
        ValueError: raised by pandas if the lengths do not match or a row has
            more cells than there are column labels.
    """
    frame = pd.DataFrame(rows, columns=COLUMNS)
    frame['Nationality'] = nationalities
    frame['Club'] = clubs
    return frame


def scrape_players(pages=DEFAULT_PAGES):
    """Scrape the given result pages and return one ``DataFrame`` row per player.

    Nationality and club are read from the same table row as the remaining
    cells, so the columns cannot drift out of alignment.
    """
    final_players = []
    nationality_list = []
    club_list = []
    for pagenum in pages:
        soup = scrape_webpage(URL_TEMPLATE.format(pagenum))
        for row in find_player_rows(soup):
            final_players.append(extract_cells(row))
            nationality, club = split_badges(extract_badge_titles(row))
            nationality_list.append(nationality)
            club_list.append(club)
    return build_dataframe(final_players, nationality_list, club_list)


def main():
    """Scrape the default pages, print a short summary and return the frame."""
    frame = scrape_players()
    print(frame.head())
    print(len(frame))
    return frame


if __name__ == "__main__":
    df = main()