import csv
from urllib.parse import quote_plus

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
def get_url(search_term):
    """Build the Amazon search-results URL for ``search_term``.

    The term is percent-encoded (spaces become ``+``) before it is placed in
    the ``k`` query parameter, so multi-word terms or terms containing ``&``
    or ``=`` cannot break the query string. The remaining query parameters
    are the fixed values hard-coded in the template.

    Args:
        search_term: The raw, unencoded text to search for.

    Returns:
        The full search URL as a string.
    """
    encoded_term = quote_plus(str(search_term))
    return (
        f'https://www.amazon.com/s?k={encoded_term}'
        '&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2'
    )
def _text_or_empty(item, tag, css_class):
    """Return the text of the first ``tag`` with ``css_class``, or 'empty'."""
    try:
        return item.find(tag, class_=css_class).text
    except AttributeError:
        # find() returned None (no matching element), so there is no .text.
        return 'empty'


def extract_sku_data(item):
    """Extract the fields of one search-result element into a dict.

    Each field is looked up independently; a field whose element cannot be
    found is reported as the string ``'empty'`` rather than raising, so one
    incomplete result does not abort the whole page.

    Args:
        item: A parsed result element exposing ``find(name, class_=...)``
            (a BeautifulSoup tag in this script).

    Returns:
        A dict with the keys ``description``, ``price``, ``rating`` and
        ``sku_link``.
    """
    description = _text_or_empty(
        item, 'span', 'a-size-medium a-color-base a-text-normal')
    price = _text_or_empty(item, 'span', 'a-offscreen')
    # NOTE(review): the rating lookup uses an empty class string; confirm
    # this selector really targets the rating element.
    rating = _text_or_empty(item, 'span', '')

    try:
        sku_link = item.find("a", class_="a-size-base a-link-normal s-no-hover s-underline-text s-underline-link-text s-link-style a-text-normal")['href']
    except (AttributeError, TypeError, KeyError):
        # No matching anchor (None is not subscriptable) or no href on it.
        sku_link = 'empty'

    return {
        "description": description,
        "price": price,
        "rating": rating,
        "sku_link": sku_link,
    }
def scrape_page(driver, url):
    """Load one search-results page and extract its items.

    Args:
        driver: A WebDriver-like object providing ``get(url)`` and
            ``page_source``.
        url: The address of the results page to load.

    Returns:
        A ``(next_page, frame)`` tuple. ``next_page`` is the absolute URL of
        the following results page, or the sentinel string ``"last-page"``
        when no usable "next" link is found. ``frame`` is a DataFrame with
        one row per extracted item.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Result containers are the divs carrying both of these attributes.
    items = soup.find_all('div', {'data-asin': True, 'data-component-type': True})
    data = [extract_sku_data(item) for item in items]

    # Locate the "next" pagination control; its href is site-relative.
    next_link = soup.find(class_="s-pagination-item s-pagination-next s-pagination-button s-pagination-separator")
    href = next_link.get('href') if next_link is not None else None
    # A control without an href is treated like a missing one, not a KeyError.
    next_page = "https://www.amazon.com" + href if href else "last-page"

    return next_page, pd.DataFrame(data)
# Function to clean up rating column
def clean_rating(rating):
    """Return ``rating`` as a float if it is a valid 0-5 score, else ``None``.

    Args:
        rating: Any value; typically a number or numeric string.

    Returns:
        ``float(rating)`` when the conversion succeeds and the value lies in
        the inclusive range 0 to 5; otherwise ``None``. Values that cannot be
        converted (non-numeric text, ``None``) and NaN also yield ``None``.
    """
    try:
        rating_value = float(rating)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects; ValueError
        # covers strings that do not parse as a number.
        return None
    # NaN fails both comparisons, so it falls through to None as well.
    if 0 <= rating_value <= 5:
        return rating_value
    return None
# Start a Firefox session through Selenium; this driver object is what
# scrape_page uses to load each results page.
driver = webdriver.Firefox()
# Term to search for; get_url turns it into the first results-page URL.
search_term = 'laptop'
url = get_url(search_term)
# Bare expression: in a notebook cell this displays the URL; as a plain
# script statement it has no effect.
url
'https://www.amazon.com/s?k=laptop&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2'
# Walk the result pages: scrape the current URL, then follow the "next" link
# that scrape_page returns until it reports the "last-page" sentinel.
# `data` collects one DataFrame per page.
data = []
try:
    while url != "last-page":
        print(url)  # progress trace: the page about to be scraped
        url, page_data = scrape_page(driver, url)
        data.append(page_data)
finally:
    # Always end the browser session, even if a page fails to load or parse,
    # so the browser process is not left running.
    driver.quit()
https://www.amazon.com/s?k=laptop&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2 https://www.amazon.com/s?k=laptop&page=2&crid=2LAC2KLUYYL46&qid=1692980273&sprefix=laptop%2Caps%2C257&ref=sr_pg_1 https://www.amazon.com/s?k=laptop&page=3&crid=2LAC2KLUYYL46&qid=1692980276&sprefix=laptop%2Caps%2C257&ref=sr_pg_2 https://www.amazon.com/s?k=laptop&page=4&crid=2LAC2KLUYYL46&qid=1692980279&sprefix=laptop%2Caps%2C257&ref=sr_pg_3 https://www.amazon.com/s?k=laptop&page=5&crid=2LAC2KLUYYL46&qid=1692980283&sprefix=laptop%2Caps%2C257&ref=sr_pg_4 https://www.amazon.com/s?k=laptop&page=6&crid=2LAC2KLUYYL46&qid=1692980285&sprefix=laptop%2Caps%2C257&ref=sr_pg_5 https://www.amazon.com/s?k=laptop&page=7&crid=2LAC2KLUYYL46&qid=1692980288&sprefix=laptop%2Caps%2C257&ref=sr_pg_6 https://www.amazon.com/s?k=laptop&page=8&crid=2LAC2KLUYYL46&qid=1692980290&sprefix=laptop%2Caps%2C257&ref=sr_pg_7 https://www.amazon.com/s?k=laptop&page=9&crid=2LAC2KLUYYL46&qid=1692980293&sprefix=laptop%2Caps%2C257&ref=sr_pg_8 https://www.amazon.com/s?k=laptop&page=10&crid=2LAC2KLUYYL46&qid=1692980295&sprefix=laptop%2Caps%2C257&ref=sr_pg_9 https://www.amazon.com/s?k=laptop&page=11&crid=2LAC2KLUYYL46&qid=1692980298&sprefix=laptop%2Caps%2C257&ref=sr_pg_10 https://www.amazon.com/s?k=laptop&page=12&crid=2LAC2KLUYYL46&qid=1692980300&sprefix=laptop%2Caps%2C257&ref=sr_pg_11 https://www.amazon.com/s?k=laptop&page=13&crid=2LAC2KLUYYL46&qid=1692980303&sprefix=laptop%2Caps%2C257&ref=sr_pg_12 https://www.amazon.com/s?k=laptop&page=14&crid=2LAC2KLUYYL46&qid=1692980305&sprefix=laptop%2Caps%2C257&ref=sr_pg_13 https://www.amazon.com/s?k=laptop&page=15&crid=2LAC2KLUYYL46&qid=1692980308&sprefix=laptop%2Caps%2C257&ref=sr_pg_14 https://www.amazon.com/s?k=laptop&page=16&crid=2LAC2KLUYYL46&qid=1692980311&sprefix=laptop%2Caps%2C257&ref=sr_pg_15 https://www.amazon.com/s?k=laptop&page=17&crid=2LAC2KLUYYL46&qid=1692980313&sprefix=laptop%2Caps%2C257&ref=sr_pg_16 
https://www.amazon.com/s?k=laptop&page=18&crid=2LAC2KLUYYL46&qid=1692980316&sprefix=laptop%2Caps%2C257&ref=sr_pg_17 https://www.amazon.com/s?k=laptop&page=19&crid=2LAC2KLUYYL46&qid=1692980318&sprefix=laptop%2Caps%2C257&ref=sr_pg_18 https://www.amazon.com/s?k=laptop&page=20&crid=2LAC2KLUYYL46&qid=1692980321&sprefix=laptop%2Caps%2C257&ref=sr_pg_19
# Stack the per-page frames into one table. ignore_index=True renumbers the
# rows 0..n-1; without it every page restarts at 0 and labels repeat.
df = pd.concat(data, ignore_index=True)
# Keep only the first "digits.digit" number found in the scraped rating text.
# expand=False returns a Series, so the column assignment is unambiguous.
df['rating'] = df['rating'].str.extract(r'(\d+\.\d)', expand=False)
df['rating'] = pd.to_numeric(df['rating'])
# Drop any value outside the 0-5 range (clean_rating returns None for those).
df['rating'] = df['rating'].apply(clean_rating)
# Bare expression: in a notebook cell this previews the first rows.
df.head()
| | description | price | rating | sku_link |
---|---|---|---|---|
0 | 2023 Newest Upgraded IdeaPad 1i Laptops for St... | $329.99 | NaN | /sspa/click?ie=UTF8&spc=MTo0MjYwODg4ODE0ODAwNj... |
1 | Acer Aspire 1 A115-32-C96U Slim Laptop | 15.6"... | $229.99 | NaN | /sspa/click?ie=UTF8&spc=MTo0MjYwODg4ODE0ODAwNj... |
2 | Lenovo 2023 High Performance 15'' FHD IPS Lapt... | $269.99 | 4.3 | /Lenovo-Performance-15-Laptop-Super-Fast/dp/B0... |
3 | Acer Aspire 5 A515-56-347N Slim Laptop - 15.6"... | $299.99 | 4.3 | /Acer-Aspire-A515-56-347N-Slim-Laptop/dp/B0BL8... |
4 | Lenovo IdeaPad 3 – (2023) - Everyday Notebook ... | $280.98 | 4.5 | /Lenovo-IdeaPad-Everyday-Notebook-i3-1115G/dp/... |
# Write the cleaned table to amazon_laptop.csv in the working directory;
# index=False leaves the row labels out of the file.
df.to_csv('amazon_laptop.csv', index=False)