#!/usr/bin/env python
# coding: utf-8

# # Demo: How to scrape multiple things from multiple pages
#
# The goal is to scrape info about the **five top-grossing movies** for each
# year, for 10 years. I want the title and rank of the movie, and also, how
# much money did it gross at the box office. In the end I will put the
# scraped data into a CSV file.

# In[32]:

from bs4 import BeautifulSoup
import requests

# In[33]:

# fetch one year's page and parse it so I can explore its structure
url = 'https://www.boxofficemojo.com/year/2018/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# Using Developer Tools, I discover the data I want is in an HTML **table.**
# I also discover that it is the only table on the page,
# so a plain `soup.find('table')` is safe here.
# I store it in a variable named `table`.

# In[34]:

table = soup.find('table')

# I use trial-and-error testing with `print()` to discover whether I can get
# row and cell data cleanly from the table.

# In[35]:

# get all the rows from that one table
rows = table.find_all('tr')
# some more trial-and-error testing to find out which row holds the first
# movie -- rows[0] is the header row, rows[1] is the first data row
print(rows[1])
# now that I have the right row, get all the cells in that row
cells = rows[1].find_all('td')
# see whether I can print the movie title cleanly (title is the 2nd cell)
title = cells[1].text
print(title)

# Next I try a for-loop to see if I can cleanly get the first five movies
# in the table.

# In[36]:

# get top 5 movies on this page - I know the first data row is [1]
for i in range(1, 6):
    cells = rows[i].find_all('td')
    title = cells[1].text
    print(title)

# Try a similar for-loop to get total gross for the top five movies.
# Developer Tools show me this value is in the eighth cell in each row.

# In[37]:

# I would like to get the total gross number also (cells[7], the 8th cell)
for i in range(1, 6):
    cells = rows[i].find_all('td')
    gross = cells[7].text
    print(gross)

# Now I test getting all the values I want from each row, and it works!
# In[38]:

# next I want to get rank (1-5), title and gross all on one line
for i in range(1, 6):
    cells = rows[i].find_all('td')
    print(cells[0].text, cells[1].text, cells[7].text)

# I want this same data for each of 10 years, so first I will create a list
# of the years I want.

# In[39]:

# create a list of the 10 years I want, counting down: 2019, 2018, ... 2010
start = 2019
years = list(range(start, start - 10, -1))
print(years)

# Still prepping for the 10 years, I create a base URL to use when I open
# each year's page.

# In[40]:

# create base url
base_url = 'https://www.boxofficemojo.com/year/'
# test it
# print(base_url + years[0] + '/') -- ERROR! years[0] is an int, so it must
# be converted to a string before concatenating
print(base_url + str(years[0]) + '/')

# Now I *should* have all the pieces I need ... I will test the code with a
# print statement --

# In[41]:

# collect all necessary pieces (tested above) to make a loop that gets
# top 5 movies for each of the 10 years in my list of years
for year in years:
    url = base_url + str(year) + '/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    for i in range(1, 6):
        cells = rows[i].find_all('td')
        print(cells[0].text, cells[1].text, cells[7].text)

# When I see the result, I realize I need to make two adjustments.
#
# 1. Each line needs to have the year also
# 2.
# Maybe I should clean the gross so it's a pure integer
#
# I can get rid of the dollar sign and the commas with a combination of two
# string methods -- `.strip()` and `.replace()`

# In[42]:

# Python has a handy **built-in module** for reading and writing CSVs.
import csv


def clean_gross(text):
    """Return a box-office dollar string with '$' and commas removed.

    e.g. '$293,004,164' -> '293004164' (digits only, still a str).
    """
    return text.strip('$').replace(',', '')


# Guard the demo/scraping code so it only runs when this file is executed
# as a script (importing the module then has no network side effects).
if __name__ == '__main__':
    # testing the clean-up code
    num = '$293,004,164'
    print(clean_gross(num))

    # In[43]:

    # testing a way to add the year to each line, using a list with only
    # two years in it to save time
    miniyears = [2017, 2014]

    for year in miniyears:
        url = base_url + str(year) + '/'
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        table = soup.find('table')
        rows = table.find_all('tr')
        for i in range(1, 6):
            cells = rows[i].find_all('td')
            gross = clean_gross(cells[7].text)
            print(year, cells[0].text, cells[1].text, gross)

    # Now that I know it all works, I want to save the data in a CSV file.

    # In[44]:

    # open the file for writing with a `with` block so it is closed for us,
    # even if the scrape raises an exception partway through
    with open('movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # c is Python's CSV writer object
        c = csv.writer(csvfile)
        # write a header row to the csv
        c.writerow(['year', 'rank', 'title', 'gross'])

        # modified code from above: instead of printing, make a LIST of the
        # same cells and write that list to the CSV as one row
        for year in years:
            url = base_url + str(year) + '/'
            page = requests.get(url)
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('table')
            rows = table.find_all('tr')
            for i in range(1, 6):
                cells = rows[i].find_all('td')
                gross = clean_gross(cells[7].text)
                c.writerow([year, cells[0].text, cells[1].text, gross])

    print("The CSV is done!")

# The result is a CSV file, named movies.csv, that has 51 rows: the header
# row plus 5 movies for each year from 2010 through 2019.
# It has four columns: year, rank, title, and gross.
#
# Note that **only the final cell above** is needed to create this CSV, by
# scraping 10 separate web pages. Everything *above* that final cell is just
# instruction and demonstration. It is intended to show the problem-solving
# you need to go through to get to a desired scraping result.
#
# You would not need to keep all the other work. Those cells could be deleted.