#!/usr/bin/env python
# coding: utf-8

# # Demo: How to scrape multiple things from multiple pages
#
# The goal is to scrape info about the **five top-grossing movies** for each
# year, for 10 years. I want the title and rank of the movie, and also, how
# much money did it gross at the box office. In the end I will put the
# scraped data into a CSV file.

# In[32]:

from bs4 import BeautifulSoup
import requests

# In[33]:

# fetch one year's page and parse it so I can explore its structure
url = 'https://www.boxofficemojo.com/year/2018/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# Using Developer Tools, I discover the data I want is in an HTML **table.**
# I also discover that it is the only table on the page,
# so a plain `soup.find('table')` is safe here.
# I store it in a variable named `table`.

# In[34]:

table = soup.find('table')

# I use trial-and-error testing with `print()` to discover whether I can get
# row and cell data cleanly from the table.

# In[35]:

# get all the rows from that one table
rows = table.find_all('tr')
# some more trial-and-error testing to find out which row holds the first
# movie -- rows[0] is the header row, rows[1] is the first data row
print(rows[1])
# now that I have the right row, get all the cells in that row
cells = rows[1].find_all('td')
# see whether I can print the movie title cleanly (title is the 2nd cell)
title = cells[1].text
print(title)

# Next I try a for-loop to see if I can cleanly get the first five movies
# in the table.

# In[36]:

# get top 5 movies on this page - I know the first data row is [1]
for i in range(1, 6):
    cells = rows[i].find_all('td')
    title = cells[1].text
    print(title)

# Try a similar for-loop to get total gross for the top five movies.
# Developer Tools show me this value is in the eighth cell in each row.

# In[37]:

# I would like to get the total gross number also (cells[7], the 8th cell)
for i in range(1, 6):
    cells = rows[i].find_all('td')
    gross = cells[7].text
    print(gross)

# Now I test getting all the values I want from each row, and it works!
# In[38]:

# next I want to get rank (1-5), title and gross all on one line
for i in range(1, 6):
    cells = rows[i].find_all('td')
    print(cells[0].text, cells[1].text, cells[7].text)

# I want this same data for each of 10 years, so first I will create a list
# of the years I want.

# In[39]:

# create a list of the 10 years I want, counting down: 2019, 2018, ... 2010
start = 2019
years = list(range(start, start - 10, -1))
print(years)

# Still prepping for the 10 years, I create a base URL to use when I open
# each year's page.

# In[40]:

# create base url
base_url = 'https://www.boxofficemojo.com/year/'
# test it
# print(base_url + years[0] + '/') -- ERROR! years[0] is an int, so it must
# be converted to a string before concatenating
print(base_url + str(years[0]) + '/')

# Now I *should* have all the pieces I need ... I will test the code with a
# print statement --

# In[41]:

# collect all necessary pieces (tested above) to make a loop that gets
# top 5 movies for each of the 10 years in my list of years
for year in years:
    url = base_url + str(year) + '/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    for i in range(1, 6):
        cells = rows[i].find_all('td')
        print(cells[0].text, cells[1].text, cells[7].text)

# When I see the result, I realize I need to make two adjustments.
#
# 1. Each line needs to have the year also
# 2.
# Maybe I should clean the gross so it's a pure integer
#
# I can get rid of the dollar sign and the commas with a combination of two
# string methods -- `.strip()` and `.replace()`

# In[42]:

# Python has a handy **built-in module** for reading and writing CSVs.
import csv


def clean_gross(text):
    """Return a box-office dollar string with '$' and commas removed.

    e.g. '$293,004,164' -> '293004164' (digits only, still a str).
    """
    return text.strip('$').replace(',', '')


# Guard the demo/scraping code so it only runs when this file is executed
# as a script (importing the module then has no network side effects).
if __name__ == '__main__':
    # testing the clean-up code
    num = '$293,004,164'
    print(clean_gross(num))

    # In[43]:

    # testing a way to add the year to each line, using a list with only
    # two years in it to save time
    miniyears = [2017, 2014]

    for year in miniyears:
        url = base_url + str(year) + '/'
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        table = soup.find('table')
        rows = table.find_all('tr')
        for i in range(1, 6):
            cells = rows[i].find_all('td')
            gross = clean_gross(cells[7].text)
            print(year, cells[0].text, cells[1].text, gross)

    # Now that I know it all works, I want to save the data in a CSV file.

    # In[44]:

    # open the file for writing with a `with` block so it is closed for us,
    # even if the scrape raises an exception partway through
    with open('movies.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # c is Python's CSV writer object
        c = csv.writer(csvfile)
        # write a header row to the csv
        c.writerow(['year', 'rank', 'title', 'gross'])

        # modified code from above: instead of printing, make a LIST of the
        # same cells and write that list to the CSV as one row
        for year in years:
            url = base_url + str(year) + '/'
            page = requests.get(url)
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('table')
            rows = table.find_all('tr')
            for i in range(1, 6):
                cells = rows[i].find_all('td')
                gross = clean_gross(cells[7].text)
                c.writerow([year, cells[0].text, cells[1].text, gross])

    print("The CSV is done!")

# The result is a CSV file, named movies.csv, that has 51 rows: the header
# row plus 5 movies for each year from 2010 through 2019.
# It has four columns: year, rank, title, and gross.
#
# Note that **only the final cell above** is needed to create this CSV, by
# scraping 10 separate web pages. Everything *above* that final cell is just
# instruction and demonstration. It is intended to show the problem-solving
# you need to go through to get to a desired scraping result.
#
# You would not need to keep all the other work. Those cells could be deleted.