#!/usr/bin/env python
# coding: utf-8
"""Scrape Dataquest for the course list of a learning path.

The script downloads the path page, collects the title and link of every
course card on it, then downloads each course page and collects its
mission cards the same way.  The collected rows are printed and written
to ``dataquest_courses.csv`` with the columns Course, Mission and
Mission url.
"""

import csv
import re
from random import randint
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Page listing the courses of the path to scrape.
PATH_URL = "https://www.dataquest.io/path/data-scientist-v2/"

# File the scraped rows are written to.
OUTPUT_CSV = 'dataquest_courses.csv'

# CSS class of the <div> that wraps one course (or mission) card.
CARD_CLASS = "course-list-card-item"

# An anchor counts as a card link when its href contains a slash followed
# by a letter and at least one more word character.  Compiled once here
# instead of on every loop iteration.
LINK_PATTERN = re.compile(r'/([a-z]|[A-Z])\w+')

# Seconds to wait for a server response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30


def fetch_soup(url):
    """Download *url* and return its content parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with an error status,
            so that an error page is never scraped as if it were real
            content.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def scrape_cards(page_url):
    """Return a list of ``(title, url)`` tuples, one per card on the page.

    The same card markup is used on the path page (cards are courses) and
    on a course page (cards are missions), so this helper serves both.
    To inspect a single course, call it with that course's URL.

    The title is the text of the first anchor in the card; the URL is the
    href of the first anchor matching ``LINK_PATTERN``, resolved against
    *page_url* so that site-relative links become absolute.  Cards without
    a matching anchor are skipped.
    """
    soup = fetch_soup(page_url)
    cards = []
    for card in soup.find_all("div", CARD_CLASS):
        link = card.find('a', href=LINK_PATTERN)
        if link is None:
            # No usable link in this card; skip it instead of crashing.
            continue
        title = card.find('a').text
        cards.append((title, urljoin(page_url, link.attrs['href'])))
    return cards


def main():
    """Scrape the path and all of its courses, then write the CSV file."""
    # Scrape the path page to get the URL and title of each course.
    course_cards = scrape_cards(PATH_URL)
    course_urls = [url for _, url in course_cards]
    course_list = [title for title, _ in course_cards]
    print(course_urls)
    print(course_list)

    # Scrape every course page and store the result in a list of rows.
    courses = []
    for course, course_url in course_cards:
        for mission, mission_url in scrape_cards(course_url):
            courses.append([course, mission, mission_url])
        # Random pause between course pages to avoid hammering the server.
        sleep(randint(1, 10))
    print(courses)

    # Write the rows to a CSV file.  The ``with`` block closes the file;
    # newline='' is what the csv module requires, and the explicit
    # encoding keeps the output independent of the platform locale.
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Course', 'Mission', 'Mission url'])
        writer.writerows(courses)


if __name__ == "__main__":
    main()