#!/usr/bin/env python
# coding: utf-8

# # Part 3: Parse HTML Code With Beautiful Soup
#
# - Find Elements by ID
# - Find Elements by HTML Class Name
# - Extract Text From HTML Elements
# - Extract Attributes From HTML Elements

# In[1]:


# scrape the site
import requests

url = "https://www.indeed.com/jobs?q=python&l=new+york"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were results.
response.raise_for_status()


# After scraping the HTML content, you continue working to pick out the info you need.

# In[2]:


from bs4 import BeautifulSoup


# In[3]:


# Name the parser explicitly: otherwise Beautiful Soup guesses based on what
# is installed, warns about it, and may build a different tree per machine.
soup = BeautifulSoup(response.content, "html.parser")


# In[4]:


soup


# What a soup!!! 🍜 Let's be picky and thin it out.

# ## Find Elements By ID
#
# `id` attributes uniquely identify HTML elements. Let's find the one we need with Developer Tools!

# In[5]:


# NOTE: `find` returns None when nothing matches, e.g. if the page layout changed.
results = soup.find(id='resultsCol')


# In[6]:


results


# Better, but let's drill down some more.

# ## Find Elements By Class Name
#
# The job postings all have the same HTML `class`. Let's find all that are on this page.

# In[7]:


jobs = results.find_all('div', class_='result')


# In[8]:


len(jobs)  # how many?


# In[9]:


jobs[0]  # let's check out just one of them


# ## Extract Text From HTML Elements
#
# Next, let's target a specific piece of text from the site and extract it from the surrounding HTML.

# In[10]:


title = jobs[0].find('h2')
title


# In[11]:


title_link = title.find('a')
title_link


# In[12]:


link_text = title_link.text
link_text


# In[13]:


# clean it up
link_text.strip()


# And now for all jobs, in a concise list comprehension:

# In[14]:


job_titles = [job.find('h2').find('a').text.strip() for job in jobs]


# In[15]:


job_titles


# ## Extract Attributes From HTML Elements
#
# Apart from text content, HTML attributes can contain important information you want to parse, for example the URL that a link points to. Let's learn how to extract them.

# In[16]:


title_link


# In[22]:


title_link['href']


# That's a **relative link**. In order to be able to access the resource, you will need to assemble the absolute URL.

# In[18]:


from urllib.parse import urljoin

base_url = "https://www.indeed.com"
# urljoin resolves the href against the base URL, so it also copes with
# hrefs that are already absolute or that lack a leading slash.
job_url = urljoin(base_url, title_link['href'])
job_url


# With this, you are now able to access the specific job posting, for example by using `requests` again:

# In[19]:


job_site = requests.get(job_url, timeout=10)
job_site.raise_for_status()
job_soup = BeautifulSoup(job_site.content, "html.parser")


# In[20]:


job_soup.text


# You could set up a pipeline that follows the job posting details links and fetches the more detailed job description from there. You could set up some parameters by which to highlight or discard listings that contain certain key phrases.
#
# There's a lot you can do to customize this automated job search script to your own specific interests.