https://github.com/SmugZombie/FBO_Parser/blob/master/fbo_parse.py
<div class="solt">Janitorial Service </div>,
<div class="soln">70FBR919Q00000072 </div>,
<div class="solcc">S -- Utilities and housekeeping services </div>
# import requests
# def html_download(url, method="get", params=None, data=None):
# userAgent = {"user-agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0"}
# return requests.request(method, url, params=params, data=data, headers= userAgent)
# from bs4 import BeautifulSoup
# url = "https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100"
# html = html_download(url)
# dom = BeautifulSoup(html.text, "html.parser") # in case of an error
# type(html.text), type(dom)
# with open('fbo.html', "w") as f:
# f.write(html.text)
# Read the listing page from the local file 'fbo.html' (presumably the copy
# written by the commented-out download code) instead of hitting the network.
with open('fbo.html', "r") as f:
    # The read must be indented under the `with` so it runs while the file is open.
    html = f.read()
from bs4 import BeautifulSoup
# Parse the markup with the "html.parser" backend (original note: "in case of an error").
dom = BeautifulSoup(html, "html.parser")
# Notebook-style sanity check: displays the types of the raw text and the parsed tree.
type(html), type(dom)
(str, bs4.BeautifulSoup)
# Keep only the anchors inside the element with class "list" whose href points
# at an individual opportunity page.
# href=True restricts the search to <a> tags that actually carry an href
# attribute, so the subscript below cannot raise KeyError on anchors without one.
link_pages = [
    anchor
    for anchor in dom.find("", {"class": "list"}).find_all("a", href=True)
    if "s=opportunity&mode=form&id" in str(anchor['href'])
]
# From the extracted links, show the title text (class "solt") of the first five.
[
    anchor.find("", {"class": 'solt'}).text
    for anchor in link_pages
][:5]
['Janitorial Service ', '40--SWEEP WIRE,FAIRED STBD ', '40--WIRE ROPE ASSEMBLY, ', '51--JACK,SCREW,MECHANIC ', 'Notice of intent to sole source: Azure Biosystems Inc. ']
# From the extracted links, show the stripped ID text (class "soln") of the first five.
[
    anchor.find("", {"class": 'soln'}).text.strip()
    for anchor in link_pages
][:5]
['70FBR919Q00000072', 'SPE4A619T38X9', 'SPE4A619T828X', 'SPE4A619T13F9', '12905B19R8027']
# From the extracted links, show the stripped item-name text (class "solcc") of the first five.
[
    anchor.find("", {"class": 'solcc'}).text.strip()
    for anchor in link_pages
][:5]
['S -- Utilities and housekeeping services', '40 -- Rope, cable, chain & fittings', '40 -- Rope, cable, chain & fittings', '51 -- Hand tools', '99 -- Miscellaneous']
# Collect the raw href value of every extracted link, then keep the first five.
href_links = [
    anchor['href']
    for anchor in link_pages
][:5]
# Display the first one to inspect its form.
href_links[0]
'?s=opportunity&mode=form&id=8901b7da304242cf60579b4a94870a6d&tab=core&_cview=0'
# Listing URL; used below as the base when resolving each link's href.
url = "https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100"
import pandas as pd
import requests


def _cell_text(anchor, css_class):
    """Return the stripped text of the first element under *anchor* with *css_class*."""
    return anchor.find("", {"class": css_class}).text.strip()


# Assemble the table one column at a time, one row per extracted link.
fbo_links = pd.DataFrame()
fbo_links['id'] = [_cell_text(anchor, 'soln') for anchor in link_pages]
# NOTE(review): this column is named 'soln' but is filled from the "solt"
# elements -- confirm the intended column name before relying on it.
fbo_links['soln'] = [_cell_text(anchor, 'solt') for anchor in link_pages]
fbo_links['solcc'] = [_cell_text(anchor, 'solcc') for anchor in link_pages]
# fbo_links['href'] = [_['href'] for _ in link_pages]
# Join each href against the listing URL so the stored link is absolute.
fbo_links['href'] = [
    requests.compat.urljoin(url, anchor['href'])
    for anchor in link_pages
]
fbo_links.head()
id | soln | solcc | href | |
---|---|---|---|---|
0 | 70FBR919Q00000072 | Janitorial Service | S -- Utilities and housekeeping services | https://www.fbo.gov/index.php?s=opportunity&mo... |
1 | SPE4A619T38X9 | 40--SWEEP WIRE,FAIRED STBD | 40 -- Rope, cable, chain & fittings | https://www.fbo.gov/index.php?s=opportunity&mo... |
2 | SPE4A619T828X | 40--WIRE ROPE ASSEMBLY, | 40 -- Rope, cable, chain & fittings | https://www.fbo.gov/index.php?s=opportunity&mo... |
3 | SPE4A619T13F9 | 51--JACK,SCREW,MECHANIC | 51 -- Hand tools | https://www.fbo.gov/index.php?s=opportunity&mo... |
4 | 12905B19R8027 | Notice of intent to sole source: Azure Biosyst... | 99 -- Miscellaneous | https://www.fbo.gov/index.php?s=opportunity&mo... |