JournalCrawler (soup)
Relevant files: gummy/utils/journal_utils.py, gummy/journals.py, tests/data.py
You can create a new JournalCrawler whose crawl_type is "soup".
import time
import requests
from bs4 import BeautifulSoup

from gummy.utils import get_driver
from gummy.journals import *
[success] local driver can be built.
[failure] remote driver can't be built.
DRIVER_TYPE: local
def get_soup(url):
    """Fetch the raw HTML with requests and parse it with BeautifulSoup."""
    cano_url = canonicalize(url=url, driver=None)
    return BeautifulSoup(requests.get(url).content, "html.parser"), cano_url
def get_soup_driver(url):
    """Render the page with a Selenium driver so JavaScript-generated content is included."""
    with get_driver() as driver:
        driver.get(url)
        time.sleep(3)  # wait for the page to finish loading
        html = driver.page_source.encode("utf-8")
        cano_url = canonicalize(url=url, driver=driver)
        return BeautifulSoup(html, "html.parser"), cano_url
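get_soup fetches the raw HTML directly, while get_soup_driver renders the page in a browser first, so JavaScript-generated content is included (compare the section counts below). A hypothetical convenience wrapper for choosing between them:

def get_soup_auto(url, need_javascript=False):
    # Hypothetical helper: take the cheap requests path unless the page
    # needs a real browser to render its content.
    return get_soup_driver(url) if need_javascript else get_soup(url)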
class GoogleJournal(GummyAbstJournal):
    pass

self = GoogleJournal()
url = input()
https://www.google.com/
get_contents_soup
soup, cano_url = get_soup(url)
self._store_crawled_info(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
canonicalized URL: https://www.google.com/
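canonicalize (from gummy.utils.journal_utils) resolves the input URL to its canonical form. A rough, hypothetical re-implementation of the idea, not the library's actual code:

def canonicalize_sketch(url):
    # Hypothetical sketch: follow HTTP redirects and return the final URL.
    # The real canonicalize can also consult a Selenium driver.
    import requests
    return requests.get(url, allow_redirects=True).url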
get_title_from_soup
title = find_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, not_found=self.default_title)
print(f"title: {toGREEN(title)}")
title: 2020-08-06@23.55.12
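The fetched page has no matching <div id="SIvCob">, so find_text falls back to not_found, i.e. self.default_title (a timestamp). A simplified sketch of that fallback behavior, assuming find_text works roughly like this:

def find_text_sketch(soup, name, attrs=None, strip=True, not_found=""):
    # Hypothetical sketch: return the tag's text if it exists,
    # otherwise the not_found default.
    tag = soup.find(name=name, attrs=attrs or {})
    return not_found if tag is None else tag.get_text(strip=strip)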
get_sections_from_soup
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
num sections: 1
get_contents_from_soup_sections
soup_sections = sections
contents = []
len_soup_sections = len(soup_sections)
for i, section in enumerate(soup_sections):
    headline = "headline"
    inputTag = section.find("input")
    if inputTag is not None:
        headline = inputTag.get("aria-label")
        inputTag.decompose()
    contents.extend(self.organize_soup_section(section=section, headline=headline))
    if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}")
[1/1] None
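Here inputTag.get("aria-label") returned None, because the <input> in the statically fetched page carries no aria-label attribute, so the placeholder was overwritten. A defensive variant (a suggested tweak, not the library's code) would keep the placeholder instead:

headline = inputTag.get("aria-label") or "headline"

The same steps are now repeated with the Selenium-backed helper, which sees the JavaScript-rendered version of the page.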
soup, cano_url = get_soup_driver(url)
self._store_crawled_info(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
DRIVER_TYPE: local
canonicalized URL: https://www.google.com/
get_title_from_soup
title = find_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, not_found=self.default_title)
print(f"title: {toGREEN(title)}")
title: 2020-08-06@23.55.12
get_sections_from_soup
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
num sections: 3
get_contents_from_soup_sections
soup_sections = sections
contents = []
len_soup_sections = len(soup_sections)
for i, section in enumerate(soup_sections):
    headline = "headline"
    inputTag = section.find("input")
    if inputTag is not None:
        headline = inputTag.get("aria-label")
        inputTag.decompose()
    contents.extend(self.organize_soup_section(section=section, headline=headline))
    if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}")
[1/3] Google 検索
[2/3] Google 検索
[3/3] headline
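Putting the scratch work together, the steps above become overridden methods on the crawler class. This is a hedged sketch: the method names and signatures mirror the walkthrough, but check GummyAbstJournal in gummy/journals.py for the exact interface.

class GoogleJournal(GummyAbstJournal):
    def get_title_from_soup(self, soup):
        # Fall back to self.default_title (a timestamp) when the tag is missing.
        return find_text(soup=soup, name="div", attrs={"id": "SIvCob"},
                         strip=True, not_found=self.default_title)

    def get_sections_from_soup(self, soup):
        return soup.find_all(name="center")

    def get_contents_from_soup_sections(self, soup_sections):
        contents = []
        len_soup_sections = len(soup_sections)
        for i, section in enumerate(soup_sections):
            headline = "headline"
            inputTag = section.find("input")
            if inputTag is not None:
                headline = inputTag.get("aria-label") or headline
                inputTag.decompose()
            contents.extend(self.organize_soup_section(section=section, headline=headline))
            if self.verbose:
                print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}")
        return contents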
NOTE: You also have to register the new crawler in the corresponding variables in gummy/journals.py. Then check that the whole pipeline works:
from gummy import TranslationGummy
model = TranslationGummy()
model.toPDF(url=url)
If successful, add the new journal to the test data in tests/data.py as well.