JournalCrawler (crawl_type="soup")
Related modules: gummy.utils.journal_utils, gummy.journals, tests.data
# Copy HERE
from gummy.utils import get_driver
from gummy.journals import *

# Drive the abstract crawler directly; each step below mirrors one of the
# methods a concrete JournalCrawler subclass overrides.
self = GummyAbstJournal()

def get_soup(url):
    """Fetch *url* with a temporary Selenium driver.

    Returns:
        tuple: ``(soup, cano_url)`` — the parsed page and its canonicalized URL.
    """
    with get_driver() as driver:
        soup = self.get_soup_source(url=url, driver=driver)
        cano_url = canonicalize(url=url, driver=driver)
    return soup, cano_url

url = input()
soup, cano_url = get_soup(url)
self._store_crawling_logs(cano_url=cano_url)
# toBLUE/toGREEN presumably colorize terminal output — confirm in gummy.utils.
print(f"canonicalized URL: {toBLUE(cano_url)}")

# get_title_from_soup
title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")

# get_sections_from_soup
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")

# get_head_from_section
def get_head_from_section(section):
    """Return the heading element of *section* (here: its first <input> tag)."""
    head = section.find(name="input")
    return head

# Patch the instance so get_contents_from_soup_sections uses our extractor.
self.get_head_from_section = get_head_from_section
contents = self.get_contents_from_soup_sections(sections)  # fixed typo: was `contens`
You can create a new JournalCrawler whose crawl_type is "soup".
from gummy.utils import get_driver
from gummy.journals import *
[success] local driver can be built. [failure] remote driver can't be built. DRIVER_TYPE: local
class GoogleJournal(GummyAbstJournal):
    """Minimal JournalCrawler subclass used to crawl https://www.google.com/."""
    pass

self = GoogleJournal()

def get_soup_driver(url):
    """Fetch *url* with a temporary Selenium driver.

    Returns:
        tuple: ``(soup, cano_url)`` — the parsed page and its canonicalized URL.
    """
    with get_driver() as driver:
        soup = self.get_soup_source(url=url, driver=driver)
        cano_url = canonicalize(url=url, driver=driver)
    return soup, cano_url

def get_soup(url):
    """Fetch *url* without a driver (``driver=None`` path).

    Returns:
        tuple: ``(soup, cano_url)`` — the parsed page and its canonicalized URL.
    """
    cano_url = canonicalize(url=url, driver=None)
    soup = self.get_soup_source(url=url, driver=None)
    return soup, cano_url

url = input()
https://www.google.com/
get_contents_soup
# get_contents_soup (driver version); stray "¶" extraction artifact removed.
soup, cano_url = get_soup_driver(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
Use UselessGateWay._pass2others method. Wait up to 3[s] for all page elements to load. Scroll down to the bottom of the page. Decompose unnecessary tags to make it easy to parse. ============================== Decomposed <i> tag (0) Decomposed <link> tag (1) Decomposed <meta> tag (4) Decomposed <noscript> tag (0) Decomposed <script> tag (13) Decomposed <style> tag (24) Decomposed <sup> tag (0) Decomposed <None> tag (0) canonicalized URL: https://www.google.com/
get_title_from_soup
# get_title_from_soup; stray "¶" extraction artifact removed.
title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")
title: Google 検索は次の言語でもご利用いただけます: English
get_sections_from_soup
# get_sections_from_soup; stray "¶" extraction artifact removed.
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
num sections: 3
get_head_from_section
# get_head_from_section; stray "¶" artifact removed, indentation restored.
def get_head_from_section(section):
    """Return the heading element of *section* (here: its first <input> tag)."""
    head = section.find(name="input")
    return head

self.get_head_from_section = get_head_from_section
contents = self.get_contents_from_soup_sections(sections)  # fixed typo: was `contens`
Show contents of the paper. ============================== [1/3] [2/3] [3/3]
# Driver-less variant: run the same pipeline, fetching the page without Selenium.
soup, cano_url = get_soup(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
Get HTML content from https://www.google.com/ Decompose unnecessary tags to make it easy to parse. ============================== Decomposed <i> tag (0) Decomposed <link> tag (0) Decomposed <meta> tag (4) Decomposed <noscript> tag (0) Decomposed <script> tag (6) Decomposed <style> tag (2) Decomposed <sup> tag (0) Decomposed <None> tag (0) canonicalized URL: https://www.google.com/
get_title_from_soup
# get_title_from_soup; stray "¶" extraction artifact removed.
title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")
title: 2020-09-30@17.42.18
get_sections_from_soup
# get_sections_from_soup; stray "¶" extraction artifact removed.
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
num sections: 1
get_head_from_section
# get_head_from_section; stray "¶" artifact removed, indentation restored.
def get_head_from_section(section):
    """Return the heading element of *section* (here: its first <input> tag)."""
    head = section.find(name="input")
    return head

self.get_head_from_section = get_head_from_section
contents = self.get_contents_from_soup_sections(sections)  # fixed typo: was `contens`
Show contents of the paper. ============================== [1/1]
NOTE: You also have to modify these variables:
from gummy import TranslationGummy
# model = TranslationGummy()
# model.toPDF(url=url)
If successful, edit here too: