#!/usr/bin/env python
# coding: utf-8
# ## Make a new `JournalCrawler` (soup)
#
#
# Where you need to update
#
# gummy.utils.journal_utils.py
# gummy.journals.py
# tests.data.py
# - Wiki
#
#
# You can create a new `JournalCrawler` whose `crawl_type` is **"soup"**.
# In[1]:
from gummy.utils import get_driver
from gummy.journals import *
# In[2]:
# Minimal crawler: a subclass of GummyAbstJournal with no overrides yet.
# The prototype methods defined in the cells below will be attached to the
# instance one by one before being moved into the class body.
class GoogleJournal(GummyAbstJournal):
    pass


# Notebook-style shortcut: bind an instance to the module-level name `self`
# so the following cells read exactly like code inside a method body.
self = GoogleJournal()
# In[3]:
def get_soup_driver(url):
    """Fetch the page soup and the canonicalized URL using a Selenium driver.

    Opens a managed driver session, lets the crawler build its soup source
    from the rendered page, and resolves the canonical URL for *url*.
    Returns a ``(soup, canonical_url)`` pair.
    """
    with get_driver() as driver:
        page_soup = self.get_soup_source(url=url, driver=driver)
        canonical_url = canonicalize(url=url, driver=driver)
    return page_soup, canonical_url
# In[4]:
def get_soup(url):
    """Fetch the page soup and the canonicalized URL without a driver.

    Same contract as ``get_soup_driver`` but passes ``driver=None`` so the
    crawler falls back to a plain HTTP request. Returns ``(soup, canonical_url)``.
    """
    canonical_url = canonicalize(url=url, driver=None)
    return self.get_soup_source(url=url, driver=None), canonical_url
# In[5]:
# Prompt interactively for the article URL the new crawler will be tested on.
url = input()
# ## create `get_contents_soup`
# ### With Driver Ver.
# In[6]:
# Fetch the page through the Selenium driver and record the crawl in the
# instance's log, keyed by the canonicalized URL.
soup, cano_url = get_soup_driver(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
# #### `get_title_from_soup`
# In[7]:
# Prototype of `get_title_from_soup`: pull the title text out of
# <div id="SIvCob">, falling back to `self.default_title` when absent.
title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")
# #### `get_sections_from_soup`
# In[8]:
# Prototype of `get_sections_from_soup`: treat every <center> tag as a section.
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
# #### `get_head_from_section`
# In[9]:
def get_head_from_section(section):
    """Prototype of `get_head_from_section`: the heading of a section is its first <input> tag (or None)."""
    return section.find(name="input")


# Attach the prototype to the instance so `get_contents_from_soup_sections`
# picks it up when splitting each section into head and body.
self.get_head_from_section = get_head_from_section
# In[10]:
# Run the crawler pipeline over the collected sections.
# Fixed misspelled variable name: `contens` -> `contents` (unused downstream,
# so the rename is safe).
contents = self.get_contents_from_soup_sections(sections)
# ### No Driver Ver.
# In[11]:
# Fetch the page without a driver (plain HTTP request) and record the crawl
# in the instance's log, keyed by the canonicalized URL.
soup, cano_url = get_soup(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")
# #### `get_title_from_soup`
# In[12]:
# Same title-extraction prototype as the driver version: text of
# <div id="SIvCob">, with `self.default_title` as the fallback.
title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")
# #### `get_sections_from_soup`
# In[13]:
# Same section prototype as the driver version: every <center> tag.
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")
# #### `get_head_from_section`
# In[14]:
def get_head_from_section(section):
    """Prototype of `get_head_from_section` (no-driver run): return the section's first <input> tag, or None."""
    return section.find(name="input")


# Register the prototype on the instance for the pipeline call below.
self.get_head_from_section = get_head_from_section
# In[15]:
# Run the crawler pipeline over the sections collected without a driver.
# Fixed misspelled variable name: `contens` -> `contents` (unused downstream,
# so the rename is safe).
contents = self.get_contents_from_soup_sections(sections)
# ***
# ## Confirmation
# NOTE: You also have to modify these variables:
#
# - [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)
# - [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)
# In[16]:
from gummy import TranslationGummy
# In[17]:
# model = TranslationGummy()
# model.toPDF(url=url)
# If successful, edit here too:
#
# - [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)
# - [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)
# In[ ]: