#!/usr/bin/env python
# coding: utf-8

# ## Make a new `JournalCrawler` (soup)
#
# Where you need to update.
#
# You can create a new `JournalCrawler` whose `crawl_type` is **"soup"**.

# In[1]:


from gummy.utils import get_driver
from gummy.journals import *


# In[2]:


class GoogleJournal(GummyAbstJournal):
    pass

self = GoogleJournal()


# In[3]:


def get_soup_driver(url):
    with get_driver() as driver:
        soup = self.get_soup_source(url=url, driver=driver)
        cano_url = canonicalize(url=url, driver=driver)
    return soup, cano_url


# In[4]:


def get_soup(url):
    cano_url = canonicalize(url=url, driver=None)
    soup = self.get_soup_source(url=url, driver=None)
    return soup, cano_url


# In[5]:


url = input()


# ## create `get_contents_soup`

# ### With Driver Ver.

# In[6]:


soup, cano_url = get_soup_driver(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")


# #### `get_title_from_soup`

# In[7]:


title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")


# #### `get_sections_from_soup`

# In[8]:


sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")


# #### `get_head_from_section`

# In[9]:


def get_head_from_section(section):
    head = section.find(name="input")
    return head

self.get_head_from_section = get_head_from_section


# In[10]:


contents = self.get_contents_from_soup_sections(sections)


# ### No Driver Ver.

# In[11]:


soup, cano_url = get_soup(url)
self._store_crawling_logs(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")


# #### `get_title_from_soup`

# In[12]:


title = find_target_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, default=self.default_title)
print(f"title: {toGREEN(title)}")


# #### `get_sections_from_soup`

# In[13]:


sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")


# #### `get_head_from_section`

# In[14]:


def get_head_from_section(section):
    head = section.find(name="input")
    return head

self.get_head_from_section = get_head_from_section


# In[15]:


contents = self.get_contents_from_soup_sections(sections)


# ***

# ## Confirmation

# NOTE: You also have to modify these variables:
#
# - [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)
# - [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)

# In[16]:


from gummy import TranslationGummy


# In[17]:


# model = TranslationGummy()
# model.toPDF(url=url)


# If successful, edit here too:
#
# - [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)
# - [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)

# In[ ]:
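

# ***

# ## Putting it together (sketch)
#
# A minimal sketch, assuming the interactive steps above behave as shown:
# the selectors are gathered into a single subclass instead of being patched
# onto an instance. The overridden method names mirror the notebook's
# headings; the class name `GoogleJournalSketch` and its docstring are
# illustrative, not part of the library.

# In[ ]:


class GoogleJournalSketch(GummyAbstJournal):
    """Hedged example: the steps above collected into one crawler class."""

    def get_title_from_soup(self, soup):
        # Same selector as tried interactively above.
        return find_target_text(
            soup=soup, name="div", attrs={"id": "SIvCob"},
            strip=True, default=self.default_title,
        )

    def get_sections_from_soup(self, soup):
        # The example page wraps its content in <center> tags.
        return soup.find_all(name="center")

    def get_head_from_section(self, section):
        # Treat the first <input> element of a section as its head.
        return section.find(name="input")


# In[ ]:


# journal = GoogleJournalSketch()
# journal.get_contents_soup(url=url)  # hedged: check the base class for the exact return value.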
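

# The NOTE in the Confirmation section asks you to register the new crawler
# in two module-level variables. A hypothetical sketch of the two entries
# follows; the real key format lives in the linked files, so check it before
# copying. The "..." placeholders stand for the existing entries.

# In[ ]:


# In gummy/journals.py (hypothetical entry):
# TranslationGummyJournalCrawlers = {
#     ...,  # existing crawlers stay as-is
#     "google": GoogleJournal,
# }

# In gummy/utils/journal_utils.py (hypothetical entry):
# DOMAIN2JOURNAL = {
#     ...,  # existing domains stay as-is
#     "www.google.com": "Google",
# }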