Notebook

10,000 개의 레시피 메뉴명 크롤링¶

from momukji import recipeMan
# Create the crawler and preview the first five entries returned for listing page 1.
recipe = recipeMan()
recipe.menu_list(1)[:5]

1 분류용 Tag 데이터 수집 및 전처리¶

식재료 분류기준 Tag 수집

분석용인 만큼 빠르게 진행할 수 있도록 작업하기

'<title.*?>(.+?)</title>'  # capture the contents of one specific tag (here <title>)
"<[^>]+>|[^<]+"            # match either a whole tag or a run of text between tags (noted as the way to pull out the Korean text)
'<.*?>'                    # match any single tag (non-greedy)
# Candidate table patterns tried one after another; each assignment rebinds
# reg_table, so only the last one is in effect afterwards.
reg_table = '<table.*?>(.+?)</table>'
reg_table = '<font.*?>(.+?)</font>'
reg_table = '<.*?>'
# NOTE(review): nothing follows the lazy group below, so it captures an empty string.
reg_table = "<table.*?>(.*?)"
reg_table = '<table class="wtable3" .*?>(.*?)</table>'

계량법 안내
1큰술(1T, 1Ts) = 1숟가락 	15ml = 3t (밥숟가락 뜨면 1큰술)
1작은술(1t, 1ts)              5ml (티스푼으로는 2스푼이 1작은술)
1컵(1Cup, 1C) 	200ml = 16T (한국,중국,일본)
(서양(미국)은 1C가 240~250ml)
1종이컵 	180ml
1oz 	28.3g
1파운드(lb) 	약 0.453 킬로그램(kg)
1갤런(gallon) 	약 3.78 리터(ℓ)
1꼬집 	약 2g 정도이며 '약간'이라고 표현하기도 함
조금   	약간의 2~3배
적당량 	기호에 따라 마음대로 조절해서 넣으란 표현
1줌 	    한손 가득
(예시 : 멸치 1줌 = 국멸치인 경우 12~15마리, 나물 1줌은 50g)
크게 1줌 = 2줌 	1줌의 두배
1주먹 	여자 어른의 주먹크기, 고기로는 100g
1토막 	2~3cm두께 정도의 분량
마늘 1톨 	깐 마늘 한쪽
생강 1쪽 	마늘 1톨의 크기와 비슷
생강 1톨 	아기 손바닥만한 크기의 통생강 1개
고기 1근 	600g
채소 1근 	400g
채소 1봉지 	200g 정도

In [1]:

from urllib import parse, request
from lxml.html import fromstring, tostring
from tqdm import tqdm
import time
url = "http://www.10000recipe.com/recipe/list.html?order=date&page="

def web_to_list(no):
    """Scrape one listing page and return its recipe links and titles.

    The page address is the module-level ``url`` prefix with ``no`` appended.

    :param no: page number appended to ``url``
    :return: list of ``(href, title)`` tuples, one per thumbnail anchor that
        has caption text; ``title`` is the first text node found under the
        caption's ``<h4>``
    """
    # Read inside a context manager so the HTTP response is closed promptly
    # instead of being left for the garbage collector.
    with request.urlopen(url + str(no)) as response:
        html = response.read().decode('utf-8')
    anchors = fromstring(html).xpath(
        './/div[@class="row"]/div[@class="col-xs-3"]/a[@class="thumbnail"]')
    entries = []
    for anchor in anchors:
        titles = anchor.xpath('./div[@class="caption"]/h4//text()')
        # A thumbnail without caption text previously raised IndexError and
        # aborted the whole crawl; such anchors are skipped instead.
        if titles:
            entries.append((anchor.get('href'), titles[0]))
    return entries

# Accumulate the (href, title) pairs of every crawled listing page.
result = []
# Only page 1 is crawled here; the original note "3216" next to the range is
# presumably the upper bound for a full crawl.
for page_no in tqdm(range(1, 2)):
    result.extend(web_to_list(page_no))
    time.sleep(0.3)  # pause between consecutive requests
# import pandas as pd
# pd.DataFrame(result).to_csv("10000title.csv", header=None, index=None)

100%|██████████| 1/1 [00:00<00:00,  2.25it/s]

2 메뉴별 세부정보 수집 Type01¶

레시피와 Tag 정보를 중심으로 수집 및 전처리

In [2]:

from urllib import parse
# Sample detail page. To build the address from a crawled entry instead:
# url = 'http://'+parse.urlsplit(url).netloc + result[10][0]
url = 'http://www.10000recipe.com/recipe/6923645'
# Read the body inside a context manager so the connection is closed promptly.
with request.urlopen(url) as response:
    resp = response.read().decode('utf-8')
resp_lxml = fromstring(resp)
resp_lxml

Out[2]:

<Element html at 0x7fae2812c458>

In [5]:

# XPath selecting every text node below the "cont_ingre" ingredient container.
xpath_ingre = ('//div[@id="contents_area"]'
               '/div[@class="view_cont"]'
               '/div[@class="cont_ingre"]//text()')
cont_ingre = resp_lxml.xpath(xpath_ingre)
# Optional filter that would drop blank and very short fragments:
# cont_ingre = [_.strip()   for _ in cont_ingre   if len(_.strip()) > 2]
[fragment.strip() for fragment in cont_ingre]

Out[5]:

['',
 '',
 '[재료]',
 '',
 '핫도그파우더3컵, 찹쌀가루반컵, 우유(농도에 맞게 가감)2컵, 베이킹파우더미량, 소세지큰거3개, 미니소세지(300g)1봉, 모짜렐라치즈, 나무젓가락, 식용유, 빵가루',
 '',
 '']

3 메뉴별 세부정보 수집 Type 02¶

레시피와 Tag 정보를 중심으로 수집 및 전처리

In [7]:

# Sample detail page with a structured ingredient list; another sample page:
# url ='http://www.10000recipe.com/recipe/6923533'
url = 'http://www.10000recipe.com/recipe/6923600'
# Read the body inside a context manager so the connection is closed promptly.
with request.urlopen(url) as response:
    resp = response.read().decode('utf-8')
resp_lxml  = fromstring(resp)

# Collector for multi-line ingredient lists.
xpath_ingre = '//div[@id="divConfirmedMaterialArea"]//ul'
xpath_tag   = './/div[@id="contents_area"]/div[@class="view_step"]/div[@class="view_tag"]/a//text()'
cont_ingre  = resp_lxml.xpath(xpath_ingre)
result = []
# Iterating an empty match list is a no-op, so no length guard is needed.
# The loop variable has a real name because `_` conventionally means "unused".
for group in cont_ingre:
    # Heading of this <ul>: the text of its <b> elements.
    result.append("".join(group.xpath('.//b//text()')))
    # One "text|span-text" entry per <li>, then comma-joined for the group.
    result_temp = [
        "".join(item.xpath('./text()')).strip()
        + "|" + "".join(item.xpath('./span/text()'))
        for item in group.xpath('.//li')
    ]
    result.append(",".join(result_temp))
",".join(resp_lxml.xpath(xpath_tag)), result

Out[7]:

('#소고기미역죽,#미역죽,#소고기,#죽 끓이기,#든든한 한그릇,#겨울보양죽,#김하진,#알토란,#tv요리',
 ['[재료]',
  '소고기|300gm,불린쌀|2컵,참기름|5큰술,마른미역|30g,된장|1큰술,달걀노른자|,통깨|적당량',
  '[육수재료]',
  '대파|1대,통마늘|15개,무|100g,양파|1/2개,저민생강|10g,통후추|1큰술,물|20컵'])

4 Class 함수로 정리 및 묶기¶

작업 내용을 Class 함수로 묶기

In [8]:

from urllib import parse, request
from lxml.html import fromstring, tostring
from tqdm import tqdm
import time

class recipeMan:
    """Crawler for the listing and detail pages of 10000recipe.com.

    ``url`` is the listing-page prefix (the page number is appended to it) and
    ``url_base`` is the site root that detail-page paths are appended to.
    """

    def __init__(self):
        # Listing sorted by accuracy instead:
        # self.url = "http://www.10000recipe.com/recipe/list.html?order=accuracy&page="
        self.url = "http://www.10000recipe.com/recipe/list.html?order=date&page="  # sorted by date
        self.url_base = "http://www.10000recipe.com"

    @staticmethod
    def _fetch_tree(address):
        """Download ``address`` and return it parsed as an lxml HTML tree."""
        # The context manager closes the HTTP response as soon as the body has
        # been read, rather than leaving it open until garbage collection.
        with request.urlopen(address) as response:
            html = response.read().decode('utf-8')
        return fromstring(html)

    def menu_list(self, no):
        r"""Collect the title and link of every recipe on one listing page.

        :param no: page number appended to ``self.url``
        :return: ``[(href, title), ...]``, one tuple per thumbnail anchor that
            has caption text; ``title`` is the first text node under the
            caption's ``<h4>``
        """
        anchors = self._fetch_tree(self.url + str(no)).xpath(
            './/div[@class="row"]/div[@class="col-xs-3"]/a[@class="thumbnail"]')
        entries = []
        for anchor in anchors:
            titles = anchor.xpath('./div[@class="caption"]/h4//text()')
            # A thumbnail without caption text previously raised IndexError
            # and aborted the crawl; such anchors are skipped instead.
            if titles:
                entries.append((anchor.get('href'), titles[0]))
        return entries

    def menu_detail(self, add_url):
        r"""Collect the ingredient groups and tags of one recipe page.

        :param add_url: path appended to ``self.url_base``
        :return: ``(tags, groups)`` where ``tags`` is the tag texts joined by
            ``","`` and ``groups`` is a flat list alternating a group heading
            (the ``<b>`` text of a ``<ul>``) with that group's items, each
            item being ``"<li text>|<span text>"`` and items joined by ``","``.
            ``groups`` is empty when the page has no ingredient ``<ul>``.
        """
        tree = self._fetch_tree(self.url_base + add_url)
        xpath_ingre = '//div[@id="divConfirmedMaterialArea"]//ul'
        xpath_tag = './/div[@id="contents_area"]/div[@class="view_step"]/div[@class="view_tag"]/a//text()'
        result = []
        # Iterating an empty match list is a no-op, so no length guard is needed.
        for group in tree.xpath(xpath_ingre):
            result.append("".join(group.xpath('.//b//text()')))
            items = [
                "".join(item.xpath('./text()')).strip()
                + "|" + "".join(item.xpath('./span/text()'))
                for item in group.xpath('.//li')
            ]
            result.append(",".join(items))
        return ",".join(tree.xpath(xpath_tag)), result

In [11]:

# Create the crawler and preview the first five (href, title) pairs of listing page 1.
recipe = recipeMan()
recipe.menu_list(1)[:5]

Out[11]:

[('/recipe/6923656', '초간단배추찜'),
 ('/recipe/6923655', '푸짐하게 전부쳐먹자! 모듬전만들기!'),
 ('/recipe/6923654', '팽이버섯전 레시피 맛있는 팽이버섯 요리'),
 ('/recipe/6923653', '손님 초대요리 차돌박이 숙주볶음'),
 ('/recipe/6923652', '밤 무침 오도독 샐러드 같은 밤 생채')]

In [ ]:

In [1]:

from urllib import request
# Request headers sent with the query below.
header = {
#     'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13F69 Safari/601.1',
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'
}
url  = 'http://whois.domaintools.com/momukji.kr'
# `resp` is first bound to the Request carrying the custom headers, then
# rebound to the response returned by urlopen.
resp = request.Request(url, headers = header);
# The body is not read here (`.read()` is commented out); the cell just
# displays the response object.
resp = request.urlopen(resp)#.read()
resp

Out[1]:

<http.client.HTTPResponse at 0x7fb1e9565208>

In [ ]: