#!/usr/bin/env python
# coding: utf-8

# # **식품안전나라 (Food Safety Korea)**
# e.g. http://openapi.foodsafetykorea.go.kr/api/인증키/I0580/xml/1/20
#
# 1. **[Open API main page](https://www.foodsafetykorea.go.kr/api/userApiKey.do#)**, **[how to use the API](https://www.foodsafetykorea.go.kr/api/howToUseApi.do?menu_grp=MENU_GRP34&menu_no=687)**, **[API application procedure](https://www.foodsafetykorea.go.kr/api/board/boardDetail.do)**
# 1. **[Dietitian association nutrient intake standards](http://www.kns.or.kr/FileRoom/FileRoom_view.asp?mode=mod&restring=%252FFileRoom%252FFileRoom.asp%253Fxsearch%253D0%253D%253Dxrow%253D10%253D%253DBoardID%253DKdr%253D%253Dpage%253D1&idx=79&page=1&BoardID=Kdr&xsearch=1&cn_search)**
# 1. key : "8acba1823ae742359560"

# ## **1 Checking the wrapper functions**
# Helper functions around the 식품의약품안전처 (MFDS) API, returning results as **JSON / DataFrame**
# 1. **'C005'** : barcode product information (바코드제품정보)
# 1. **'I2570'** : distribution barcodes (유통바코드)
# 1. **'I0490'** : recalled / sales-suspended products (회수판매중지)
# 1. **'I0750'** : food nutrition database (식품영양정보DB)
# 1. **'COOKRCP01'** : cooked-food recipe database (조리식품_레시피_DB)

# In[1]:


import pandas as pd
from tqdm import tqdm
from momukji import FoodSafe

# list the (service id, name) pairs the wrapper knows about
[(_, FoodSafe().apiKey[_]['name']) for _ in FoodSafe().apiKey.keys()]


# In[ ]:


get_ipython().run_cell_magic('time', '', 'result, foodId = [], \'I0490\'  # recall-requested products (about 400 rows)\ndata = FoodSafe().getData(foodId, 1, 1000, FoodSafe().apiKey[foodId][\'cols\'], display=True)\ndata.to_excel("data/food_recall.xls", index=None)\n')


# In[ ]:


get_ipython().run_cell_magic('time', '', "result, foodId = [], 'I0750'  # food nutrition info\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)\n# for _ in tqdm(range(1, 13824+1, 1000)):\n#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))\n# pd.concat(result).to_excel('data/food_nutrient.xls', index=None)\n")


# In[ ]:


get_ipython().run_cell_magic('time', '', "result, foodId = [], 'COOKRCP01'  # recipe data\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)\n# for _ in tqdm(range(1, 1500+1, 1000)):\n#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))\n# pd.concat(result).to_excel('data/food_recipe_info.xls', index=None)\n")


# In[ ]:


get_ipython().run_cell_magic('time', '', "result, foodId = [], 'C005'  # product barcode numbers\nfrom collections import OrderedDict\nfor _ in tqdm(range(1, 100200+1, 1000)):\n    result.append(FoodSafe().getData(foodId, _, _+999,\n        FoodSafe().apiKey[foodId]['cols']).loc[:,\n        list(OrderedDict(FoodSafe().apiKey['C005']['cols']).values())])\npd.concat(result).to_csv('data/food_barcode.csv', index=None)\npd.concat(result).to_excel('data/food_barcode.xls', index=None)\n")


# In[ ]:


get_ipython().run_cell_magic('time', '', 'result, foodId = [], "I2570"  # distribution barcodes\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId][\'cols\'], display=True)\n# for _ in tqdm(range(1, 49000+1, 1000)):\n#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId][\'cols\']))\n# pd.concat(result).to_excel(\'data/food_barcode_info.xls\', index=None)\n')
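# As a sanity check on the wrapper, the same data can be pulled with the
# standard library alone, following the URL pattern documented above
# (http://openapi.foodsafetykorea.go.kr/api/<KEY>/<SERVICE>/json/<start>/<end>).
# A minimal sketch: the {service: {"row": [...]}} response layout is an
# assumption about the portal's JSON, not taken from momukji.

# In[ ]:


import json
import pandas as pd
from urllib import request

API_KEY = "8acba1823ae742359560"   # the key quoted above

def fetch_foodsafe(service, start=1, end=20, key=API_KEY):
    """Fetch one page of an openapi.foodsafetykorea.go.kr service as a DataFrame."""
    url = f"http://openapi.foodsafetykorea.go.kr/api/{key}/{service}/json/{start}/{end}"
    body = json.loads(request.urlopen(url).read().decode("utf-8"))
    rows = body.get(service, {}).get("row", [])   # assumed response shape
    return pd.DataFrame(rows)

# e.g. first 20 rows of the nutrition DB:
# fetch_foodsafe("I0750", 1, 20).head()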
# # **Kamis ingredient price collection**
# **[Open API management page](https://www.kamis.or.kr/customer/mypage/my_openapi/my_openapi.do)**
# 1. **[Regex Html Crawling](https://stackoverflow.com/questions/13823479/finding-all-tr-from-html-table-in-python)**
# 1. key : "fb54dcd7-218f-4297-8fa6-31cfcd0a897d"
#
# ```
# '<tr>(.+?)</tr>'            # one specific tag
# "<[^>]+>|[^<]+"             # Korean text inside HTML tags
# '<.*?>'                     # any tag
# reg_table = '<table>(.+?)</table>'
# reg_table = '<tbody>(.+?)</tbody>'
# reg_table = '<.*?>'
# reg_table = "<table[^>]*>(.*?)</table>"
# reg_table = '<table.*?>(.*?)</table>'
# ```

# ## **1 Checking the web-page crawling function**
# Collecting Kamis food data

# In[ ]:


get_ipython().run_cell_magic('time', '', 'from momukji import Kamis\nKamis().getData(\'2019-10-15\', "100").head()\n')


# In[ ]:


get_ipython().run_cell_magic('time', '', 'from momukji import Kamis\nKamis().getData(\'2019-10-15\', "100", types=2).head()\n')


# ## **2 API JSON calls**
# 1. **[Kamis OpenAPI key manual page](https://www.kamis.or.kr/customer/reference/openapi_list.do)**
# 1. **[Kamis code detail page](https://www.kamis.or.kr/customer/reference/openapi_list.do?action=detail&boardno=5)** | **[code download](http://www.kamis.or.kr/customer/board/board_file.do?brdno=4&brdctsno=424245&brdctsfileno=12212)**
# 1. **[Kamis OpenAPI](https://www.kamis.or.kr/customer/reference/openapi_list.do?action=detail&boardno=8)** food data collection

# In[ ]:


from momukji import Kamis

resp = Kamis().getApi('2019-10-21', 500, cd=2)
resp.head(3)
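# The regex patterns listed at the top of this section can be applied
# directly to a fetched page. A minimal sketch: the URL is a placeholder,
# not a confirmed Kamis endpoint, and the patterns assume ordinary
# <table>/<tr> markup.

# In[ ]:


import re
from urllib import request

url  = "https://www.kamis.or.kr/customer/main/main.do"   # placeholder page
html = request.urlopen(url).read().decode("utf-8", errors="ignore")

tables = re.findall(r"<table[^>]*>(.*?)</table>", html, re.I | re.S)
rows   = re.findall(r"<tr[^>]*>(.*?)</tr>", tables[0], re.S) if tables else []
cells  = [re.sub(r"<[^>]+>", " ", row).split() for row in rows]   # strip leftover tags
print(len(tables), len(rows), cells[:1])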
# # **서울시 농수산 식품공사 API (Seoul Agro-Fisheries & Food Corporation)**
# 1. https://www.garakprice.com/index.php?go_date=20191001  Garak-dong produce prices (web)
# 1. http://www.garak.co.kr/gongsa/jsp/gs/intro/common.jsp  JSP endpoint
#
# **[Seoul produce corporation OpenAPI](https://www.garak.co.kr/publicdata/selectPageListPublicData.do?sch_public_data_realm_code=1)**
#
# ```html
# http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?id=2087&passwd=1004&dataid=data4&pagesize=10
# &pageidx=1&portal.templet=false&p_ymd=20190408&p_jymd=20190408&d_cd=2&p_jjymd=20130429
# &p_pos_gubun=1&pum_nm=
# ```

# ## **1 Collection via the API**
# Collect price data through the **XML API**

# In[ ]:


# Caveat:
# while paging through results, a page that contains a single record
# comes back as one dict rather than a list of dicts.
# Workaround (https://rfriend.tistory.com/482):
# when converting a single dict to a DataFrame, wrap its values in lists
# so that pandas can infer the columns, the data, and the index.


# In[ ]:


date_info = '20191022'

import pandas as pd
from momukji import Garak

xml_d_1  = Garak().getData(cd=1)  # date_info,
xml_d_2  = Garak().getData(cd=2)  # date_info,
xml_data = pd.concat([xml_d_1, xml_d_2]).reset_index(drop=True)
xml_data.to_excel("data/food_Garak.xls", index=None)


# ## **2 Collection via the web page**
# https://www.garakprice.com/index.php?go_date=20191021
# 1. covers too few items to be of much practical use

# In[ ]:


# https://www.garakprice.com/index.php?go_date=20191021
from urllib import parse, request

def urllib_request(url, params, encode='utf-8'):
    params = parse.urlencode(params).encode(encode)
    url    = request.Request(url, params)
    resp   = request.urlopen(url).read()   # POST the encoded params, read bytes
    resp   = parse.quote_plus(resp)        # bytes -> percent-encoded str
    return parse.unquote_plus(resp)        # decode back to plain text

# params = { "go_date":20191021 }
# url    = "https://www.garakprice.com/index.php"
# urllib_request(url, params)[:500]
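# Why the "wrap the values in lists" workaround above matters: pandas
# cannot build a one-row DataFrame from a dict of scalars, but it can from
# a list of dicts or a dict of lists. A self-contained demonstration:

# In[ ]:


import pandas as pd

record = {"pum_nm": "사과", "price": 32000}   # one XML record parsed into a dict

# pd.DataFrame(record) raises
# ValueError: If using all scalar values, you must pass an index
pd.DataFrame([record])                              # list of dicts -> one row
pd.DataFrame({k: [v] for k, v in record.items()})   # dict of lists, equivalent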
# # **Other manufactured-goods price data**
#
# **[Data.go.kr manufactured-goods price API](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**

# ## **1 Collecting manufactured-goods IDs**
# **[Data.go.kr manufactured-goods price API](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**
# 1. **[한국소비자원 참가격](https://www.price.go.kr/tprice/portal/pricenewsandtpriceintro/iteminfo/getItemList.do)**
# 1. **[유통상품지식뱅크](http://35.200.32.201/)**
# 1. pin down a clear classification scheme

# In[ ]:


from momukji import Product

item_list = Product().getList()
item_list.head(3)


# In[ ]:


url = "http://ksetup.com/bbs/page.php?hid=code"

import re
import pandas as pd
from urllib import request, parse

resp  = request.urlopen(url).read()
resp  = parse.quote_plus(resp)                # bytes -> percent-encoded str
resp  = parse.unquote_plus(resp)              # decode back to text
table = re.findall(r'<table[^>]*>(.*?)</table>', resp, re.M | re.I | re.S)
table = "<table>" + table[0] + "</table>"     # re-wrap so read_html sees a table
table = pd.read_html(table)[0]
# table.to_excel('company_code.xls', index=None, columns=None)
table.head(3)


# ## **2 Online shopping-mall data for manufactured goods**
# 1. Naver Shopping / hot deals
# 1. reorganize and review the notes on Daum Shopping, etc.
# 1. collect and organize food-related data from the 유통정보센터
# # **School meal menus (급식메뉴)**
#
# **[NEIS meal-menu lookup page](https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=10)**
# ```
# https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=12
#
# https://stu.gen.go.kr/sts_sci_md00_001.do?
#     schulCode=F100000120
#     &schulCrseScCode=4
#     &schulKndScCode=04
#     &ay=2019
#     &mm=10
# ```

# In[ ]:


"{:02d}".format(2)   # zero-padded month, as required by the mm parameter


# In[ ]:


query = {
    "schulCode":       "F100000120",
    "schulCrseScCode": 4,
    "schulKndScCode":  "04",
    "ay": 2019,   # year
    "mm": 10,     # month
}

from urllib import parse, request

url  = "https://stu.gen.go.kr/sts_sci_md00_001.do?" + parse.urlencode(query)
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
resp[:200]


# In[ ]:


import re
# quick sanity check that the endpoint path shows up in the fetched page
re.findall(r"/sts_sci_md00_001\.do\?", resp)


# In[ ]:


from menu_parser import MenuParser
from school import School

school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
parser = MenuParser(school)
menus  = parser.get_menu()
print(menus.today)


# In[ ]:


class School:

    class Region:
        BUSAN     = "stu.pen.go.kr"
        CHUNGBUK  = "stu.cbe.go.kr"
        CHUNGNAM  = "stu.cne.go.kr"
        DAEJEON   = "stu.dge.go.kr"
        DEAGU     = "stu.dge.go.kr"
        GWANGJU   = "stu.gen.go.kr"
        GYEONGBUK = "stu.gbe.go.kr"
        GYEONGGI  = "stu.goe.go.kr"
        GYEONGNAM = "stu.gne.go.kr"
        INCHEON   = "stu.ice.go.kr"
        JEJU      = "stu.jje.go.kr"
        JEONBUK   = "stu.jbe.go.kr"
        JEONNAM   = "stu.jne.go.kr"
        KANGWON   = "stu.kwe.go.kr"
        SEJONG    = "stu.sje.go.kr"
        SEOUL     = "stu.sen.go.kr"
        ULSAN     = "stu.use.go.kr"

    class Type:
        KINDERGARTEN = 1
        ELEMENTARY   = 2
        MIDDLE       = 3
        HIGH         = 4

    def __init__(self, school_region, school_type, school_code):
        self.region = school_region
        self.type   = school_type
        self.code   = school_code

school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
school


# In[ ]:


import datetime
import logging
import requests
from bs4 import BeautifulSoup

class MenuParser:

    def __init__(self, school):
        self.school = school

    def get_menu(self, year=None, month=None):
        """Fetch the meal menu for this school.
        If both year and month are given, that month is fetched;
        otherwise the current month is used."""
        if year is None or month is None:
            today = datetime.date.today()
            url = self.__create_url(today.year, today.month)
        else:
            url = self.__create_url(year, month)
        page  = self.__get_page(url)
        soup  = BeautifulSoup(page, "html.parser")
        items = soup.select("#contents > div > table > tbody > tr > td > div")
        return Menu(items)   # Menu comes from the menu_parser module imported above

    def __get_page(self, url):
        page = None          # stays None when the request fails
        try:
            page = requests.get(url).text
        except Exception as e:
            logging.error(e)
        return page

    def __create_url(self, year, month):
        today = datetime.date(year, month, 1)
        url  = f"https://{self.school.region}/sts_sci_md00_001.do?"
        url += f"schulCode={self.school.code}&"
        url += f"schulCrseScCode={self.school.type}&"
        url += f"schulKndScCode={self.school.type:02d}&"
        url += f"ay={today.year}&"
        url += f"mm={today.month:02d}"
        print(url)
        return url


# In[ ]:


parser = MenuParser(school)
parser
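# A quick check of the URL scheme get_menu() builds, without any network
# traffic. _MenuParser__create_url is Python's name-mangled form of the
# private method, accessed here only for illustration.

# In[ ]:


parser = MenuParser(school)
url = parser._MenuParser__create_url(2019, 10)
assert url == ("https://stu.gen.go.kr/sts_sci_md00_001.do?"
               "schulCode=F100000120&schulCrseScCode=4&"
               "schulKndScCode=04&ay=2019&mm=10")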
# In[ ]:


# education-office host (교육청 코드)
school.region


# In[ ]:


# school type (학교종류)
school.type


# In[ ]:


# individual school code (개별 학교코드)
school.code


# In[ ]:


"schulKndScCode={:02d}".format(4)


# ## **1 Daily collection of overseas food-hazard information**
# http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do
# 1. click the target item with Selenium, then
# 1. download the corresponding xls file from its detail page

# In[ ]:


from datetime import datetime

temp = datetime.today()
temp.strftime('%Y%m%d')


# In[ ]:


datetime.strftime(temp, '%Y%m%d')


# ## **2 Aladin used books (알라딘 중고책)**
# data collection via API crawling

# In[ ]:


query = {
    "KeyWord":      "파이썬",   # "%C6%C4%C0%CC%BD%E3"
    "ViewType":     "Detail",
    "SortOrder":    5,          # 5: by release date, 11: by registration date
    "ViewRowCount": 50,
    "page":         2,
}


# In[ ]:


url = "https://www.aladin.co.kr/search/wsearchresult.aspx?SearchTarget=UsedStore"

from urllib import parse, request

params = parse.urlencode(query).encode('cp949')     # encode params as cp949
resp   = request.Request(url, params)               # build the request object
resp   = request.urlopen(resp).read()               # fetch the response bytes
resp   = parse.quote_plus(resp)                     # bytes -> percent-encoded str
resp   = parse.unquote_plus(resp, encoding='cp949') # decode as cp949 (default='utf-8')


# In[ ]:


with open('book.html', 'w') as f:
    f.write(resp)


# In[ ]:


# decoding (default utf-8)
parse.unquote_plus('%C4%F6%C6%AE', encoding='cp949')


# In[ ]:


# encoding
parse.quote_plus("퀀트", encoding='cp949')
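# A hedged follow-up: mine the saved book.html for titles. The
# class="bo3" anchor pattern is an assumption about Aladin's markup,
# shown purely as an example of searching the stored page.

# In[ ]:


import re

with open('book.html') as f:   # same default encoding it was written with
    html = f.read()

titles = re.findall(r'<a[^>]*class="bo3"[^>]*>(.*?)</a>', html, re.S)
print(len(titles), titles[:3])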