#!/usr/bin/env python
# coding: utf-8
# # **Food Safety Korea (식품안전나라)**
# e.g. http://openapi.foodsafetykorea.go.kr/api/{API key}/I0580/xml/1/20
#
# 1. **[Open API main page](https://www.foodsafetykorea.go.kr/api/userApiKey.do#)**, **[How to use the API](https://www.foodsafetykorea.go.kr/api/howToUseApi.do?menu_grp=MENU_GRP34&menu_no=687)**, **[API usage procedure](https://www.foodsafetykorea.go.kr/api/board/boardDetail.do)**
# 1. **[Korean Nutrition Society nutrient intake standards](http://www.kns.or.kr/FileRoom/FileRoom_view.asp?mode=mod&restring=%252FFileRoom%252FFileRoom.asp%253Fxsearch%253D0%253D%253Dxrow%253D10%253D%253DBoardID%253DKdr%253D%253Dpage%253D1&idx=79&page=1&BoardID=Kdr&xsearch=1&cn_search)**
# 1. key : "8acba1823ae742359560"
# ## **1 Checking the helper functions**
# Helper functions for the Ministry of Food and Drug Safety (MFDS) API: output as **Json / DataFrame**
# 1. **'C005' :** barcode product info
# 1. **'I2570' :** distribution barcode
# 1. **'I0490' :** recall / sales suspension
# 1. **'I0750' :** food nutrition info DB
# 1. **'COOKRCP01' :** cooked-food recipe DB
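# Every service is reached through the URL pattern shown above, `/api/{key}/{serviceId}/{format}/{start}/{end}`. A minimal sketch of a URL builder for that pattern (the helper name is ours, not part of the API):
# In[ ]:
API_KEY = "8acba1823ae742359560"

def foodsafe_url(service_id, start, end, fmt="xml", key=API_KEY):
    # documented pattern: /api/{key}/{serviceId}/{format}/{start}/{end}
    return f"http://openapi.foodsafetykorea.go.kr/api/{key}/{service_id}/{fmt}/{start}/{end}"

foodsafe_url("I0580", 1, 20)  # reproduces the example URL at the top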
# In[1]:
import pandas as pd
from tqdm import tqdm
from momukji import FoodSafe
# enumerate the (serviceId, service name) pairs registered in FoodSafe
[(_, FoodSafe().apiKey[_]['name']) for _ in FoodSafe().apiKey.keys()]
# In[ ]:
get_ipython().run_cell_magic('time', '', 'result, foodId = [], \'I0490\' # recall-requested products (about 400 rows)\ndata = FoodSafe().getData(foodId, 1, 1000, FoodSafe().apiKey[foodId][\'cols\'], display=True)\ndata.to_excel("data/food_recall.xls", index=None)\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', "result, foodId = [], 'I0750' # 식품 영양정보\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)\n# for _ in tqdm(range(1, 13824+1, 1000)):\n# result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols'])) \n# pd.concat(result).to_excel('data/food_nutrient.xls', index=None)\n")
# In[ ]:
get_ipython().run_cell_magic('time', '', "result, foodId = [], 'COOKRCP01' # recipe data\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)\n# for _ in tqdm(range(1, 1500+1, 1000)):\n#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))\n# pd.concat(result).to_excel('data/food_recipe_info.xls', index=None)\n")
# In[ ]:
get_ipython().run_cell_magic('time', '', "result, foodId = [], 'C005' # product barcode numbers\n# _ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)\nfrom collections import OrderedDict\n# fetch in 1,000-row pages, keeping only the registered columns\nfor _ in tqdm(range(1, 100200+1, 1000)):\n    result.append(FoodSafe().getData(foodId, _, _+999,\n        FoodSafe().apiKey[foodId]['cols']).loc[:,\n        list(OrderedDict(FoodSafe().apiKey['C005']['cols']).values())])\npd.concat(result).to_csv('data/food_barcode.csv', index=None)\npd.concat(result).to_excel('data/food_barcode.xls', index=None)\n")
# In[ ]:
get_ipython().run_cell_magic('time', '', 'result, foodId = [], "I2570" # distribution barcode\n_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId][\'cols\'], display=True)\n# for _ in tqdm(range(1, 49000+1, 1000)):\n#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId][\'cols\']))\n# pd.concat(result).to_excel(\'data/food_barcode_info.xls\', index=None)\n')
#
#
# # **Kamis ingredient data collection**
# **[Open API management page](https://www.kamis.or.kr/customer/mypage/my_openapi/my_openapi.do)**
# 1. **[Regex Html Crawling](https://stackoverflow.com/questions/13823479/finding-all-tr-from-html-table-in-python)**
# 1. key : "fb54dcd7-218f-4297-8fa6-31cfcd0a897d"
#
# ```
# '<tr>(.+?)</tr>'    # capture the contents of a specific tag
# "<[^>]+>|[^<]+"     # extract the Korean text inside html tags
# '<.*?>'             # match every html tag
# reg_table = '<table[^>]*>(.+?)</table>'
# reg_table = '<.*?>'
# ```
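# A toy run of the patterns above (the markup string is made up for illustration):
# In[ ]:
import re
html  = "<table><tr><td>사과</td><td>1,200</td></tr></table>"
rows  = re.findall(r'<tr>(.+?)</tr>', html, re.S)                  # one string per table row
cells = [re.findall(r'<td[^>]*>(.*?)</td>', row) for row in rows]  # cells within each row
cells  # [['사과', '1,200']]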
# ## **1 Checking the web-page crawling function**
# Collecting Kamis food price data
# In[ ]:
get_ipython().run_cell_magic('time', '', 'from momukji import Kamis\nKamis().getData(\'2019-10-15\', "100").head()\n')
# In[ ]:
get_ipython().run_cell_magic('time', '', 'from momukji import Kamis\nKamis().getData(\'2019-10-15\', "100" ,types=2).head()\n')
# ## **2 Calling the Json API**
# 1. **[Kamis OpenAPI key manual page](https://www.kamis.or.kr/customer/reference/openapi_list.do)**
# 1. **[Kamis code detail page](https://www.kamis.or.kr/customer/reference/openapi_list.do?action=detail&boardno=5)** | **[Code download](http://www.kamis.or.kr/customer/board/board_file.do?brdno=4&brdctsno=424245&brdctsfileno=12212)**
# 1. **[Kamis OpenAPI](https://www.kamis.or.kr/customer/reference/openapi_list.do?action=detail&boardno=8)** food price collection
# In[ ]:
from momukji import Kamis
resp = Kamis().getApi('2019-10-21', 500, cd=2)
resp.head(3)
#
#
# # **Seoul Agro-Fisheries & Food Corporation API**
# 1. https://www.garakprice.com/index.php?go_date=20191001 Garak market price web page
# 1. http://www.garak.co.kr/gongsa/jsp/gs/intro/common.jsp JSP endpoint URL
#
# **[Seoul Agro-Fisheries & Food Corporation OpenAPI](https://www.garak.co.kr/publicdata/selectPageListPublicData.do?sch_public_data_realm_code=1)**
#
# ```html
# http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?id=2087&passwd=1004&dataid=data4&pagesize=10
# &pageidx=1&portal.templet=false&p_ymd=20190408&p_jymd=20190408&d_cd=2&p_jjymd=20130429
# &p_pos_gubun=1&pum_nm=
# ```
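# The same request can be assembled from a params dict rather than a hand-built query string; a sketch using the documented example values above:
# In[ ]:
from urllib import parse
params = {"id": 2087, "passwd": 1004, "dataid": "data4", "pagesize": 10,
          "pageidx": 1, "portal.templet": "false", "p_ymd": "20190408",
          "p_jymd": "20190408", "d_cd": 2, "p_jjymd": "20130429",
          "p_pos_gubun": 1, "pum_nm": ""}
url = "http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?" + parse.urlencode(params)
url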
# ## **1 Collection via the API**
# Collect the price data through the **Xml API**
# In[ ]:
# Problem:
#   looping over pages, a page that holds a single object yields a list of length 1,
#   and converting that lone dict straight to a DataFrame fails.
# Solution (https://rfriend.tistory.com/482):
#   when converting a single dict to a DataFrame, wrap each value in a list so that
#   the columns, data, and index are derived automatically (see the sketch below).
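# A minimal illustration of that fix (the record itself is made up):
# In[ ]:
import pandas as pd
row = {"item": "apple", "price": 1200}          # a single scalar-valued dict
# pd.DataFrame(row) raises "If using all scalar values, you must pass an index"
pd.DataFrame({k: [v] for k, v in row.items()})  # wrapping each value in a list works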
# In[ ]:
date_info = '20191022'
import pandas as pd
from momukji import Garak
xml_d_1 = Garak().getData(cd=1) # date_info,
xml_d_2 = Garak().getData(cd=2) # date_info,
xml_data = pd.concat([xml_d_1, xml_d_2]).reset_index(drop=True)
xml_data.to_excel("data/food_Garak.xls", index=None)
# ## **2 Collector using the web page**
# https://www.garakprice.com/index.php?go_date=20191021
# 1. However, it lists too few items to be of much practical use
# In[ ]:
# https://www.garakprice.com/index.php?go_date=20191021
from urllib import parse, request

def urllib_request(url, params, encode='utf-8'):
    params = parse.urlencode(params).encode(encode)  # encode the query parameters
    req  = request.Request(url, params)              # build the request object
    resp = request.urlopen(req).read()               # fetch the response bytes
    resp = parse.quote_plus(resp)                    # percent-encode the bytes into a str
    return parse.unquote_plus(resp)                  # decode back to readable text

# params = { "go_date":20191021 }
# url = "https://www.garakprice.com/index.php"
# urllib_request(url, params)[:500]
#
#
# # **Other industrial product price data**
#
# **[Data.go.kr industrial product price API](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**
# ## **1 Collecting industrial product Id data**
# 1. **[Korea Consumer Agency price portal (참가격)](https://www.price.go.kr/tprice/portal/pricenewsandtpriceintro/iteminfo/getItemList.do)**
# 1. **[Distribution product knowledge bank (유통상품지식뱅크)](http://35.200.32.201/)**
# 1. Pin down clear classification criteria
# In[ ]:
from momukji import Product
item_list = Product().getList()
item_list.head(3)
# In[ ]:
url = "http://ksetup.com/bbs/page.php?hid=code"
import re
import pandas as pd
from urllib import request, parse
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
table = re.findall(r'<table[^>]*>(.*?)</table>', resp, re.M|re.I|re.S)
table = "<table>" + table[0] + "</table>"
table = pd.read_html(table)[0]
# table.to_excel('company_code.xls', index=None, columns=None)
table.head(3)
# ## **2 Industrial product online shopping-mall data**
# 1. Naver Shopping / hot deals
# 1. Revisit and reorganize the notes on Daum Shopping and others
# 1. Collect and organize food-related data from the distribution information center
#
#
# # **School meal menu lookup**
#
# **[NEIS school meal menu lookup site](https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=10)**
# ```
# https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=12
#
# https://stu.gen.go.kr/sts_sci_md00_001.do?
# schulCode=F100000120
# &schulCrseScCode=4
# &schulKndScCode=04
# &ay=2019
# &mm=10
# ```
# In[ ]:
"{:02d}".format(2)
# In[ ]:
query = {
    "schulCode":"F100000120",
    "schulCrseScCode":4,
    "schulKndScCode":"04",
    "ay":2019,  # year
    "mm":10     # month
}
from urllib import parse, request
url = "https://stu.gen.go.kr/sts_sci_md00_001.do?" + parse.urlencode(query)
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
resp[:200]
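# The fetched HTML can be checked quickly with BeautifulSoup, using the same CSS selector the MenuParser class below relies on:
# In[ ]:
from bs4 import BeautifulSoup
soup  = BeautifulSoup(resp, "html.parser")
cells = soup.select("#contents > div > table > tbody > tr > td > div")  # one div per calendar day
len(cells)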
# In[ ]:
import re
# scratch: confirm the menu endpoint path appears in the fetched page
re.findall(r'/sts_sci_md00_001\.do', resp)[:3]
# In[ ]:
from menu_parser import MenuParser   # project modules; the classes are reproduced below
from school import School
school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
parser = MenuParser(school)
menus = parser.get_menu()
print(menus.today)
# In[ ]:
class School:

    class Region:
        BUSAN     = "stu.pen.go.kr"
        CHUNGBUK  = "stu.cbe.go.kr"
        CHUNGNAM  = "stu.cne.go.kr"
        DAEJEON   = "stu.dje.go.kr"
        DAEGU     = "stu.dge.go.kr"
        GWANGJU   = "stu.gen.go.kr"
        GYEONGBUK = "stu.gbe.go.kr"
        GYEONGGI  = "stu.goe.go.kr"
        GYEONGNAM = "stu.gne.go.kr"
        INCHEON   = "stu.ice.go.kr"
        JEJU      = "stu.jje.go.kr"
        JEONBUK   = "stu.jbe.go.kr"
        JEONNAM   = "stu.jne.go.kr"
        KANGWON   = "stu.kwe.go.kr"
        SEJONG    = "stu.sje.go.kr"
        SEOUL     = "stu.sen.go.kr"
        ULSAN     = "stu.use.go.kr"

    class Type:
        KINDERGARTEN = 1
        ELEMENTARY   = 2
        MIDDLE       = 3
        HIGH         = 4

    def __init__(self, school_region, school_type, school_code):
        self.region = school_region
        self.type   = school_type
        self.code   = school_code

school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
school
# In[ ]:
import datetime
import logging
import requests
from bs4 import BeautifulSoup

class MenuParser:

    def __init__(self, school):
        self.school = school

    def get_menu(self, year=None, month=None):
        """Fetch the meal menu from the school's NEIS page.

        If both year and month are given, that month is fetched;
        otherwise the current month is used.
        """
        if year is None or month is None:
            today = datetime.date.today()
            url = self.__create_url(today.year, today.month)
        else:
            url = self.__create_url(year, month)
        page = self.__get_page(url)
        soup = BeautifulSoup(page, "html.parser")
        items = soup.select("#contents > div > table > tbody > tr > td > div")
        return Menu(items)   # Menu comes from the menu_parser module imported above

    def __get_page(self, url):
        page = None          # stays None if the request fails
        try:
            page = requests.get(url).text
        except Exception as e:
            logging.error(e)
        return page

    def __create_url(self, year, month):
        today = datetime.date(year, month, 1)
        url = f"https://{self.school.region}/sts_sci_md00_001.do?"
        url += f"schulCode={self.school.code}&"
        url += f"schulCrseScCode={self.school.type}&"
        url += f"schulKndScCode={self.school.type:02d}&"
        url += f"ay={today.year}&"
        url += f"mm={today.month:02d}"
        print(url)
        return url
# In[ ]:
parser = MenuParser(school)
parser
# In[ ]:
# education office domain (Region)
school.region
# In[ ]:
# school type code
school.type
# In[ ]:
# individual school code
school.code
# In[ ]:
"schulKndScCode={:02d}".format(4)
# In[ ]:
# ## **Daily collection of overseas hazard information**
# http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do
# 1. Click through to the target post with Selenium,
# 1. then download the attached xls file from the detail page (see the sketch below)
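# A rough sketch of that flow, assuming Chrome; the CSS selector and link text below are guesses that must be adjusted to the actual page markup:
# In[ ]:
from selenium import webdriver

opts = webdriver.ChromeOptions()
opts.add_experimental_option("prefs", {"download.default_directory": "/tmp/riskinfo"})
driver = webdriver.Chrome(options=opts)
driver.get("http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do")
driver.find_elements_by_css_selector("table tbody tr a")[0].click()  # open the first post (selector assumed)
driver.find_element_by_partial_link_text("xls").click()              # download the attachment (link text assumed)
driver.quit()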
# In[ ]:
from datetime import datetime
temp = datetime.today()
temp.strftime('%Y%m%d')
# In[ ]:
datetime.strftime(temp, '%Y%m%d')
# ## **2 Aladin used books**
# Data collection via API crawling
# In[ ]:
query = {
    "KeyWord":"파이썬",   # "%C6%C4%C0%CC%BD%E3"
    "ViewType":"Detail",
    "SortOrder":5,        # 5: by release date, 11: by registration date
    "ViewRowCount":50,
    "page":2,
}
# In[ ]:
url = "https://www.aladin.co.kr/search/wsearchresult.aspx?SearchTarget=UsedStore"
from urllib import parse, request
params = parse.urlencode(query).encode('cp949') # Params 인코딩 변환
resp = request.Request(url, params) # url 주소값 생성
resp = request.urlopen(resp).read() # url 을 활용한 response 수집
resp = parse.quote_plus(resp) # response 의 byte 를 string 변환
resp = parse.unquote_plus(resp, encoding='cp949') # string 인코딩 변환 (default='utf-8')
# In[ ]:
# In[ ]:
with open('book.html','w') as f:
    f.write(resp)
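# A rough sketch for pulling book titles out of the saved page; the `bo3` class on Aladin title links is an assumption to verify against the real markup:
# In[ ]:
import re
titles = re.findall(r'<a[^>]*class="bo3"[^>]*>(.*?)</a>', resp, re.S)  # title links (class name assumed)
[re.sub(r'<[^>]+>', '', t) for t in titles][:5]                        # strip inner tags such as <b>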
# In[ ]:
# decoding (default utf-8)
parse.unquote_plus('%C4%F6%C6%AE', encoding='cp949')
# In[ ]:
# encodings
parse.quote_plus("퀀트", encoding='cp949')
# In[ ]:
#