출처: https://dololak.tistory.com/255 [코끼리를 냉장고에 넣는 방법]
# Base url 추출 및 추가경로 붙이기
from urllib import parse
url_root = "https://www.kamis.or.kr/test/join"
url_base = parse.urljoin(url_root, '/customer/price/retail/item.do?action=priceinfo')
url_base
'https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo'
query = {
'examParam1': 'value1',
'examParam2': ['aaa', 'bbb']
}
parse.urlencode(query, encoding='UTF-8', doseq=True)
'examParam1=value1&examParam2=aaa&examParam2=bbb'
# Param 를 사용하여 Query Url 만들기
query = {
"regday": "2019-10-18",
"itemcode": "111",
"convert_kg_yn": "N",
"itemcategorycode": "100",
}
url_query = parse.urlencode(query, encoding='UTF-8', doseq=True)
url_query
'regday=2019-10-18&itemcode=111&convert_kg_yn=N&itemcategorycode=100'
# Base url 과 Params 연결하기
url = url_base + "&" + url_query
url
'https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo&regday=2019-10-18&itemcode=111&convert_kg_yn=N&itemcategorycode=100'
필요한 부분 추출하기
# 긴 Url 경로를 Parsing 하기
url_parse = parse.urlparse(url)
url_parse
ParseResult(scheme='https', netloc='www.kamis.or.kr', path='/customer/price/retail/item.do', params='', query='action=priceinfo&regday=2019-10-18&itemcode=111&convert_kg_yn=N&itemcategorycode=100', fragment='')
# Root Url 추출하기 (url Net Location)
url_parse.netloc
'www.kamis.or.kr'
# url Add Paths
url_parse.path
'/customer/price/retail/item.do'
# url Query
url_parse.query
'action=priceinfo&regday=2019-10-18&itemcode=111&convert_kg_yn=N&itemcategorycode=100'
Urllib 모듈의 request 사용
# Python 기본 모듈인 urllib 을 사용하여 Robots.txt 내용 확인하기
from urllib import request
url = "https://news.naver.com/robots.txt"
resp = request.urlopen(url)
resp.read() # 통으로 불러오기
b'User-agent: Yeti\nAllow: /main/imagemontage\nDisallow: /\nUser-agent: *\nDisallow: /'
# 필요한 상세페이지 크롤링
url = "https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo&regday=2019-10-18&itemcode=111&convert_kg_yn=N&itemcategorycode=100"
resp = request.urlopen(url)
r_test = resp.readlines()[103:107]
r_test
[b'\t\t\t\t\t\t\t\t\t<a href="/customer/customer_service/login/login.do">\xeb\xa1\x9c\xea\xb7\xb8\xec\x9d\xb8</a>\r\n', b'\t\t\t\t\t\t\t\t</li>\r\n', b'\t\t\t\t\t\t\t\t<li>\r\n', b'\t\t\t\t\t\t\t\t\t<a href="/customer/customer_service/join/customer_join.do">\xed\x9a\x8c\xec\x9b\x90\xea\xb0\x80\xec\x9e\x85</a>\r\n']
from urllib import parse
txt_byte = parse.quote_plus(r_test[3])
type(txt_byte), txt_byte
(str, '%09%09%09%09%09%09%09%09%09%3Ca+href%3D%22%2Fcustomer%2Fcustomer_service%2Fjoin%2Fcustomer_join.do%22%3E%ED%9A%8C%EC%9B%90%EA%B0%80%EC%9E%85%3C%2Fa%3E%0D%0A')
# 변환된 str 중 인코딩 문제를 해결
parse.unquote_plus(txt_byte)
'\t\t\t\t\t\t\t\t\t<a href="/customer/customer_service/join/customer_join.do">회원가입</a>\r\n'
def str_range(c1, c2):
    """Yield the characters from `c1` to `c2`, inclusive.

    Works on any single characters by walking their Unicode code points,
    e.g. str_range('ㄱ', 'ㅎ') yields the compatibility-jamo consonants.
    """
    # ord(c2) + 1 makes the upper bound inclusive.
    for code in range(ord(c1), ord(c2) + 1):
        yield chr(code)
# Compatibility-jamo tables for Hangul syllable decomposition.
# Built directly from Unicode code-point ranges (inclusive on both ends).
str_start = [chr(o) for o in range(ord('ㄱ'), ord('ㅎ') + 1)
             if chr(o) not in {'ㄳ', 'ㄵ', 'ㄶ', 'ㄺ', 'ㄻ', 'ㄼ',
                               'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅄ'}]  # 19 onsets (choseong)
str_mid = [chr(o) for o in range(ord('ㅏ'), ord('ㅣ') + 1)]  # 21 vowels (jungseong)
str_last = [c for c in [" "] + [chr(o) for o in range(ord('ㄱ'), ord('ㅎ') + 1)]
            if c not in {'ㄸ', 'ㅃ', 'ㅉ'}]  # 28 codas (jongseong); ' ' = no coda
def str_korchar(str_text):
    """Decompose each Hangul syllable in *str_text* into its jamo.

    Returns a list with one entry per character of the stripped input:
    a [onset, vowel, coda] triple for Hangul syllables ('가'..'힣'),
    or a single-element list for any other character (e.g. Latin letters).
    Uses the module-level jamo tables str_start / str_mid / str_last.
    """
    decomposed = []
    for ch in str_text.strip():
        if '가' <= ch <= '힣':
            offset = ord(ch) - ord('가')
            # 588 = 21 vowels * 28 codas per onset; 28 codas per vowel.
            onset, remainder = divmod(offset, 588)
            vowel, coda = divmod(remainder, 28)
            decomposed.append([str_start[onset], str_mid[vowel], str_last[coda]])
        else:
            decomposed.append([ch])
    return decomposed
str_korchar("된장찌개 볶음abc")
[['ㄷ', 'ㅚ', 'ㄴ'], ['ㅈ', 'ㅏ', 'ㅇ'], ['ㅉ', 'ㅣ', ' '], ['ㄱ', 'ㅐ', ' '], [' '], ['ㅂ', 'ㅗ', 'ㄲ'], ['ㅇ', 'ㅡ', 'ㅁ'], ['a'], ['b'], ['c']]
list("깍두기")
['깍', '두', '기']
str_korchar("깍두기")
[['ㄲ', 'ㅏ', 'ㄱ'], ['ㄷ', 'ㅜ', ' '], ['ㄱ', 'ㅣ', ' ']]
(ord("깍")-ord("가"))//588
1
앞에서 작업했던 내용을 활용하여 경로명 만들기
# Base url 경로
url = "https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo"
from urllib import parse
parse.urlsplit(url)
SplitResult(scheme='https', netloc='www.kamis.or.kr', path='/customer/price/retail/item.do', query='action=priceinfo', fragment='')
params = {
"regday" : "2019-10-18",
"itemcategorycode" : "100",
"itemcode" : "111",
"convert_kg_yn" : "N",
}
query = '&' + parse.urlencode(params, encoding='UTF-8')
query
'&regday=2019-10-18&itemcategorycode=100&itemcode=111&convert_kg_yn=N'
url + query
'https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo&regday=2019-10-18&itemcategorycode=100&itemcode=111&convert_kg_yn=N'
import pandas as pd
pd.read_html(url + query)[3].head(3)
구분 | 구분.1 | 구분.2 | 당일 10/18 | 1일전10/17 | 2일전10/16 | 3일전10/15 | 4일전10/14 | 7일전10/11 | 1개월전 | 1년전 | 평년 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 평균 | 평균 | 평균 | 51469 | 51475 | 51484 | 51695 | 51695 | 51318 | 51553 | 53253 | 45758 |
1 | 최고값 | 최고값 | 최고값 | 58000 | 58000 | 58000 | 58000 | 58000 | 56000 | 58000 | 62500 | 59600 |
2 | 최저값 | 최저값 | 최저값 | 44900 | 44900 | 44900 | 45900 | 45900 | 45900 | 47800 | 49000 | 33800 |
# Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);
# Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)
# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13
# Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
# Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1
# Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11
# Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25
url = "https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo"
params = {
"regday" : "2019-10-18",
"itemcategorycode" : "100",
"itemcode" : "111",
"convert_kg_yn" : "N",
}
import requests
web = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'
headers = {'User-Agent' : web}
resp = requests.get(url, params=params, headers=headers)
resp.text[2800:3000]
'\t\t\t<li><a href="http://www.foodbiz.or.kr" target="_blank">농수산식품기업지원센터</a></li>\r\n\t\t\t\t\t<li><a href="http://www.at.or.kr" target="_blank">aT메인</a></li>\r\n\t\t\t\t</ul>\r\n\t\t\t</div>\r\n\t\t\t<div class="fl_r">\r\n\t\t\t\t<'
# OTP key 값 추출하기
# KRX 주가 데이터 수집예제
gen_otp_url = 'http://marketdata.krx.co.kr/contents/COM/GenerateOTP.jspx'
gen_otp_data = {
'name':'fileDown',
'filetype':'xls',
'market_gubun':'ALL', #시장구분: ALL=전체
'url':'MKD/04/0404/04040200/mkd04040200_01',
'schdate':'20191018',
'pagePath':'/contents/MKD/04/0404/04040200/MKD04040200.jsp'
}
r = requests.post(gen_otp_url, gen_otp_data)
OTP_code = r.content
OTP_code
b'kR4BVNFag+szu8hB3hhNym4wnhHiECpalBD0QD5gSoY4/Mjy4rk9q1LU0QEBvNx3VL/nX4om315BLTjIX61DApKASSKfgGAp4Jqixd5FS6sKsv5LGpssDJ7Zl00V37JevrpGopSxpJATyEj4YPPUfOCnedA+9U5lnd06bVaLHpljzFThxW7yJKPGJ2/gVaY36o75kXbijHhx476kbDn1h7fJpvPTvq+bUW+PV6jAxw11fjqegl6QlsJCpllml/kOmriWaPY/1wrCgpqPf1MfAO0M5UAH9uwubXKtNjgkp5w='
data = {
"Accept" :"text/html,application/xhtml+xm…plication/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding" : "gzip, deflate, br",
"Accept-Language" : "ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3",
"Connection" : "keep-alive",
"Content-Length" : "76940",
"Content-Type" : "application/x-www-form-urlencoded",
"Host" : "www.kamis.or.kr",
"Referer" : url + query,
"Upgrade-Insecure-Requests" : "1",
"User-Agent" : "Mozilla/5.0 (X11; Ubuntu; Linu…) Gecko/20100101 Firefox/69.0"
}
resp = requests.post("https://www.kamis.or.kr/jsp/common/excel_util.jsp", data=data)
resp.content
b'\r\n\r\n\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<HTML xmlns="http://www.w3.org/1999/xhtml" xml:lang="ko" lang="ko">\r\n<head>\r\n\t<title>\xea\xb0\x80\xea\xb2\xa9\xec\xa0\x95\xeb\xb3\xb4 - \xec\xb9\x9c\xed\x99\x98\xea\xb2\xbd\xeb\x86\x8d\xec\x82\xb0\xeb\xac\xbc - \xed\x92\x88\xeb\xaa\xa9\xeb\xb3\x84 \xec\x97\x91\xec\x85\x80 \xec\xb6\x9c\xeb\xa0\xa5</title>\r\n\t<meta http-equiv="Content-Type" content="application/vnd.ms-excel;charset=utf-8">\r\n\t<style type="text/css">\r\n\t\ttable caption\t{ text-align:left; }\r\n\t\ttable th\t\t{ background-color:#eff2f9; }\r\n\t\ttable td\t\t{ mso-number-format:\\@; text-align:general; vertical-align:middle; text-align:center; white-space:nowrap; mso-rotate:0; mso-background-source:auto; mso-pattern:auto; color:black; font-size:11.0pt; font-style:normal; text-decoration:none; font-family:"\xeb\xa7\x91\xec\x9d\x80 \xea\xb3\xa0\xeb\x94\x95", monospace; mso-font-charset:129; border:1px; mso-protection:locked visible; mso-style-name:\xed\x91\x9c\xec\xa4\x80; mso-style-id:0; }\r\n\t\ttable td.c\t\t{ text-align:center;}\r\n\t\ttable td.l\t\t{ text-align:left; }\r\n\t\ttable td.r\t\t{ text-align:right; }\r\n\t</style>\r\n</head>\r\n<body>\r\n\t<table>\r\n\t\t<tr>\r\n\t\t\t<td\tclass="data">\r\n\t\t\t\t<table id="data" border="1px">\r\n\t\t\t\t\t\r\n\t\t\t\t</table>\r\n\t\t\t</td>\r\n\t\t</tr>\r\n\t</table>\r\n</body>\r\n</html>'
Get 방식으로 기존의 lxml 모듈로 수집하기
url = "https://www.kamis.or.kr/customer/price/retail/item.do?action=priceinfo"
params = {
"regday" : "2015-10-14",
"itemcategorycode" : "100",
"itemcode" : "111",
"convert_kg_yn" : "N",
}
import requests
web = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'
headers = {'User-Agent' : web}
resp = requests.get(url, params=params, headers=headers)
resp
<Response [200]>
from lxml.html import fromstring, tostring
resp_lxml = fromstring(resp.text)
resp_table = resp_lxml.xpath('//table[@id="itemTable_1"]')[0]
import pandas as pd
table = pd.read_html(tostring(resp_table))[0]
table_idx = [_ for _, txt in enumerate(list(table.columns)) if txt.find('당일') != -1]
if len(table_idx) == 1:
table_idx = table_idx[0]
table = table.iloc[:,[0, table_idx]].set_index(
table.columns[0]).loc[["평균","최고값","최저값"]]
table = list(table.to_dict(orient='list').values())[0]
else:
table = None
table
['45042', '50000', '38800']
# 조리 레시피 API 안내 페이지 https://www.foodsafetykorea.go.kr/api/openApiInfo.do
query = {
"menu_no":"849",
"menu_grp":"MENU_GRP31",
"start_idx":"120",
"svc_no":"COOKRCP01",
"svc_type_cd":"API_TYPE06",
"show_cnt":"10",
"Referer":"https://www.foodsafetykorea.go.kr/api/sheetInfo.do",
}
import requests
resp = requests.post("https://www.foodsafetykorea.go.kr/api/openApiInfo.do", data=query)
resp
# with open("food.html", "w") as f:
# f.write(resp.text)
<Response [200]>
"keyId" 부분에 인증키를 입력하고, "serviceId"에는 미리보기 주소에 있는 serviceId 를 사용
ex) http://openapi.foodsafetykorea.go.kr/api/%EC%9D%B8%EC%A6%9D%ED%82%A4/I0580/xml/1/20
# http://openapi.foodsafetykorea.go.kr/api/keyId/serviceId/dataType/startIdx/endIdx
url = "http://openapi.foodsafetykorea.go.kr/api" # sample/I2620/xml/1/5"
keyId = "8acba1823ae742359560"
serviceId = "I0750" # "I0580" # I0750
dataType = "json"
startNum = "1"
endNum = "3"
url = "/".join([url, keyId, serviceId, dataType, startNum, endNum])
%%time
from urllib import request, parse
resp = request.urlopen(url)
resp = parse.quote_plus(resp.read()) # Byte to String
resp = parse.unquote_plus(resp) # Encoding Text
import json
resp = json.loads(resp) # Json 데이터를 Dict 으로 변환
resp
# len(resp['I0750']['row'])
CPU times: user 7.15 ms, sys: 402 µs, total: 7.55 ms Wall time: 22.7 s
{'I0750': {'RESULT': {'MSG': '정상처리되었습니다.', 'CODE': 'INFO-000'}, 'total_count': '13824', 'row': [{'NUTR_CONT3': '10.1', 'NUTR_CONT2': '67.8', 'NUTR_CONT1': '349', 'FOOD_GROUP': '농촌진흥청 식품성분표 DB', 'BGN_YEAR': '2001', 'NUTR_CONT9': 'N/A', 'NUTR_CONT8': 'N/A', 'FOOD_CD': '100101000100000001', 'NUTR_CONT7': 'N/A', 'NUTR_CONT6': 'N/A', 'NUTR_CONT5': 'N/A', 'NUTR_CONT4': '3.7', 'DESC_KOR': '고량미,알곡', 'SERVING_WT': '100', 'FDGRP_NM': '곡류 및 그 제품', 'NUM': '1', 'ANIMAL_PLANT': ''}, {'NUTR_CONT3': '11.4', 'NUTR_CONT2': '73.5', 'NUTR_CONT1': '332', 'FOOD_GROUP': '농촌진흥청 식품성분표 DB', 'BGN_YEAR': '2017', 'NUTR_CONT9': 'N/A', 'NUTR_CONT8': 'N/A', 'FOOD_CD': '100101000200200001', 'NUTR_CONT7': 'N/A', 'NUTR_CONT6': '2', 'NUTR_CONT5': 'N/A', 'NUTR_CONT4': '3.7', 'DESC_KOR': '겉귀리,생것', 'SERVING_WT': '100', 'FDGRP_NM': '곡류 및 그 제품', 'NUM': '2', 'ANIMAL_PLANT': ''}, {'NUTR_CONT3': '14.3', 'NUTR_CONT2': '70.4', 'NUTR_CONT1': '334', 'FOOD_GROUP': '농촌진흥청 식품성분표 DB', 'BGN_YEAR': '2017', 'NUTR_CONT9': 'N/A', 'NUTR_CONT8': 'N/A', 'FOOD_CD': '100101000200300001', 'NUTR_CONT7': 'N/A', 'NUTR_CONT6': '3', 'NUTR_CONT5': 'N/A', 'NUTR_CONT4': '3.8', 'DESC_KOR': '쌀귀리,생것', 'SERVING_WT': '100', 'FDGRP_NM': '곡류 및 그 제품', 'NUM': '3', 'ANIMAL_PLANT': ''}]}}
resp['I0750']['RESULT']['CODE'].lower()#.find("error")
'info-000'
resp['I0750']['RESULT']['CODE'].lower().find("error")
-1
col_to_kor = {
"NUM":"번호",
"FOOD_CD":"식품코드",
"FDGRP_NM":"식품군",
"DESC_KOR":"식품이름",
"SERVING_WT":"1회제공량(g)",
"NUTR_CONT1":"열량(kcal)(1회제공량당)",
"NUTR_CONT2":"탄수화물(g)(1회제공량당)",
"NUTR_CONT3":"단백질(g)(1회제공량당)",
"NUTR_CONT4":"지방(g)(1회제공량당)",
"NUTR_CONT5":"당류(g)(1회제공량당)",
"NUTR_CONT6":"나트륨(mg)(1회제공량당)",
"NUTR_CONT7":"콜레스테롤(mg)(1회제공량당)",
"NUTR_CONT8":"포화지방산(g)(1회제공량당)",
"NUTR_CONT9":"트랜스지방(g)(1회제공량당)",
"ANIMAL_PLANT":"가공업체명",
"BGN_YEAR":"구축년도",
"FOOD_GROUP":"자료원",
}
import pandas as pd
df = pd.DataFrame(resp['I0750']['row'])
df = df.loc[:,list(col_to_kor.keys())] # header 정렬
df.columns = [col_to_kor[_] for _ in list(df.columns)]
# df.to_excel('data/food_nutrient.xls', index=False)
df
번호 | 식품코드 | 식품군 | 식품이름 | 1회제공량(g) | 열량(kcal)(1회제공량당) | 탄수화물(g)(1회제공량당) | 단백질(g)(1회제공량당) | 지방(g)(1회제공량당) | 당류(g)(1회제공량당) | 나트륨(mg)(1회제공량당) | 콜레스테롤(mg)(1회제공량당) | 포화지방산(g)(1회제공량당) | 트랜스지방(g)(1회제공량당) | 가공업체명 | 구축년도 | 자료원 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 100101000100000001 | 곡류 및 그 제품 | 고량미,알곡 | 100 | 349 | 67.8 | 10.1 | 3.7 | N/A | N/A | N/A | N/A | N/A | 2001 | 농촌진흥청 식품성분표 DB | |
1 | 2 | 100101000200200001 | 곡류 및 그 제품 | 겉귀리,생것 | 100 | 332 | 73.5 | 11.4 | 3.7 | N/A | 2 | N/A | N/A | N/A | 2017 | 농촌진흥청 식품성분표 DB | |
2 | 3 | 100101000200300001 | 곡류 및 그 제품 | 쌀귀리,생것 | 100 | 334 | 70.4 | 14.3 | 3.8 | N/A | 3 | N/A | N/A | N/A | 2017 | 농촌진흥청 식품성분표 DB |