ex) http://openapi.foodsafetykorea.go.kr/api/{API_KEY}/I0580/xml/1/20  (replace {API_KEY} with your issued key)
Helper functions for the Ministry of Food and Drug Safety (MFDS) open API: results returned as JSON / DataFrame
import pandas as pd
from tqdm import tqdm
from momukji import FoodSafe
# list the (id, name) pairs of every API endpoint registered in FoodSafe
[(_, FoodSafe().apiKey[_]['name']) for _ in FoodSafe().apiKey.keys()]
%%time
result, foodId = [], 'I0490' # recall-requested products (~400 rows)
data = FoodSafe().getData(foodId, 1, 1000, FoodSafe().apiKey[foodId]['cols'], display=True)
data.to_excel("data/food_recall.xls", index=None)
%%time
result, foodId = [], 'I0750' # food nutrition information
_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)
# for _ in tqdm(range(1, 13824+1, 1000)):
#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))
# pd.concat(result).to_excel('data/food_nutrient.xls', index=None)
%%time
result, foodId = [], 'COOKRCP01' # recipe data
_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)
# for _ in tqdm(range(1, 1500+1, 1000)):
#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))
# pd.concat(result).to_excel('data/food_recipe_info.xls', index=None)
%%time
result, foodId = [], 'C005' # product barcode numbers
# _ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)
# for _ in tqdm(range(1, 100200+1, 1000)):
from collections import OrderedDict
for _ in tqdm(range(1, 100200+1, 1000)):
    result.append(FoodSafe().getData(foodId, _, _+999,
        FoodSafe().apiKey[foodId]['cols']).loc[:,
        list(OrderedDict(FoodSafe().apiKey['C005']['cols']).values())])
pd.concat(result).to_csv('data/food_barcode.csv', index=None)
pd.concat(result).to_excel('data/food_barcode.xls', index=None)
%%time
result, foodId = [], "I2570" # 유통 바코드
_ = FoodSafe().getData(foodId, 1, 2, FoodSafe().apiKey[foodId]['cols'], display=True)
# for _ in tqdm(range(1, 49000+1, 1000)):
#     result.append(FoodSafe().getData(foodId, _, _+999, FoodSafe().apiKey[foodId]['cols']))
# pd.concat(result).to_excel('data/food_barcode_info.xls', index=None)
'<title.*?>(.+?)</title>' # 특정태그
"<[^>]+>|[^<]+" # html 태그 내부의 한글 추출
'<.*?>' # 모든 태그
reg_table = '<table.*?>(.+?)</table>'
reg_table = '<font.*?>(.+?)</font>'
reg_table = '<.*?>'
reg_table = "<table.*?>(.*?)"
reg_table = '<table class="wtable3" .*?>(.*?)</table>'
Collecting food information from Kamis
%%time
from momukji import Kamis
Kamis().getData('2019-10-15', "100").head()
%%time
from momukji import Kamis
Kamis().getData('2019-10-15', "100" ,types=2).head()
from momukji import Kamis
resp = Kamis().getApi('2019-10-21', 500, cd=2)
resp.head(3)
http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?id=2087&passwd=1004&dataid=data4&pagesize=10
&pageidx=1&portal.templet=false&p_ymd=20190408&p_jymd=20190408&d_cd=2&p_jjymd=20130429
&p_pos_gubun=1&pum_nm=
Collecting price information using the XML API
# Problem...
# when looping over pages, a page holding a single object yields a list of length 1
# Solution... (https://rfriend.tistory.com/482)
# when converting a single dict to a DataFrame, wrap each value in a list
# so that the columns, the data and the "index" can be inferred automatically.
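A minimal sketch of that fix (the record dict is a made-up sample):
import pandas as pd
record = {"pum_nm": "사과", "price": 3500}                # one XML item parsed into a dict
# pd.DataFrame(record) raises: "If using all scalar values, you must pass an index"
df = pd.DataFrame({k: [v] for k, v in record.items()})    # wrap each scalar in a list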
date_info = '20191022'
import pandas as pd
from momukji import Garak
xml_d_1 = Garak().getData(cd=1) # date_info,
xml_d_2 = Garak().getData(cd=2) # date_info,
xml_data = pd.concat([xml_d_1, xml_d_2]).reset_index(drop=True)
xml_data.to_excel("data/food_Garak.xls", index=None)
# https://www.garakprice.com/index.php?go_date=20191021
from urllib import parse, request

def urllib_request(url, params, encode='utf-8'):
    params = parse.urlencode(params).encode(encode)   # encode the POST body
    req = request.Request(url, params)
    resp = request.urlopen(req).read()                # fetch the response bytes
    resp = parse.quote_plus(resp)                     # bytes -> percent-encoded str
    return parse.unquote_plus(resp)                   # decode back to readable text

# params = { "go_date": 20191021 }
# url = "https://www.garakprice.com/index.php"
# urllib_request(url, params)[:500]
from momukji import Product
item_list = Product().getList()
item_list.head(3)
url = "http://ksetup.com/bbs/page.php?hid=code"
import re
import pandas as pd
from urllib import request, parse
resp = request.urlopen(url).read()   # raw response bytes
resp = parse.quote_plus(resp)        # bytes -> percent-encoded str
resp = parse.unquote_plus(resp)      # back to readable text
table = re.findall(r'<table class="table table-bordered".*?>(.*?)</table>', resp, re.M|re.I|re.S)
table = "<table>" + table[0] + "</table>"
table = pd.read_html(table)[0]
# table.to_excel('company_code.xls', index=None, columns=None)
table.head(3)
https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=12
https://stu.gen.go.kr/sts_sci_md00_001.do?
schulCode=F100000120
&schulCrseScCode=4
&schulKndScCode=04
&ay=2019
&mm=10
"{:02d}".format(2)
query = {
    "schulCode": "F100000120",
    "schulCrseScCode": 4,
    "schulKndScCode": "04",
    "ay": 2019,  # year
    "mm": 10     # month
}
from urllib import parse, request
url = "https://stu.gen.go.kr/sts_sci_md00_001.do?" + parse.urlencode(query)
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
resp[:200]
import re
re.findall(r"/sts_sci_md00_001\.do\?", resp)  # locate the menu endpoint in the fetched page
from menu_parser import MenuParser
from school import School
school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
parser = MenuParser(school)
menus = parser.get_menu()
print(menus.today)
class School:
    class Region:
        BUSAN = "stu.pen.go.kr"
        CHUNGBUK = "stu.cbe.go.kr"
        CHUNGNAM = "stu.cne.go.kr"
        DAEJEON = "stu.dje.go.kr"
        DAEGU = "stu.dge.go.kr"
        GWANGJU = "stu.gen.go.kr"
        GYEONGBUK = "stu.gbe.go.kr"
        GYEONGGI = "stu.goe.go.kr"
        GYEONGNAM = "stu.gne.go.kr"
        INCHEON = "stu.ice.go.kr"
        JEJU = "stu.jje.go.kr"
        JEONBUK = "stu.jbe.go.kr"
        JEONNAM = "stu.jne.go.kr"
        KANGWON = "stu.kwe.go.kr"
        SEJONG = "stu.sje.go.kr"
        SEOUL = "stu.sen.go.kr"
        ULSAN = "stu.use.go.kr"

    class Type:
        KINDERGARTEN = 1
        ELEMENTARY = 2
        MIDDLE = 3
        HIGH = 4

    def __init__(self, school_region, school_type, school_code):
        self.region = school_region
        self.type = school_type
        self.code = school_code
school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
school
import datetime
import logging
import requests
from bs4 import BeautifulSoup

class MenuParser:
    def __init__(self, school):
        self.school = school

    def get_menu(self, year=None, month=None):
        """
        Fetch the meal menu for this school.
        If both year and month are given, that month is fetched;
        otherwise the current month is used.
        """
        if year is None or month is None:
            today = datetime.date.today()
            url = self.__create_url(today.year, today.month)
        else:
            url = self.__create_url(year, month)
        page = self.__get_page(url)
        soup = BeautifulSoup(page, "html.parser")
        items = soup.select("#contents > div > table > tbody > tr > td > div")
        return Menu(items)  # Menu is provided by the menu_parser package above

    def __get_page(self, url):
        page = None  # avoid an unbound name if the request fails
        try:
            page = requests.get(url).text
        except Exception as e:
            logging.error(e)
        return page

    def __create_url(self, year, month):
        today = datetime.date(year, month, 1)
        url = f"https://{self.school.region}/sts_sci_md00_001.do?"
        url += f"schulCode={self.school.code}&"
        url += f"schulCrseScCode={self.school.type}&"
        url += f"schulKndScCode={self.school.type:02d}&"
        url += f"ay={today.year}&"
        url += f"mm={today.month:02d}"
        print(url)
        return url
parser = MenuParser(school)
parser
# education office domain
school.region
# school type
school.type
# individual school code
school.code
"schulKndScCode={:02d}".format(4)
http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do
from datetime import datetime

temp = datetime.today()
temp.strftime('%Y%m%d')             # instance method
datetime.strftime(temp, '%Y%m%d')   # equivalent call through the class
Data collection using API crawling
query = {
    "KeyWord": "파이썬",   # "%C6%C4%C0%CC%BD%E3"
    "ViewType": "Detail",
    "SortOrder": 5,        # 5: by release date, 11: by registration date
    "ViewRowCount": 50,
    "page": 2,
}
url = "https://www.aladin.co.kr/search/wsearchresult.aspx?SearchTarget=UsedStore"
from urllib import parse, request
params = parse.urlencode(query).encode('cp949')     # encode the query params
resp = request.Request(url, params)                 # build the request object
resp = request.urlopen(resp).read()                 # fetch the response bytes
resp = parse.quote_plus(resp)                       # bytes -> percent-encoded str
resp = parse.unquote_plus(resp, encoding='cp949')   # decode as cp949 (default 'utf-8')

with open('book.html', 'w') as f:
    f.write(resp)
# decoding (default utf-8)
parse.unquote_plus('%C4%F6%C6%AE', encoding='cp949')
# encoding
parse.quote_plus("퀀트", encoding='cp949')
# <div class="Search3_Pager">
# <table
with open("data/food_recipie.json", "r", encoding='utf-8-sig') as f:
recipe = f.read()
import json
food_data = json.loads(recipe)
foods = [_ for _ in food_data.keys()]
food_data[foods[0]]
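From here the parsed recipes can be tabulated; a hedged sketch, assuming each top-level key maps to a flat dict of fields:
import pandas as pd
recipe_df = pd.DataFrame.from_dict(food_data, orient="index")  # rows keyed by recipe name
recipe_df.head(3)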
import codecs
# strip the UTF-8 BOM from a raw response body; `r` is assumed to be a requests.Response
decoded_data = codecs.decode(r.content, 'utf-8-sig')