#!/usr/bin/env python
# coding: utf-8
# # **Weather Information API Collection**
# Local forecast (2 days), mid-term forecast (3+ days), fine dust (air quality)
# 1. **.decode()** : bytes to string (see the short example below)
# 1. https://data.kma.go.kr/api/selectApiDetail.do
# 1. http://www.weather.go.kr/weather/lifenindustry/sevice_rss.jsp
# 1. http://www.airkorea.or.kr/web
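# A one-cell illustration of the `.decode()` step listed above (standard library only, no extra assumptions):
# In[ ]:
raw = "안녕하세요".encode("utf-8")   # str -> bytes
raw.decode("utf-8")                   # bytes -> str: '안녕하세요'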
# In[14]:
import re, csv, xmltodict
from urllib import request, parse
id_area = "109" # 서울/경기지역 코드
url_rss = "http://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp?stnId="
# Open and read the HTML / XML
# xml = parse.unquote_plus(
# parse.quote_plus(
# request.urlopen(url_rss).read()))
xml_text = request.urlopen(url_rss+id_area).read().decode('utf8')  # raw XML string (kept for the regex cells below)
xml = xmltodict.parse(xml_text)                                     # parsed into an OrderedDict
xml.keys()
# In[15]:
xml['rss']['channel']['item']['description']['header']#['body'] #.keys()
# In[6]:
xml['rss']['channel']['item']['description']['body']['location'][1]['data'][:2]
# In[43]:
xml['rss']['channel'].keys() #['item']['description']['body']['location'][1]['data'][:2]
# In[48]:
xml['rss']['channel']['item']['description']
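# The parsed structure above (body → location → data) can be flattened into a table.
# A minimal sketch with pandas, assuming `location[1]['data']` holds the forecast entries
# explored in the previous cells:
# In[ ]:
import pandas as pd
# one location's forecast entries -> DataFrame (columns inferred from the dict keys)
loc = xml['rss']['channel']['item']['description']['body']['location'][1]
pd.DataFrame(loc['data']).head(3)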
# In[8]:
# Grab article titles and links using regex
xmlTitle = re.compile("<title>(.*)</title>")
xmlLink = re.compile("<link>(.*)</link>")
# Find and store the data
findTitle = re.findall(xmlTitle, xml_text)  # run the regex on the raw string, not the parsed dict
findLink = re.findall(xmlLink, xml_text)
# In[9]:
findTitle
# In[8]:
# Write the extracted titles and links to the CSV file, row by row
# (the first match is the channel-level title/link, so it is skipped)
with open("pytest.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(("Title", "URL"))
    for title, link in zip(findTitle[1:], findLink[1:]):
        writer.writerow([title, link])
#
#
# # **Seoul Agro-Fisheries & Food Corporation API**
# 1. https://www.garakprice.com/index.php?go_date=20191001 Garak-dong agricultural produce prices (web)
# 1. http://www.garak.co.kr/gongsa/jsp/gs/intro/common.jsp JSP endpoint
#
# **[Seoul Agro-Fisheries & Food Corporation OpenAPI](https://www.garak.co.kr/publicdata/selectPageListPublicData.do?sch_public_data_realm_code=1)**
#
# ```html
# http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?id=2087&passwd=1004&dataid=data4&pagesize=10
# &pageidx=1&portal.templet=false&p_ymd=20190408&p_jymd=20190408&d_cd=2&p_jjymd=20130429
# &p_pos_gubun=1&pum_nm=
# ```
# ## **1 Collection via the API**
# Collect the price information through the **XML API**
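# Before the helper module below, a minimal sketch of the same call made directly with `urllib`:
# the parameters are copied from the sample URL above, while the response handling
# (encoding, xmltodict parsing) is an assumption, so the request line is left commented out.
# In[ ]:
import xmltodict
from urllib import parse, request

garak_params = {
    "id": 2087, "passwd": 1004, "dataid": "data4",
    "pagesize": 10, "pageidx": 1, "portal.templet": "false",
    "p_ymd": "20190408", "p_jymd": "20190408", "d_cd": 2,
    "p_jjymd": "20130429", "p_pos_gubun": 1, "pum_nm": "",
}
garak_url = "http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?" + parse.urlencode(garak_params)
# xml_doc = xmltodict.parse(request.urlopen(garak_url).read().decode("utf-8"))  # encoding assumed
garak_url[:80]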
# In[10]:
# Issue...
# While looping over pages, a page that contains only one object yields a single dict
# instead of a list of records.
# Workaround... (https://rfriend.tistory.com/482)
# When converting that single dict to a DataFrame, wrap it in a list first
# so that the columns, data and index can be inferred automatically (see the sketch below).
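# A minimal illustration of the workaround above (the column names are made-up examples):
# In[ ]:
import pandas as pd

row = {"pum_nm": "사과", "price": 35000}   # a single record returned as one dict
# pd.DataFrame(row)    # ValueError: "If using all scalar values, you must pass an index"
pd.DataFrame([row])    # wrapping it in a list gives a one-row DataFrame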
# In[2]:
date_info = '20191209'
import pandas as pd
from momukji import Garak
xml_d_1 = Garak().getData(cd=1) # date_info,
xml_d_2 = Garak().getData(cd=2) # date_info,
xml_data = pd.concat([xml_d_1, xml_d_2]).reset_index(drop=True)
xml_data.head(3)
# xml_data.to_excel("data/food_Garak.xls", index=None)
# ## **2 Collection from the web page**
# https://www.garakprice.com/index.php?go_date=20191021
# 1. However, it covers too few items to be of much practical use
# In[13]:
# https://www.garakprice.com/index.php?go_date=20191021
from urllib import parse, request
def urllib_request(url, params, encode='utf-8'):
    # POST the encoded parameters and return the response as a decoded string
    params = parse.urlencode(params).encode(encode)
    req = request.Request(url, params)
    resp = request.urlopen(req).read()
    resp = parse.quote_plus(resp)       # bytes -> percent-encoded str
    return parse.unquote_plus(resp)     # percent-encoded str -> plain str
# params = { "go_date":20191021 }
# url = "https://www.garakprice.com/index.php"
# urllib_request(url, params)[:500]
#
#
# # **Other Manufactured Goods Price Information**
#
# **[Data.go.kr manufactured goods price information](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**
# ## **1 Collecting manufactured goods ID information**
# **[Data.go.kr manufactured goods price information](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**
# 1. **[Korea Consumer Agency price portal (Chamgagyeok)](https://www.price.go.kr/tprice/portal/pricenewsandtpriceintro/iteminfo/getItemList.do)**
# 1. **[Distribution Product Knowledge Bank](http://35.200.32.201/)**
# 1. Pin down the exact classification criteria
# In[14]:
from momukji import Product
item_list = Product().getList()
item_list.head(3)
# In[15]:
url = "http://ksetup.com/bbs/page.php?hid=code"
import re
import pandas as pd
from urllib import request, parse
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
# Extract the <table> markup from the page and load it into a DataFrame
table = re.findall(r'<table[\s\S]*?</table>', resp, re.M|re.I|re.S)
table = "".join(table)
table = pd.read_html(table)[0]
# table.to_excel('company_code.xls', index=None, columns=None)
table.head(3)
# ## **2 Manufactured goods: online shopping-mall information**
# 1. Naver Shopping / hot deals
# 1. Redefine and review the material already organized for Daum Shopping and the like
# 1. Collect and organize food-related data from the distribution information center
#
#
# # **School Meal Menu**
#
# **[NEIS school meal menu lookup site](https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=10)**
# ```
# https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=12
#
# https://stu.gen.go.kr/sts_sci_md00_001.do?
# schulCode=F100000120
# &schulCrseScCode=4
# &schulKndScCode=04
# &ay=2019
# &mm=10
# ```
# In[3]:
"{:02d}".format(2)
# In[1]:
query = {
    "schulCode"      : "F100000120",
    "schulCrseScCode": 4,
    "schulKndScCode" : "04",
    "ay": 2019,   # year
    "mm": 10      # month
}
from urllib import parse, request
url = "https://stu.gen.go.kr/sts_sci_md00_001.do?" + parse.urlencode(query)
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)
resp = parse.unquote_plus(resp)
resp[:200]
# In[ ]:
import re
# Illustrative pattern only: pull the <div> cell contents out of the raw page fetched above
re.findall(r"<div>(.*?)</div>", resp, re.S)[:5]
# In[ ]:
from menu_parser import MenuParser
from school import School
school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
parser = MenuParser(school)
menus = parser.get_menu()
print(menus.today)
# In[ ]:
class School:

    class Region:
        BUSAN     = "stu.pen.go.kr"
        CHUNGBUK  = "stu.cbe.go.kr"
        CHUNGNAM  = "stu.cne.go.kr"
        DAEJEON   = "stu.dje.go.kr"
        DAEGU     = "stu.dge.go.kr"
        GWANGJU   = "stu.gen.go.kr"
        GYEONGBUK = "stu.gbe.go.kr"
        GYEONGGI  = "stu.goe.go.kr"
        GYEONGNAM = "stu.gne.go.kr"
        INCHEON   = "stu.ice.go.kr"
        JEJU      = "stu.jje.go.kr"
        JEONBUK   = "stu.jbe.go.kr"
        JEONNAM   = "stu.jne.go.kr"
        KANGWON   = "stu.kwe.go.kr"
        SEJONG    = "stu.sje.go.kr"
        SEOUL     = "stu.sen.go.kr"
        ULSAN     = "stu.use.go.kr"

    class Type:
        KINDERGARTEN = 1
        ELEMENTARY   = 2
        MIDDLE       = 3
        HIGH         = 4

    def __init__(self, school_region, school_type, school_code):
        self.region = school_region
        self.type   = school_type
        self.code   = school_code
school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
school
# In[ ]:
import datetime
import logging
import requests
from bs4 import BeautifulSoup
# `Menu` is assumed to be provided by the menu_parser module imported earlier

class MenuParser:

    def __init__(self, school):
        self.school = school

    def get_menu(self, year=None, month=None):
        """
        Fetch the meal menu for the given school.
        If both year and month are given, that month is fetched;
        otherwise the current month is used.
        """
        if year is None or month is None:
            today = datetime.date.today()
            url = self.__create_url(today.year, today.month)
        else:
            url = self.__create_url(year, month)
        page = self.__get_page(url)
        soup = BeautifulSoup(page, "html.parser")
        items = soup.select("#contents > div > table > tbody > tr > td > div")
        return Menu(items)

    def __get_page(self, url):
        page = ""
        try:
            page = requests.get(url).text
        except Exception as e:
            logging.error(e)
        return page

    def __create_url(self, year, month):
        today = datetime.date(year, month, 1)
        url = f"https://{self.school.region}/sts_sci_md00_001.do?"
        url += f"schulCode={self.school.code}&"
        url += f"schulCrseScCode={self.school.type}&"
        url += f"schulKndScCode={self.school.type:02d}&"
        url += f"ay={today.year}&"
        url += f"mm={today.month:02d}"
        print(url)
        return url
# In[ ]:
parser = MenuParser(school)
parser
# In[ ]:
# Education office (regional) code
school.region
# In[ ]:
# School type
school.type
# In[ ]:
# Individual school code
school.code
# In[ ]:
"schulKndScCode={:02d}".format(4)
# ## **Daily collection of overseas hazard information**
# http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do
# 1. Use Selenium to click the relevant entry
# 1. Then download the attached xls file from the detail page (see the sketch below)
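# A minimal Selenium sketch of the two steps above (Selenium 3-style lookups). The CSS selector,
# link text and download directory are assumptions that must be checked against the real page,
# so treat this as a starting point rather than a working scraper.
# In[ ]:
from selenium import webdriver

options = webdriver.ChromeOptions()
# hypothetical download directory; Chrome drops the xls attachment here
options.add_experimental_option("prefs", {"download.default_directory": "/tmp/riskinfo"})
driver = webdriver.Chrome(options=options)
driver.get("http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do")

# hypothetical selector: open the first entry of the board list
driver.find_element_by_css_selector("table tbody tr a").click()
# hypothetical link text: download the attached xls file from the detail page
driver.find_element_by_partial_link_text("xls").click()
driver.quit()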
# In[ ]:
from datetime import datetime
temp = datetime.today()
temp.strftime('%Y%m%d')
# In[ ]:
datetime.strftime(temp, '%Y%m%d')
# ## **2 Aladin used books**
# Data collection by crawling the search API
# In[ ]:
from urllib import parse, request
query = {
    "SearchTarget": "UsedStore",
    "KeyWord"     : "머신러닝",      # search keyword ("machine learning")
    "ViewType"    : "Detail",
    "SortOrder"   : 5,               # 5: by release date, 11: by registration date
    "ViewRowCount": 50,
    "page"        : 1,
}
base_url = "https://www.aladin.co.kr/search/wsearchresult.aspx?"
url = base_url + parse.urlencode(query, encoding='euc-kr')
resp = request.urlopen(url).read()                    # fetch the response from the URL
resp = parse.quote_plus(resp)                         # bytes -> percent-encoded str
resp = parse.unquote_plus(resp, encoding='euc-kr')    # decode back to text using euc-kr
# resp = parse.unquote_plus(resp, encoding='cp949')   # alternative decoding: cp949
resp
# In[ ]:
with open('book.html', 'w') as f:
    f.write(resp)
# In[ ]:
# decoding (default: utf-8)
parse.unquote_plus('%C4%F6%C6%AE', encoding='cp949')
# In[ ]:
# encodings
parse.quote_plus("퀀트", encoding='cp949')
# In[ ]:
#