#!/usr/bin/env python
# coding: utf-8

# # **Weather API collection**
# Local forecast (2 days), mid-term forecast (3+ days), fine dust
# 1. **.decode()** : bytes to str
# 1. https://data.kma.go.kr/api/selectApiDetail.do
# 1. http://www.weather.go.kr/weather/lifenindustry/sevice_rss.jsp
# 1. http://www.airkorea.or.kr/web

# In[14]:

import re, csv, xmltodict
from urllib import request, parse

id_area = "109"     # Seoul / Gyeonggi region code
url_rss = "http://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp?stnId="

# Open and read the HTML / XML response
# (the raw text is kept in xml_text so the regex cells below can reuse it)
# xml = parse.unquote_plus(
#     parse.quote_plus(
#         request.urlopen(url_rss).read()))
xml_text = request.urlopen(url_rss + id_area).read().decode('utf8')
xml = xmltodict.parse(xml_text)
xml.keys()

# In[15]:

xml['rss']['channel']['item']['description']['header']  # ['body'] #.keys()

# In[6]:

xml['rss']['channel']['item']['description']['body']['location'][1]['data'][:2]

# In[43]:

xml['rss']['channel'].keys()  # ['item']['description']['body']['location'][1]['data'][:2]

# In[48]:

xml['rss']['channel']['item']['description']

# In[8]:

# Grab forecast titles and links using regex
# (applied to the raw text, not to the parsed OrderedDict)
xmlTitle = re.compile("<title>(.*)</title>")
xmlLink  = re.compile("<link>(.*)</link>")

# Find and store the data
findTitle = re.findall(xmlTitle, xml_text)
findLink  = re.findall(xmlLink, xml_text)

# In[9]:

findTitle

# In[8]:

# Open the CSV file and write the header row
# (text mode with newline="" is what csv.writer expects in Python 3)
with open("pytest.csv", "w", newline="") as fp:
    writer = csv.writer(fp)
    writer.writerow(("Title", "URL"))
    # Write the matched title / link pairs row by row,
    # skipping the channel-level title / link at index 0
    for title, link in list(zip(findTitle, findLink))[1:]:
        writer.writerow([title, link])
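# A minimal sketch (not part of the original notebook): flatten the parsed
# mid-term RSS into a DataFrame.  The rss/channel/item/description/body/location
# layout is taken from the cells above; the field names inside each data record
# (e.g. tmEf, wf, tmn, tmx) and the per-location "city" key are assumptions
# about the KMA feed and may need adjusting.

# In[ ]:

import pandas as pd

rows = []
for loc in xml['rss']['channel']['item']['description']['body']['location']:
    city = loc.get('city')                 # assumed key for the location name
    for rec in loc['data']:
        rec = dict(rec)                    # OrderedDict -> plain dict
        rec['city'] = city
        rows.append(rec)

forecast = pd.DataFrame(rows)
forecast.head(3)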
# # **Seoul Agro-Fisheries & Food Corporation API**
# 1. https://www.garakprice.com/index.php?go_date=20191001 Garak market produce prices (web page)
# 1. http://www.garak.co.kr/gongsa/jsp/gs/intro/common.jsp JSP endpoint
#
# **[Seoul Agro-Fisheries Corporation OpenAPI](https://www.garak.co.kr/publicdata/selectPageListPublicData.do?sch_public_data_realm_code=1)**
#
# ```html
# http://www.garak.co.kr/gongsa/jsp/gs/data_open/data.jsp?id=2087&passwd=1004&dataid=data4&pagesize=10
# &pageidx=1&portal.templet=false&p_ymd=20190408&p_jymd=20190408&d_cd=2&p_jjymd=20130429
# &p_pos_gubun=1&pum_nm=
# ```

# ## **1 Collection through the XML API**
# Collect the price information via the **XML API**

# In[10]:

# Known issue:
#   while paging through the API, a page that contains a single record comes
#   back from xmltodict as one dict instead of a list, so only one row survives.
# Workaround (https://rfriend.tistory.com/482):
#   when converting a single dict to a DataFrame, wrap its values in a list so
#   that pandas can infer the columns, data and index automatically.

# In[2]:

date_info = '20191209'
import pandas as pd
from momukji import Garak

xml_d_1  = Garak().getData(cd=1)    # date_info,
xml_d_2  = Garak().getData(cd=2)    # date_info,
xml_data = pd.concat([xml_d_1, xml_d_2]).reset_index(drop=True)
xml_data.head(3)
# xml_data.to_excel("data/food_Garak.xls", index=None)

# ## **2 Collection from the web page**
# https://www.garakprice.com/index.php?go_date=20191021
# 1. However, the page lists too few items, so there is little practical benefit

# In[13]:

# https://www.garakprice.com/index.php?go_date=20191021
from urllib import parse, request

def urllib_request(url, params, encode='utf-8'):
    params = parse.urlencode(params).encode(encode)
    req    = request.Request(url, params)       # POST request with the encoded params
    resp   = request.urlopen(req).read()
    resp   = parse.quote_plus(resp)             # bytes -> percent-encoded str
    return parse.unquote_plus(resp)             # decode back to plain text

# params = { "go_date":20191021 }
# url = "https://www.garakprice.com/index.php"
# urllib_request(url, params)[:500]
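# A minimal sketch of the single-record workaround noted above (assumption:
# records arrive as xmltodict output, where one row is a dict and several rows
# are a list of dicts; the field names here are only illustrative).
# Normalising to a list before building the DataFrame keeps one-row pages from
# collapsing.

# In[ ]:

import pandas as pd

def to_frame(records):
    if isinstance(records, dict):      # a single record -> wrap it in a list
        records = [records]
    return pd.DataFrame(records)

to_frame({"pum_nm": "사과", "price": 1000})               # one row
to_frame([{"pum_nm": "사과", "price": 1000},
          {"pum_nm": "배",   "price": 2000}])             # two rows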
# # **Other manufactured-goods price information**
# **[Data.go.kr manufactured-goods price data](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**

# ## **1 Collecting product ID information**
# **[Data.go.kr manufactured-goods price data](https://www.data.go.kr/dataset/3043385/openapi.do?lang=ko)**
# 1. **[한국소비자원 참가격](https://www.price.go.kr/tprice/portal/pricenewsandtpriceintro/iteminfo/getItemList.do)**
# 1. **[유통상품지식뱅크](http://35.200.32.201/)**
# 1. Pin down a clear classification scheme

# In[14]:

from momukji import Product
item_list = Product().getList()
item_list.head(3)

# In[15]:

url = "http://ksetup.com/bbs/page.php?hid=code"

import re
import pandas as pd
from urllib import request, parse

resp  = request.urlopen(url).read()
resp  = parse.quote_plus(resp)          # bytes -> percent-encoded str
resp  = parse.unquote_plus(resp)        # decode back to plain text
# pull the first <table> ... </table> block out of the page and parse it
table = re.findall(r'<table(.*?)</table>', resp, re.M | re.I | re.S)
table = "<table" + table[0] + "</table>"
table = pd.read_html(table)[0]
# table.to_excel('company_code.xls', index=None, columns=None)
table.head(3)

# ## **2 Product information from online shopping malls**
# 1. Naver Shopping / hot deals
# 1. Revisit and consolidate the notes on Daum Shopping and similar sites
# 1. Collect and organise food-related data from the Distribution Information Center
# # **School meal menus**
# **[NEIS meal-menu lookup page](https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=10)**
# ```
# https://stu.gen.go.kr/sts_sci_md00_001.do?schulCode=F100000120&schulCrseScCode=4&schulKndScCode=04&ay=2019&mm=12
#
# https://stu.gen.go.kr/sts_sci_md00_001.do?
#     schulCode=F100000120
#     &schulCrseScCode=4
#     &schulKndScCode=04
#     &ay=2019
#     &mm=10
# ```

# In[3]:

"{:02d}".format(2)

# In[1]:

query = {
    "schulCode"       : "F100000120",
    "schulCrseScCode" : 4,
    "schulKndScCode"  : "04",
    "ay" : 2019,    # year
    "mm" : 10       # month
}

from urllib import parse, request

url  = "https://stu.gen.go.kr/sts_sci_md00_001.do?" + parse.urlencode(query)
resp = request.urlopen(url).read()
resp = parse.quote_plus(resp)       # bytes -> percent-encoded str
resp = parse.unquote_plus(resp)     # decode back to plain text
resp[:200]

# In[ ]:

import re
# e.g. pull the inner text of the <div> cells (the menus live in td > div,
# see the CSS selector used by MenuParser below)
re.findall(r"<div[^>]*>(.*?)</div>", resp, re.S)[:5]

# In[ ]:

from menu_parser import MenuParser
from school import School

school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
parser = MenuParser(school)
menus  = parser.get_menu()
print(menus.today)

# In[ ]:

class School:

    class Region:
        BUSAN     = "stu.pen.go.kr"
        CHUNGBUK  = "stu.cbe.go.kr"
        CHUNGNAM  = "stu.cne.go.kr"
        DAEJEON   = "stu.dje.go.kr"
        DAEGU     = "stu.dge.go.kr"
        GWANGJU   = "stu.gen.go.kr"
        GYEONGBUK = "stu.gbe.go.kr"
        GYEONGGI  = "stu.goe.go.kr"
        GYEONGNAM = "stu.gne.go.kr"
        INCHEON   = "stu.ice.go.kr"
        JEJU      = "stu.jje.go.kr"
        JEONBUK   = "stu.jbe.go.kr"
        JEONNAM   = "stu.jne.go.kr"
        KANGWON   = "stu.kwe.go.kr"
        SEJONG    = "stu.sje.go.kr"
        SEOUL     = "stu.sen.go.kr"
        ULSAN     = "stu.use.go.kr"

    class Type:
        KINDERGARTEN = 1
        ELEMENTARY   = 2
        MIDDLE       = 3
        HIGH         = 4

    def __init__(self, school_region, school_type, school_code):
        self.region = school_region
        self.type   = school_type
        self.code   = school_code

school = School(School.Region.GWANGJU, School.Type.HIGH, "F100000120")
school

# In[ ]:

import datetime
import logging
import requests
from bs4 import BeautifulSoup
from menu_parser import Menu   # Menu is assumed to live in menu_parser alongside MenuParser

class MenuParser:

    def __init__(self, school):
        self.school = school

    def get_menu(self, year=None, month=None):
        """Fetch the meal menu for this school.
        If both year and month are given, that month is fetched;
        otherwise the current month is used."""
        if year is None or month is None:
            today = datetime.date.today()
            url = self.__create_url(today.year, today.month)
        else:
            url = self.__create_url(year, month)
        page  = self.__get_page(url)
        soup  = BeautifulSoup(page, "html.parser")
        items = soup.select("#contents > div > table > tbody > tr > td > div")
        return Menu(items)

    def __get_page(self, url):
        page = ""
        try:
            page = requests.get(url).text
        except Exception as e:
            logging.error(e)
        return page

    def __create_url(self, year, month):
        today = datetime.date(year, month, 1)
        url  = f"https://{self.school.region}/sts_sci_md00_001.do?"
        url += f"schulCode={self.school.code}&"
        url += f"schulCrseScCode={self.school.type}&"
        url += f"schulKndScCode={self.school.type:02d}&"
        url += f"ay={today.year}&"
        url += f"mm={today.month:02d}"
        print(url)
        return url

# In[ ]:

parser = MenuParser(school)
parser

# In[ ]:

# __create_url shown again on its own, exactly as defined above
def __create_url(self, year, month):
    today = datetime.date(year, month, 1)
    url  = f"https://{self.school.region}/sts_sci_md00_001.do?"
    url += f"schulCode={self.school.code}&"
    url += f"schulCrseScCode={self.school.type}&"
    url += f"schulKndScCode={self.school.type:02d}&"
    url += f"ay={today.year}&"
    url += f"mm={today.month:02d}"
    print(url)
    return url

# In[ ]:

# regional office of education code
school.region

# In[ ]:

# school type
school.type

# In[ ]:

# individual school code
school.code

# In[ ]:

"schulKndScCode={:02d}".format(4)

# ## **Daily collection of overseas food-hazard alerts**
# http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do
# 1. Use Selenium to click the relevant list item, then
# 1. download the linked xls file from the detail page (a hedged sketch follows below)
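# A hedged sketch only, not taken from the riskinfo page itself: it automates
# the two manual steps above (click a list item, download the xls from the
# detail page).  The CSS selectors and the download directory are hypothetical
# placeholders and must be adapted after inspecting the actual page.

# In[ ]:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": "/tmp/riskinfo",      # where Chrome drops the xls
})
driver = webdriver.Chrome(options=options)

driver.get("http://www.foodsafetykorea.go.kr/riskinfo/board-collect-list.do")
driver.find_element(By.CSS_SELECTOR, "table tbody tr td a").click()   # hypothetical: first list item
driver.find_element(By.CSS_SELECTOR, "a.btn-download").click()        # hypothetical: xls download link
driver.quit()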
# In[ ]:

from datetime import datetime

temp = datetime.today()
temp.strftime('%Y%m%d')

# In[ ]:

datetime.strftime(temp, '%Y%m%d')

# ## **2 Aladin used books**
# Data collection by crawling the search API

# In[ ]:

from urllib import parse, request

query = {
    "SearchTarget" : "UsedStore",
    "KeyWord"      : "머신러닝",
    "ViewType"     : "Detail",
    "SortOrder"    : 5,        # 5: by release date, 11: by registration date
    "ViewRowCount" : 50,
    "page"         : 1,
}
base_url = "https://www.aladin.co.kr/search/wsearchresult.aspx?"
url  = base_url + parse.urlencode(query, encoding='euc-kr')
resp = request.urlopen(url).read()                     # fetch the response (bytes)
resp = parse.quote_plus(resp)                          # bytes -> percent-encoded str
resp = parse.unquote_plus(resp, encoding='euc-kr')     # decode the EUC-KR text
# resp = parse.unquote_plus(resp, encoding='cp949')    # (cp949 variant)
resp

# In[ ]:

with open('book.html', 'w') as f:
    f.write(resp)

# In[ ]:

# decoding (defaults to utf-8)
parse.unquote_plus('%C4%F6%C6%AE', encoding='cp949')

# In[ ]:

# encoding
parse.quote_plus("퀀트", encoding='cp949')
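# A minimal sketch of the bytes -> str round trip used above.  Assumption:
# Aladin serves its pages as EUC-KR, so percent-encoding the raw bytes first
# (quote_plus) lets unquote_plus decode them with an explicit encoding.

# In[ ]:

from urllib import parse

raw     = "퀀트".encode("euc-kr")                 # stand-in for urlopen().read()
encoded = parse.quote_plus(raw)                   # bytes -> '%C4%F6%C6%AE'
decoded = parse.unquote_plus(encoded, encoding="euc-kr")
encoded, decoded                                  # ('%C4%F6%C6%AE', '퀀트')

# The same conversion could also be done with raw.decode("euc-kr"); the
# quote/unquote pair is simply the idiom this notebook uses throughout.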
#