#!/usr/bin/env python
# coding: utf-8
"""Example: scrape the title, body text and comments of a Daum news article.

Exported from a Jupyter notebook.  The three ``get_daum_news_*`` functions
are the reusable part; ``main()`` replays the notebook's demo against one
sample article and only runs when the file is executed as a script.
"""

import requests
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP request.  Without a timeout,
# ``requests`` blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Article used by the notebook's demo cells.
SAMPLE_NEWS_ID = '20200110071011691'

ARTICLE_URL_TEMPLATE = 'https://news.v.daum.net/v/{}'

# Number of comments requested per page.  It is used both for the ``limit``
# query parameter and for the offset step so the two cannot drift apart.
COMMENTS_PAGE_SIZE = 10
COMMENTS_URL_TEMPLATE = (
    'https://comment.daum.net/apis/v1/posts/@{}/comments'
    '?parentId=0&offset={}&limit={}&sort=RECOMMEND&isInitial=false'
)

# Sample comment-API URL from the notebook's exploratory cell (kept as data).
url = 'https://comment.daum.net/apis/v1/posts/@20200110071011691/comments?parentId=0&offset=3&limit=10&sort=RECOMMEND&isInitial=false'

# Request headers sent to the comment API; get_daum_news_comments() uses this
# single definition.
# NOTE(review): 'Authorization' and 'Cookie' are hard-coded credentials that
# look like they were copied from a browser session.  The bearer token's
# payload carries an ``exp`` claim, so it presumably stops working once it
# expires -- confirm, and consider loading these from configuration instead.
# NOTE(review): 'br' in Accept-Encoding is only decodable by requests when a
# brotli package is installed -- confirm for the target environment.
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmb3J1bV9rZXkiOiJuZXdzIiwiZ3JhbnRfdHlwZSI6ImFsZXhfY3JlZGVudGlhbHMiLCJzY29wZSI6W10sImV4cCI6MTU3ODkzMTQzNSwiYXV0aG9yaXRpZXMiOlsiUk9MRV9DTElFTlQiXSwianRpIjoiMWJjZjJiZTEtYTdkZS00MzZkLWJkMzUtYjFmOGE2YmVjOGQzIiwiZm9ydW1faWQiOi05OSwiY2xpZW50X2lkIjoiMjZCWEF2S255NVdGNVowOWxyNWs3N1k4In0.B-tRCNqj_SZIlSf8v2nWm0wxi0MIZtCfAEaeSWMhpo4',
    'Connection': 'keep-alive',
    'Cookie': 'webid=5c722b7f04af4ba6964dc8f673793fb9; webid_sync=1578888236313; TIARA=zAZ2uRST4BVty9VU6zSypPaYtjcSkCzD7Q4YkeXvlT4rkBsSB6fa_2u_SZHtC4iplHAZaydMHCsc3xEsuYk8MG5MJRW114wB',
    'Host': 'comment.daum.net',
    'Origin': 'https://news.v.daum.net',
    'Referer': 'https://news.v.daum.net/v/20200110071011691',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}


def _fetch_article_soup(news_id):
    """Download the article page for *news_id* and parse it with lxml."""
    resp = requests.get(
        ARTICLE_URL_TEMPLATE.format(news_id), timeout=REQUEST_TIMEOUT
    )
    return BeautifulSoup(resp.text, 'lxml')


# News title

def get_daum_news_title(news_id):
    """Return the article's title text.

    Args:
        news_id: Article identifier, interpolated into the article URL
            (int or str).

    Returns:
        The text of the first ``h3.tit_view`` element, or ``''`` when the
        page contains no such element.
    """
    title_tag = _fetch_article_soup(news_id).select_one('h3.tit_view')
    if title_tag is None:
        return ''
    return title_tag.get_text()


# News body crawling

def get_daum_news_content(news_id):
    """Return the article's body text.

    Args:
        news_id: Article identifier, interpolated into the article URL
            (int or str).

    Returns:
        The text of every ``p`` element under ``div#harmonyContainer``,
        concatenated without a separator; ``''`` when none match.
    """
    soup = _fetch_article_soup(news_id)
    return ''.join(
        p.get_text() for p in soup.select('div#harmonyContainer p')
    )


# News comment crawling

def get_daum_news_comments(news_id):
    """Return every top-level comment of an article from the comment API.

    Pages through the API ``COMMENTS_PAGE_SIZE`` comments at a time,
    starting at offset 0, until a page comes back empty.

    Args:
        news_id: Article identifier, interpolated into the API URL
            (int or str).

    Returns:
        A list with the decoded JSON items of all pages, in the order the
        API returned them.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example when the hard-coded credentials are rejected).
        ValueError: If a successful response is not a JSON list.
    """
    offset = 0
    comments = []
    while True:
        page_url = COMMENTS_URL_TEMPLATE.format(
            news_id, offset, COMMENTS_PAGE_SIZE
        )
        resp = requests.get(
            page_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        # An error body is usually a non-empty JSON object.  Without this
        # check it would be treated as a page of comments and the loop
        # would never see an empty page, i.e. never terminate.
        resp.raise_for_status()
        data = resp.json()
        if not data:
            break
        if not isinstance(data, list):
            raise ValueError(
                'unexpected comment API payload at offset {}: {!r}'.format(
                    offset, type(data).__name__
                )
            )
        comments.extend(data)
        offset += COMMENTS_PAGE_SIZE
    return comments


def main():
    """Replay the notebook demo for the sample article."""
    # The notebook displayed these two results as cell output; when the
    # export runs as a script they are discarded.  The calls are kept as a
    # smoke test of both functions.
    get_daum_news_title(SAMPLE_NEWS_ID)
    get_daum_news_content(SAMPLE_NEWS_ID)

    # Fetch the comments once (the notebook fetched them twice: once to
    # count them, once to print them).
    for comment in get_daum_news_comments(SAMPLE_NEWS_ID):
        print(comment['content'])
        print('\n')


if __name__ == '__main__':
    main()