# Bs4 01
# url = 'http://example.webscraping.com/view/United-Kingdom-239'
# html = download(url)
# soup.find( text = re.compile("sisters") )        # first node whose text contains "sisters"
# soup.find( attrs = {'id' : 'places_area__row'} ) # match by id attribute
# soup.find( attrs = {'class' : 'w2p_fw'} )        # match by CSS class
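# A minimal, self-contained sketch of the find() calls above, run against an
# invented HTML snippet instead of the downloaded country page (newer bs4
# versions prefer the string= keyword over text=, but both work here):
import re
from bs4 import BeautifulSoup

sample = '''<table><tr id="places_area__row">
<td class="w2p_fl">Area: </td><td class="w2p_fw">244,820 square kilometres</td>
</tr></table><p>three little sisters</p>'''
soup = BeautifulSoup(sample, 'html.parser')
soup.find(text=re.compile("sisters"))          # -> 'three little sisters'
soup.find(attrs={'id': 'places_area__row'})    # -> the <tr> element
soup.find(attrs={'class': 'w2p_fw'}).text      # -> '244,820 square kilometres'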
# Bs4 02
# soup.find_all( "p") # <tag>
# soup.find_all( "p", "title" ) # CSS클래스 title 인 <p> tag의 값을 추출
# soup.find_all( id = "link2" ) # id
# soup.find_all( id = True) # id 속성을 포함시
# soup.find_all( href = re.compile("elsie") ) # elsie URL 링크 포함시
# soup.find_all( href = re.compile("elsie"), id='link1')
# Regular Expression 01
# print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[0])
# print(re.findall('<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', html))
# print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">\
# Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
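# The same patterns run against an inline copy of the target row (markup
# modeled on the example.webscraping.com country page; the area value is
# illustrative):
import re

row = ('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" '
       'id="places_area__label">Area: </label></td>'
       '<td class="w2p_fw">244,820 square kilometres</td></tr>')
re.findall('<td class="w2p_fw">(.*?)</td>', row)[0]   # -> '244,820 square kilometres'
re.findall(r'<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', row)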
# Regular Expression 02
# url = 'http://example.webscraping.com/view/United-Kingdom-239'
# html = download(url)
# re.findall("<td class='w2p_fw'>(.*?)</td>", html)
# Regular Expression (fastest)
# results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
# lxml (middle)
# tree = lxml.html.fromstring(html)
# results[field] = tree.cssselect('table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
# BS4 (slowest)
# soup = BeautifulSoup(html, 'html.parser')
# results[field] = soup.find('table').find('tr',id='places_%s__row' % field).find('td', class_='w2p_fw').text
# Strategy: prototype the selectors with bs4, then finish with regular expressions
# (less convenient, but fastest, and uses only Python's built-in re module)
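# A rough timing sketch of the three approaches (the downloaded country page
# is replaced by an inline row, the 1,000-iteration count is arbitrary, and
# lxml's cssselect() needs the separate cssselect package installed):
import re
import time
import lxml.html
from bs4 import BeautifulSoup

page_html = ('<table><tr id="places_area__row"><td class="w2p_fl">Area: </td>'
             '<td class="w2p_fw">244,820 square kilometres</td></tr></table>')

def regex_scrape(page):
    return re.search('<tr id="places_area__row">.*?<td class="w2p_fw">(.*?)</td>',
                     page).groups()[0]

def lxml_scrape(page):
    tree = lxml.html.fromstring(page)
    return tree.cssselect('tr#places_area__row > td.w2p_fw')[0].text_content()

def bs4_scrape(page):
    soup = BeautifulSoup(page, 'html.parser')
    return soup.find('tr', id='places_area__row').find('td', class_='w2p_fw').text

for scraper in (regex_scrape, lxml_scrape, bs4_scrape):
    start = time.time()
    for _ in range(1000):
        scraper(page_html)
    print(scraper.__name__, round(time.time() - start, 3))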
# find_population_lxml.py
import re
from lxml import html
broken_html = '<ul class=country><li>Area<li>Population</ul>'   # unclosed <li> tags, unquoted attribute
tree = html.fromstring(broken_html)   # parse the broken HTML; lxml repairs it
fixed_html = html.tostring(tree, pretty_print=True); fixed_html.decode('utf-8')
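# Decoding and printing shows lxml has closed the dangling <li> tags and
# quoted the class attribute (expected output of lxml's HTML parser):
print(fixed_html.decode('utf-8'))
# <ul class="country">
# <li>Area</li>
# <li>Population</li>
# </ul>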
from lxml import html
import requests
url = 'http://media.daum.net/ranking/empathy'
page = requests.get(url)
tree = html.fromstring(page.content)
page.text[:300]   # peek at the first 300 characters of the fetched HTML
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/div/span/text()'
hotel_name = tree.xpath(xpath)
temp = re.sub("\n","",hotel_name[0])
temp = re.sub(" ","",temp); temp
# .list_news2 > li:nth-child(1) > div:nth-child(3) > strong:nth-child(1) > a:nth-child(1)
# //*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/strong/a
# //*[@id="mArticle"]/div[2]/ul[2]/li[2]/div[2]/strong/a
n = 10                               # index of the list item to extract
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[' + str(n) + ']/div[2]/strong/a/text()'
texts = tree.xpath(xpath)            # headline text of the n-th item
temp = re.sub("\n", "", texts[0])
temp = re.sub(" ", "", temp); temp
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li['+str(n)+']/div[2]/div/span/text()'
hotel_name = tree.xpath(xpath)
temp = re.sub("\n","",hotel_name[0])
temp = re.sub(" ","",temp); temp
from lxml import etree
from io import StringIO
from csv import DictWriter
f = StringIO('''
<html><body>
<a class="ui-magnifier-glass"
href="here goes the link that i want to extract" data-spm-anchor-id="0.0.0.0"
style="width: 258px; height: 258px; position: absolute; left: -1px; top: -1px; display: none;"></a>
<a href="link to extract" title="title to extract" rel="category tag" data-spm-anchor-id="0.0.0.0">
or maybe this word instead of title</a>
</body></html>''')
doc = etree.parse(f); data = []      # parse the snippet, prepare the row list
r = doc.xpath('//a[@data-spm-anchor-id="0.0.0.0"]')   # every matching <a>
for elem in r:
    link = elem.get('href')          # attribute lookups return None when missing
    title = elem.get('title')
    text = elem.text
    data.append({'link': link, 'title': title, 'text': text})
with open('file.csv', 'w', newline='') as csvfile:   # newline='' avoids blank rows on Windows
    fieldnames = ['link', 'title', 'text']
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in data:
        writer.writerow(row)
import pandas as pd
pd.read_csv('./file.csv')            # read the CSV back to verify the rows