# Bs4 01
# url = 'http://example.webscraping.com/view/United-Kingdom-239'
# html = download(url)
# soup.find( text = re.compile("sisters") )        # first node whose text contains "sisters"
# soup.find( attrs = {'id' : 'places_area__row'} ) # match by id attribute
# soup.find( attrs = {'class' : 'w2p_fw'} )        # match by CSS class
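# A minimal, self-contained sketch of the find() calls above, run against an
# invented HTML snippet instead of the downloaded country page (newer bs4
# versions prefer the string= keyword over text=, but both work here):
import re
from bs4 import BeautifulSoup

sample = '''<table><tr id="places_area__row">
<td class="w2p_fl">Area: </td><td class="w2p_fw">244,820 square kilometres</td>
</tr></table><p>three little sisters</p>'''
soup = BeautifulSoup(sample, 'html.parser')
soup.find(text=re.compile("sisters"))          # -> 'three little sisters'
soup.find(attrs={'id': 'places_area__row'})    # -> the <tr> element
soup.find(attrs={'class': 'w2p_fw'}).text      # -> '244,820 square kilometres'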
# Bs4 02
# soup.find_all( "p") # <tag>
# soup.find_all( "p", "title" ) # CSS클래스 title 인 <p> tag의 값을 추출
# soup.find_all( id = "link2" ) # id
# soup.find_all( id = True) # id 속성을 포함시
# soup.find_all( href = re.compile("elsie") ) # elsie URL 링크 포함시
# soup.find_all( href = re.compile("elsie"), id='link1')
# Regular Expression 01
# print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[0])
# print(re.findall('<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', html))
# print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">\
# Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
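# The same patterns run against an inline copy of the target row (markup
# modeled on the example.webscraping.com country page; the area value is
# illustrative):
import re

row = ('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" '
       'id="places_area__label">Area: </label></td>'
       '<td class="w2p_fw">244,820 square kilometres</td></tr>')
re.findall('<td class="w2p_fw">(.*?)</td>', row)[0]   # -> '244,820 square kilometres'
re.findall(r'<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', row)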
# Regular Expression 02
# url = 'http://example.webscraping.com/view/United-Kingdom-239'
# html = download(url)
# re.findall("<td class='w2p_fw'>(.*?)</td>", html)
# Regular Expression (fastest)
# results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
# lxml (middle)
# tree = lxml.html.fromstring(html)
# results[field] = tree.cssselect('table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
# BS4 (slowest)
# soup = BeautifulSoup(html, 'html.parser')
# results[field] = soup.find('table').find('tr',id='places_%s__row' % field).find('td', class_='w2p_fw').text
# Strategy: prototype the selectors with bs4, then finish with regular expressions
# (less convenient, but fastest, and uses only Python's built-in re module)
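# A rough timing sketch of the three approaches (the downloaded country page
# is replaced by an inline row, the 1,000-iteration count is arbitrary, and
# lxml's cssselect() needs the separate cssselect package installed):
import re
import time
import lxml.html
from bs4 import BeautifulSoup

page_html = ('<table><tr id="places_area__row"><td class="w2p_fl">Area: </td>'
             '<td class="w2p_fw">244,820 square kilometres</td></tr></table>')

def regex_scrape(page):
    return re.search('<tr id="places_area__row">.*?<td class="w2p_fw">(.*?)</td>',
                     page).groups()[0]

def lxml_scrape(page):
    tree = lxml.html.fromstring(page)
    return tree.cssselect('tr#places_area__row > td.w2p_fw')[0].text_content()

def bs4_scrape(page):
    soup = BeautifulSoup(page, 'html.parser')
    return soup.find('tr', id='places_area__row').find('td', class_='w2p_fw').text

for scraper in (regex_scrape, lxml_scrape, bs4_scrape):
    start = time.time()
    for _ in range(1000):
        scraper(page_html)
    print(scraper.__name__, round(time.time() - start, 3))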
# find_population_lxml.py
import re
from lxml import html
broken_html = '<ul class=country><li>Area<li>Population</ul>'   # unclosed <li> tags, unquoted attribute
tree = html.fromstring(broken_html)   # parse the broken HTML; lxml repairs it
fixed_html = html.tostring(tree, pretty_print=True); fixed_html.decode('utf-8')
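# Decoding and printing shows lxml has closed the dangling <li> tags and
# quoted the class attribute (expected output of lxml's HTML parser):
print(fixed_html.decode('utf-8'))
# <ul class="country">
# <li>Area</li>
# <li>Population</li>
# </ul>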
from lxml import html
import requests
url = 'http://media.daum.net/ranking/empathy'
page = requests.get(url)
tree = html.fromstring(page.content)
page.text[:300]   # peek at the first 300 characters of the fetched HTML
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/div/span/text()'
hotel_name = tree.xpath(xpath)
temp = re.sub("\n","",hotel_name[0])
temp = re.sub(" ","",temp); temp
# .list_news2 > li:nth-child(1) > div:nth-child(3) > strong:nth-child(1) > a:nth-child(1)
# //*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/strong/a
# //*[@id="mArticle"]/div[2]/ul[2]/li[2]/div[2]/strong/a
n = 10                               # index of the list item to extract
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[' + str(n) + ']/div[2]/strong/a/text()'
texts = tree.xpath(xpath)            # headline text of the n-th item
temp = re.sub("\n", "", texts[0])
temp = re.sub(" ", "", temp); temp
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li['+str(n)+']/div[2]/div/span/text()'
hotel_name = tree.xpath(xpath)
temp = re.sub("\n","",hotel_name[0])
temp = re.sub(" ","",temp); temp
from lxml import etree
from io import StringIO
from csv import DictWriter
f = StringIO('''
<html><body>
<a class="ui-magnifier-glass"
href="here goes the link that i want to extract" data-spm-anchor-id="0.0.0.0"
style="width: 258px; height: 258px; position: absolute; left: -1px; top: -1px; display: none;"></a>
<a href="link to extract" title="title to extract" rel="category tag" data-spm-anchor-id="0.0.0.0">
or maybe this word instead of title</a>
</body></html>''')
doc = etree.parse(f); data = []      # parse the snippet, prepare the row list
r = doc.xpath('//a[@data-spm-anchor-id="0.0.0.0"]')   # every matching <a>
for elem in r:
    link = elem.get('href')          # attribute lookups return None when missing
    title = elem.get('title')
    text = elem.text
    data.append({'link': link, 'title': title, 'text': text})
with open('file.csv', 'w', newline='') as csvfile:   # newline='' avoids blank rows on Windows
    fieldnames = ['link', 'title', 'text']
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in data:
        writer.writerow(row)
import pandas as pd
pd.read_csv('./file.csv')            # read the CSV back to verify the rows