"""Web-scraping tutorial script (written for Python 2).

The sections below run top to bottom when the file is executed or imported:

1. Scrape the body text of a single New York Times article with
   ``requests`` + ``lxml``.
2. Read the NYT "World" RSS feed with ``pattern.web`` and print its first
   entry, once raw and once with the description converted to plain text.
3. Scrape every article in that feed and append the text to
   ``~/tutorialOutput.txt``.
4. Build (and print) the URLs of a numbered range of result pages.
5. Define ``MySpider``, a Scrapy ``CrawlSpider`` that walks the 2010 index
   on state.gov and saves each item page it reaches.

All ``print`` calls take a single pre-formatted argument so they behave the
same under the Python 2 print statement and the Python 3 print function.
"""

import codecs
import os
import re
import this  # kept from the original notebook; importing it prints the Zen of Python

import lxml.html as lh
import pattern.web
import requests
from BeautifulSoup import BeautifulSoup
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item
from scrapy.selector import HtmlXPathSelector

# XPath matching the paragraphs that make up an article body.
ARTICLE_BODY_XPATH = '//p[@itemprop="articleBody"]'


def scrape(url):
    """Download *url* and return the concatenated text of its article body.

    The page is fetched with ``requests`` and parsed with ``lxml.html``; the
    text content of every ``<p itemprop="articleBody">`` element is joined
    with no separator.  Returns an empty string when no such paragraph is
    found.  Network and parse errors from ``requests`` / ``lxml`` propagate
    to the caller.
    """
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    return ''.join(par.text_content() for par in doc.xpath(ARTICLE_BODY_XPATH))


def _output_filename(url):
    """Return a local ``.txt`` filename built from the last path segment of *url*."""
    last_segment = url.rstrip('/').rsplit('/', 1)[-1]
    stem = os.path.splitext(last_segment)[0]
    # Replace anything that is awkward in a filename; fall back to a fixed
    # stem if nothing usable is left.
    stem = re.sub(r'[^A-Za-z0-9._-]', '_', stem) or 'page'
    return stem + '.txt'


# --- 1. Scrape a single article ---------------------------------------------
url = 'http://www.nytimes.com/reuters/2013/01/25/world/americas/25reuters-venezuela-prison.html?partner=rss&emc=rss'
finalText = scrape(url)
print(finalText)

# --- 2. Read an RSS feed ----------------------------------------------------
url = 'http://rss.nytimes.com/services/xml/rss/nyt/World.xml'
results = pattern.web.Newsfeed().search(url, count=5)

if results:  # an empty feed would otherwise raise IndexError on results[0]
    first = results[0]
    # Raw description (may contain HTML markup).
    print('%s \n\n %s \n\n %s \n\n' % (first.url, first.title, first.description))
    # Same entry with the description passed through pattern.web.plaintext().
    print('%s \n\n %s \n\n %s \n\n' % (first.url, first.title, pattern.web.plaintext(first.description)))

# --- 3. Scrape every article in the feed and append it to a file -------------
# codecs.open() does not expand '~' on its own, so resolve the home directory
# explicitly; the with-block closes the file even if a scrape() call raises.
outputPath = os.path.expanduser('~/tutorialOutput.txt')
with codecs.open(outputPath, encoding='utf-8', mode='a') as outputFile:
    for result in results:
        outputFile.write(scrape(result.url))

# --- 4. Build the URLs of a numbered range of result pages -------------------
url = 'http://164.100.47.132/LssNew/psearch/Result13.aspx?dbsl='
for i in range(5175, 5973):
    newUrl = url + str(i)
    print('Scraping: %s' % newUrl)


# --- 5. Scrapy spider --------------------------------------------------------
class MySpider(CrawlSpider):
    """Crawl the 2010 index on state.gov and save every item page reached."""

    # Identifier used to refer to this spider (e.g. on the scrapy command line).
    name = 'statespider'

    # URL the spider starts on. Adjust the year here (and in the first rule).
    start_urls = ['http://www.state.gov/r/pa/prs/dpb/2010/index.htm']

    rules = (
        # Follow only links inside the navigation panel that contain /2010/.
        Rule(SgmlLinkExtractor(allow=('/2010/',),
                               restrict_xpaths=('//*[@id="local-nav"]',))),
        # Follow links inside the calendar on the index page, skipping any
        # link that contains /video/, and hand each page to parse_item().
        Rule(SgmlLinkExtractor(restrict_xpaths=('//*[@id="dpb-calendar"]',),
                               deny=('/video/',)),
             callback='parse_item'),
    )

    def parse_item(self, response):
        """Write the text of one item page to a UTF-8 file in the working directory.

        The output filename is derived from the last path segment of
        ``response.url``.
        """
        # Log the URL in the terminal to help with debugging.
        self.log('Hi, this is an item page! %s' % response.url)

        # NOTE(review): the original left page extraction as a placeholder
        # ("Insert code to scrape page content") and never defined `texts` or
        # `filename`, so this method raised NameError. The generic extraction
        # below keeps every text node of the page; narrow it to the element
        # that actually holds the content once that is known.
        soup = BeautifulSoup(response.body)
        texts = u''.join(soup.findAll(text=True))
        filename = _output_filename(response.url)

        with codecs.open(filename, 'w', encoding='utf-8') as output:
            output.write(texts)