import this #prints 'The Zen of Python'; a quick check that the interpreter is working

import requests
import lxml.html as lh

url = 'http://www.nytimes.com/reuters/2013/01/25/world/americas/25reuters-venezuela-prison.html?partner=rss&emc=rss'
page = requests.get(url) #download the page
doc = lh.fromstring(page.content) #parse the raw HTML into an element tree
text = doc.xpath('//p[@itemprop="articleBody"]') #select the paragraphs that make up the article body
finalText = str()
for par in text:
    finalText += par.text_content() #text_content() strips the tags and keeps the text

print finalText
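
#requests can flag a failed download before parsing: raise_for_status() throws
#an HTTPError on 4xx/5xx responses, which is worth checking before handing
#page.content to lxml. A minimal sketch:
page = requests.get(url)
page.raise_for_status() #raises requests.exceptions.HTTPError on a bad status code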

import pattern.web

url = 'http://rss.nytimes.com/services/xml/rss/nyt/World.xml'
results = pattern.web.Newsfeed().search(url, count=5) #fetch the five newest items from the feed
results #in a notebook this line echoes the list of Result objects

#each Result exposes url, title, and description attributes
print '%s \n\n %s \n\n %s \n\n' % (results[0].url, results[0].title, results[0].description)

#plaintext() strips the HTML tags out of the description
print '%s \n\n %s \n\n %s \n\n' % (results[0].url, results[0].title, pattern.web.plaintext(results[0].description))
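
#the same attributes exist on every Result, so the whole feed can be summarized
#in one loop (a small sketch built only from the pieces shown above)
for result in results:
    print '%s\n%s\n' % (result.title, pattern.web.plaintext(result.description))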

import codecs
import os

#codecs.open does not expand '~' automatically, so expand the home directory explicitly
outputFile = codecs.open(os.path.expanduser('~/tutorialOutput.txt'), encoding='utf-8', mode='a')

def scrape(url):
    #the same routine as above, wrapped in a function: fetch, parse, return the article body
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    text = doc.xpath('//p[@itemprop="articleBody"]')
    finalText = str()
    for par in text:
        finalText += par.text_content()
    return finalText

#scrape each article from the feed and append its text to the output file
for result in results:
    outputText = scrape(result.url)
    outputFile.write(outputText)

outputFile.close()
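
#a hedged variant of the loop above: pausing between requests is polite to the
#server, and a try/except keeps one bad URL from aborting the whole run
import time

outputFile = codecs.open(os.path.expanduser('~/tutorialOutput.txt'), encoding='utf-8', mode='a')
for result in results:
    try:
        outputFile.write(scrape(result.url))
    except Exception, e:
        print 'Failed on %s: %s' % (result.url, e)
    time.sleep(1) #wait one second between requests
outputFile.close()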

url = 'http://164.100.47.132/LssNew/psearch/Result13.aspx?dbsl='

#many database-backed sites expose records through a numeric URL parameter;
#incrementing dbsl walks through the result pages one by one
for i in xrange(5175,5973):
    newUrl = url + str(i)
    print 'Scraping: %s' % newUrl
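
#putting the pieces together: the generated URLs could be fetched and archived
#with the same scrape-and-write pattern used above. Note that scrape()'s XPath
#targets NYT markup (itemprop="articleBody"), so it would need a site-specific
#selector here; the output filename is likewise just an assumption.
outputFile = codecs.open(os.path.expanduser('~/parliamentOutput.txt'), encoding='utf-8', mode='a')
for i in xrange(5175,5973):
    outputFile.write(scrape(url + str(i))) #swap in an XPath that matches this site's HTML
outputFile.close()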

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from BeautifulSoup import BeautifulSoup
import re
import codecs

class MySpider(CrawlSpider):
    name = 'statespider' #the unique name Scrapy uses to identify and launch this spider
    start_urls = ['http://www.state.gov/r/pa/prs/dpb/2010/index.htm',
    ] #defines the URL the spider should start on; adjust the year

    #defines the rules for the spider
    rules = (Rule(SgmlLinkExtractor(allow=('/2010/'), restrict_xpaths=('//*[@id="local-nav"]'),)), #follows only links within the navigation panel that have /2010/ in them

    Rule(SgmlLinkExtractor(restrict_xpaths=('//*[@id="dpb-calendar"]',), deny=('/video/')), callback='parse_item'), #follows links within the calendar on the index page for the individual years, while denying any links with /video/ in them

    )

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url) #logs response.url to the terminal to help with debugging

        #Insert code to scrape page content. A hedged sketch follows: the XPath
        #and the filename derivation are illustrative assumptions, not
        #state.gov's actual markup, so adapt them to the real pages.
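        hxs = HtmlXPathSelector(response)
        #hypothetical selector: grab all paragraph text on the page; the real
        #briefing pages would need a narrower, site-specific XPath
        paragraphs = hxs.select('//p//text()').extract()
        texts = '\n'.join(paragraphs)
        #derive an output name from the last URL segment, e.g. '135551.htm' -> '135551.txt'
        filename = response.url.split('/')[-1].replace('.htm', '.txt')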

        #opens the file defined above and writes 'texts' using utf-8
        with codecs.open(filename, 'w', encoding='utf-8') as output:
            output.write(texts)
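
#to run the spider, place this file in a Scrapy project's spiders/ directory and
#invoke `scrapy crawl statespider`, or run the file standalone with
#`scrapy runspider <filename>.py`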