#!/usr/bin/env python
# coding: utf-8

# # OpenLearn XML Scraper
# 
# OU OpenLearn materials are published in XML form, which allows some degree of structured access to the document contents.
# 
# For example, we can construct a database of images used across OpenLearn unit materials, or a list of quotes, or a list of activities.
# 
# This notebook is a very first pass, just scraping images, and not adding as much metadata to the table (e.g. parent course) as it should. As and when I get time to tinker, I'll work on this... ;-)

# ## Import some stuff...

# In[1]:

#You will probably need to pip install requests_cache lxml scraperwiki
import requests
import requests_cache
requests_cache.install_cache('openlearn_cache')

from urllib.parse import urlsplit, urlunsplit

import unicodedata

from lxml import etree

import os
os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///openlearn.sqlite'
#os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///scraperwiki.sqlite'
import scraperwiki

# In[2]:

#Example page
xmlurl = 'http://www.open.edu/openlearn/people-politics-law/politics-policy-people/sociology/the-politics-devolution/altformat-ouxml'
c = requests.get(xmlurl)

# ## XML Parser
# 
# Routines for parsing the OU XML.

# In[49]:

#===
#via http://stackoverflow.com/questions/5757201/help-or-advice-me-get-started-with-lxml/5899005#5899005
def flatten(el):
    '''Flatten an XML element and its children into a normalised text string.'''
    if el is None:
        #Return an empty string rather than None so callers can safely do string ops on the result
        return ''
    result = [(el.text or "")]
    for sel in el:
        result.append(flatten(sel))
        result.append(sel.tail or "")
    return unicodedata.normalize("NFKD", "".join(result)) or ' '
#===

def droptable(table):
    '''Drop the named table from the scraper database, if it exists.'''
    print("Trying to drop table '{}'".format(table))
    try:
        scraperwiki.sqlite.execute('drop table if exists "{}"'.format(table))
    except Exception:
        pass
    print('...{} dropped'.format(table))

def _course_code(xml_content):
    '''Extract the OU course code from an OU XML document.'''
    root = etree.fromstring(xml_content)
    return flatten(root.find('.//CourseCode'))
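# A quick sanity check of those helpers on a minimal hand-rolled fragment. The element names follow the patterns used elsewhere in this notebook, but the fragment itself (and the `XY123` code) is a made-up placeholder rather than real OpenLearn content:

# In[ ]:

_demo_xml = b'''<Item>
  <CourseCode>XY123</CourseCode>
  <Paragraph>Some <i>marked up</i> text.</Paragraph>
</Item>'''

#flatten() returns the normalised text content of an element and its children
print(flatten(etree.fromstring(_demo_xml).find('.//Paragraph')))
#_course_code() pulls the course code out of a complete document
print(_course_code(_demo_xml))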
def _xml_figures(xml_content, coursecode='', pageurl='', dbsave=True):
    '''Extract figure data from an OU XML document, optionally saving each record to the database.'''
    figdicts = []
    try:
        root = etree.fromstring(xml_content)
    except Exception:
        return False
    figures = root.findall('.//Figure')
    #?? Note that acknowledgements to figures are provided at the end of the XML file, with only informal
    # free text/figure number identifiers available for associating a particular acknowledgement/copyright
    # assignment with a given image. It would be so much neater if this could be bundled up with the figure
    # itself, or if the figure and the acknowledgement could share the same unique identifier?
    for figure in figures:
        figdict = {'xpageurl': pageurl, 'caption': '', 'src': '', 'coursecode': coursecode,
                   'desc': '', 'owner': '', 'item': '', 'itemack': ''}
        img = figure.find('Image')
        #The image url as given does not resolve - we need to add in the provided hash info
        figdict['srcurl'] = img.get('src')
        figdict['x_folderhash'] = img.get('x_folderhash')
        figdict['x_contenthash'] = img.get('x_contenthash')
        if figdict['x_folderhash'] is not None and figdict['x_contenthash'] is not None:
            path = urlsplit(figdict['srcurl'])
            sp = path.path.split('/')
            #Insert the folder and content hashes into the path, just ahead of the filename
            path = path._replace(path='/'.join(sp[:-1] + [figdict['x_folderhash'], figdict['x_contenthash']] + sp[-1:]))
            figdict['imgurl'] = urlunsplit(path)
        else:
            figdict['imgurl'] = ''
        xsrc = img.get('x_imagesrc')
        figdict['caption'] = flatten(figure.find('Caption'))
        #In desc, need to find a way of stripping the element from the start of the description
        figdict['desc'] = flatten(figure.find('Description'))
        #
        ref = figure.find('SourceReference')
        if ref is not None:
            rights = ref.find('ItemRights')
            if rights is not None:
                #TO DO: owner and item currently read the same element - the correct child element names need checking against the schema
                figdict['owner'] = flatten(rights.find('ItemRights'))
                figdict['item'] = flatten(rights.find('ItemRights'))
                figdict['itemack'] = flatten(rights.find('ItemAcknowledgement'))
        #print('figures', xsrc, figdict['caption'], figdict['desc'], figdict['srcurl'])
        figdicts.append(figdict)
        if dbsave:
            scraperwiki.sqlite.save(unique_keys=[], table_name='xmlfigures', data=figdict)
    return figdicts

# In[ ]:

#Example of how the hashes rewrite an image URL - as given, then as resolved:
#http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/dd203_1_001i.jpg
#http://www.open.edu/openlearn/ocw/pluginfile.php/100160/mod_oucontent/oucontent/859/0c10275d/2c1a8d77/dd203_1_001i.jpg

# In[44]:

#Here's an example of the figure data as parsed
_xml_figures(c.content, dbsave=False)

# ## Grab Unit Locations
# 
# OpenLearn publish an OPML feed of units. It used to be hierarchical, grouping units into topics; now it seems to be flat, with links to units as well as topic feeds. At some point, I'll grab the topic feeds and use them to generate lookup tables from topics to units.

# In[ ]:

def getUnitLocations():
    #The OPML file lists all OpenLearn units by topic area
    srcUrl = 'http://openlearn.open.ac.uk/rss/file.php/stdfeed/1/full_opml.xml'
    tree = etree.parse(srcUrl)
    root = tree.getroot()
    items = root.findall('.//body/outline')
    #Handle each topic area separately?
    #The OPML is linear and mixes links to content with links to topic feeds
    #Need to harvest by topic?
    for item in items:
        it = item.get('text')
        if it.startswith('Unit content for'):
            it = it.replace('Unit content for', '')
            url = item.get('htmlUrl')
            rssurl = item.get('xmlUrl')
            #Rewrite the HTML page URL as the corresponding OU XML URL
            xmlurl = url.replace('content-section-0', 'altformat-ouxml')
            c = requests.get(xmlurl)
            _xml_figures(c.content)

#droptable('xmlfigures')
getUnitLocations()

# ## TO DO
# 
# There are other things we can scrape data about as well as images (see the sketch after this list):
# 
# - quotes (`<Quote>...</Quote>`)
# - activities (`<Activity>`)
# - box (`<Box> ...</Box>`)
# - OU coursecode and title (`<CourseCode>` and `<CourseTitle>`)
# - identifying references is unstructured in some units, structured in others (`<Reference>`)
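# As a sketch of how one of those scrapers might go, here's a minimal quote harvester along the same lines as `_xml_figures()`. This is an unchecked first guess: it assumes quotes are marked up as `Quote` elements and just grabs their flattened text, and the `xmlquotes` table name is made up for the purpose.

# In[ ]:

def _xml_quotes(xml_content, coursecode='', pageurl='', dbsave=True):
    '''Sketch: harvest quotes from an OU XML document, by analogy with _xml_figures().
       Assumes quotes are marked up as Quote elements - check against the actual schema.'''
    quotedicts = []
    try:
        root = etree.fromstring(xml_content)
    except Exception:
        return False
    for quote in root.findall('.//Quote'):
        quotedict = {'xpageurl': pageurl, 'coursecode': coursecode,
                     'quote': flatten(quote)}
        quotedicts.append(quotedict)
        if dbsave:
            scraperwiki.sqlite.save(unique_keys=[], table_name='xmlquotes', data=quotedict)
    return quotedicts

#_xml_quotes(c.content, dbsave=False)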
# ## Example Query

# In[53]:

import pandas as pd
import sqlite3

conn = sqlite3.connect('openlearn.sqlite')
pd.read_sql('SELECT * FROM xmlfigures LIMIT 3', conn)

# ## Database API
# 
# We can use Simon Willison's rather wonderful [datasette](https://github.com/simonw/datasette) package to create a server that provides a query interface to the sqlite database. For example, here's a temporary one for demo purposes (this URL is subject to change and my Heroku limits may also max out)...
# 
# https://sheltered-journey-73156.herokuapp.com/openlearn-29f1575/xmlfigures
# 
# It would be nice if it were as easy to push a datasette to Reclaim Hosting as it is to push one to Heroku or Zeit Now...

# ## Tidy Up

# In[ ]:

os.environ['SCRAPERWIKI_DATABASE_NAME'] = 'sqlite:///scraperwiki.sqlite'
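# As a footnote to the Database API section above: standing up a demo like that is just a couple of shell commands (assuming the `datasette` package is installed and, for publishing, that the Heroku CLI is configured; the app name is assigned by Heroku).

# In[ ]:

#Serve a local query UI / JSON API over the scraped database
#(better run from a terminal - it blocks the notebook if uncommented here):
#get_ipython().system('datasette serve openlearn.sqlite')

#Or push the database to Heroku as a throwaway demo app:
#get_ipython().system('datasette publish heroku openlearn.sqlite')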