#!/usr/bin/env python
# coding: utf-8

# # Simple Tools for Extracting Quantities from Strings
#
# Suppose we have a report and we want to find the sentences that are talking about numerical things....
#
# *Originally inspired by [When you get data in sentences: how to use a spreadsheet to extract numbers from phrases](https://onlinejournalismblog.com/2019/07/29/when-you-get-data-in-sentences-how-to-use-a-spreadsheet-to-extract-numbers-from-phrases/), Paul Bradshaw, Online Journalism blog, from which some of the example sentences (sic!) are taken.*

# In[152]:


sentences = [
    '4 years and 6 months’ imprisonment with a licence extension of 2 years and 6 months',
    'No quantities here',
    'I measured it as 2 meters and 30 centimeters.',
    "four years and six months' imprisonment with a licence extension of 2 years and 6 months",
    'it cost £250... bargain...',
    'it weighs four hundred kilograms.',
    'It weighs 400kg.',
    'three million, two hundred & forty, you say?',
    'it weighs four hundred and twenty kilograms.'
]


# ## `quantulum3`
#
# [`quantulum3`](https://github.com/nielstron/quantulum3) is a Python package *"for information extraction of quantities from unstructured text"*.

# In[153]:


#!pip3 install quantulum3
from quantulum3 import parser


# In[154]:


# NOTE(review): `display` is not imported anywhere in this file; presumably it
# is the IPython/Jupyter builtin, so these cells expect to run in a notebook
# kernel -- confirm before running as a plain script.
# NOTE(review): the original indentation of this cell was lost; the separator
# is reconstructed as printed once per sentence -- confirm against the notebook.
for sent in sentences:
    print(sent)
    p = parser.parse(sent)
    if p:
        print('\tSpoken:', parser.inline_parse_and_expand(sent))
        print('\tNumeric elements:')
        for q in p:
            display(q)
            print('\t\t{} :: {}'.format(q.surface, q))
    print('\n---------\n')


# ## Finding quantity statements in large texts
#
# If we have a large block of text, we might want to quickly skim it for quantity-containing sentences; we can do something like the following...

# In[155]:


import spacy
nlp = spacy.load('en_core_web_lg', disable=['ner'])


# In[171]:


# Whitespace inside this literal is kept exactly as found in the source.
text = ''' Once upon a time, there was a thing. The thing weighed forty kilogrammes and cost £250. It was blue. It took forty five minutes to get it home. What a day that was. I didn't get back until 2.15pm. 
Then I had some cake for tea. '''


# In[172]:


doc = nlp(text)
for sent in doc.sents:
    print(sent)


# In[173]:


# NOTE(review): as above, the separator placement is reconstructed (original
# indentation was lost) -- confirm against the notebook.
for sent in doc.sents:
    # quantulum3 is given the plain string of the spaCy sentence span.
    sent = sent.text
    p = parser.parse(sent)
    if p:
        print('\tSpoken:', parser.inline_parse_and_expand(sent))
        print('\tNumeric elements:')
        for q in p:
            display(q)
            print('\t\t{} :: {}'.format(q.surface, q))
    print('\n---------\n')


# ## Annotating a dataset
#
# Can we extract numbers from sentences in a CSV file? Yes we can...

# In[174]:


url = 'https://raw.githubusercontent.com/BBC-Data-Unit/unduly-lenient-sentences/master/ULS+for+Sankey.csv'


# In[175]:


import pandas as pd

df = pd.read_csv(url)
df.head()


# In[178]:


# get a row
df.iloc[1]


# In[179]:


# and a, erm. sentence...
df.iloc[1]['Original sentence (refined)']


# In[180]:


parser.parse(df.iloc[1]['Original sentence (refined)'])


# In[206]:


def amountify(txt):
    """Return the quantities parsed from *txt* as a single ``'::'``-joined string.

    Every quantity returned by ``parser.parse`` is rendered as
    ``'<value> <unit name>'`` (from ``q.value`` and ``q.unit.name``) and the
    pieces are joined with ``'::'``.

    Args:
        txt: The text to scan, e.g. one cell of a DataFrame column.

    Returns:
        The joined string; ``''`` when *txt* is empty, missing or not a
        string; ``None`` when parsing or formatting raised an exception.
    """
    # A missing value may arrive as float NaN, which is truthy and would
    # otherwise only be "handled" by the parser blowing up on it. Treat any
    # non-string like the empty string: there is nothing to extract.
    if not isinstance(txt, str) or not txt:
        return ''
    try:
        return '::'.join(
            '{} {}'.format(q.value, q.unit.name) for q in parser.parse(txt)
        )
    except Exception:
        # Best effort: one unparseable cell must not abort a whole
        # column-wide ``apply``. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


# In[207]:


df['amounts'] = df['Original sentence (refined)'].apply(amountify)


# In[208]:


df.head()


# We could then do something to split multiple amounts into multiple rows or columns...

# In[ ]: