#!/usr/bin/env python
# coding: utf-8

# # Simple Tools for Extracting Quantities from Strings
# 
# Suppose we have a report and we want to find the sentences that are talking about numerical things....
# 
# *Originally inspired by [When you get data in sentences: how to use a spreadsheet to extract numbers from phrases](https://onlinejournalismblog.com/2019/07/29/when-you-get-data-in-sentences-how-to-use-a-spreadsheet-to-extract-numbers-from-phrases/), Paul Bradshaw, Online Journalism blog, from which some of the example sentences (sic!) are taken.*

# In[152]:


# Example inputs: digits, spelled-out numbers, currency, units glued to the
# number, and one sentence that contains no quantity at all.
sentences = [
    # Durations written with digits (note the typographic apostrophe).
    '4 years and 6 months’ imprisonment with a licence extension of 2 years and 6 months',
    # Control case: nothing to extract.
    'No quantities here',
    # Compound length measurement.
    'I measured it as 2 meters and 30 centimeters.',
    # Durations mixing number words and digits.
    "four years and six months' imprisonment with a licence extension of 2 years and 6 months",
    # Currency.
    'it cost £250... bargain...',
    # Weights: number words, then a unit attached directly to the digits.
    'it weighs four hundred kilograms.',
    'It weighs 400kg.',
    # Large number words with punctuation and an ampersand, no unit.
    'three million, two hundred & forty, you say?',
    'it weighs four hundred and twenty kilograms.',
]


# ## `quantulum3`
# 
# [`quantulum3`](https://github.com/nielstron/quantulum3) is a Python package *"for information extraction of quantities from unstructured text"*.

# In[153]:


#!pip3 install quantulum3
from quantulum3 import parser


# In[154]:


# Echo each example sentence, then list whatever quantities the parser finds
# in it; a divider is printed after every sentence, with or without a match.
for sentence in sentences:
    print(sentence)
    quantities = parser.parse(sentence)
    if quantities:
        # Same sentence with the quantities rewritten by the parser.
        expanded = parser.inline_parse_and_expand(sentence)
        print('\tSpoken:', expanded)
        print('\tNumeric elements:')
        for quantity in quantities:
            display(quantity)
            print('\t\t{} :: {}'.format(quantity.surface, quantity))
    print('\n---------\n')


# ## Finding quantity statements in large texts
# 
# If we have a large block of text and want to quickly skim it for sentences containing quantities, we can do something like the following...

# In[155]:


import spacy
# Load the large English pipeline with its 'ner' component switched off.
# NOTE(review): presumably NER is disabled because only sentence
# segmentation is needed below -- confirm.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['ner'],
)


# In[171]:


# Sample passage mixing sentences that mention quantities (weight, price,
# duration, a clock time) with sentences that mention none.
# The literal starts and ends with a newline and is passed to the pipeline
# exactly as written.
text = '''
Once upon a time, there was a thing. The thing weighed forty kilogrammes and cost £250. 
It was blue. It took forty five minutes to get it home. 
What a day that was. I didn't get back until 2.15pm. Then I had some cake for tea.
'''


# In[172]:


# Run the pipeline over the sample passage, then print each sentence it
# produced so the segmentation can be checked by eye.
doc = nlp(text)
for sentence_span in doc.sents:
    print(sentence_span)


# In[173]:


# For every sentence found in the document, report the quantities the parser
# extracts from its plain text; a divider follows each sentence regardless.
for sentence_span in doc.sents:
    sentence = sentence_span.text
    quantities = parser.parse(sentence)
    if quantities:
        # Same sentence with the quantities rewritten by the parser.
        expanded = parser.inline_parse_and_expand(sentence)
        print('\tSpoken:', expanded)
        print('\tNumeric elements:')
        for quantity in quantities:
            display(quantity)
            print('\t\t{} :: {}'.format(quantity.surface, quantity))
    print('\n---------\n')


# ## Annotating a dataset
# 
# Can we extract numbers from sentences in a CSV file? Yes we can...

# In[174]:


# Raw CSV hosted in the BBC-Data-Unit "unduly-lenient-sentences" GitHub repository.
url = 'https://raw.githubusercontent.com/BBC-Data-Unit/unduly-lenient-sentences/master/ULS+for+Sankey.csv'


# In[175]:


import pandas as pd

# Read the CSV straight from the URL into a DataFrame, then preview the
# first rows (shown as the cell output when run in a notebook).
df = pd.read_csv(url)
df.head()


# In[178]:


# Get a single row (position 1) to see what one record looks like.
df.iloc[1]


# In[179]:


# ...and a, erm, sentence: that row's 'Original sentence (refined)' value.
df.iloc[1]['Original sentence (refined)']


# In[180]:


# Try the quantity parser on that single value before applying it column-wide.
parser.parse(df.iloc[1]['Original sentence (refined)'])


# In[206]:


def amountify(txt):
    """Summarise the quantities found in *txt* as a single string.

    Each quantity returned by ``parser.parse`` is rendered as
    ``'<value> <unit name>'`` and the pieces are joined with ``'::'``.

    Args:
        txt: The text to scan. Any falsy value (``''``, ``None``, ``0``)
            is treated as "nothing to parse".

    Returns:
        The ``'::'``-joined summary; ``''`` when *txt* is falsy or no
        quantities are found; ``None`` when parsing or formatting raises
        an ordinary exception (best-effort, so one bad cell does not
        abort a whole ``Series.apply``).
    """
    try:
        if not txt:
            return ''
        return '::'.join(
            '{} {}'.format(q.value, q.unit.name) for q in parser.parse(txt)
        )
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate so a long-running apply can still be interrupted.
        return None


# In[207]:


# Run amountify over every value in the 'Original sentence (refined)' column
# and store the resulting summary strings in a new 'amounts' column.
df['amounts'] = df['Original sentence (refined)'].apply(amountify)


# In[208]:


# Preview the annotated frame, now including the 'amounts' column.
df.head()


# We could then do something to split multiple amounts into multiple rows or columns...

# In[ ]: