import pandas as pd

#Read in the data file
df=pd.read_csv('data/normalisation-prescription.csv')
df

df.fillna(method='ffill', inplace=True)
df

for c in df.columns:
    print(c,df[c].value_counts(),sep='\n',end='\n\n')

df_1a=df[['patient_id', 'patient_name', 'doctor_id','doctor_name']]
df_1a

df_1a=df_1a.drop_duplicates()
df_1a

df_1b=df.drop(['patient_name', 'doctor_id','doctor_name'], 1)
df_1b

pd.merge(df_1a,df_1b,on='patient_id')

#Split out the drug code and drug name
df_2a=df_1b[['drug_code','drug_name']].drop_duplicates()
df_2a

#Remove the drug name from table
df_2b=df_1b.drop(['drug_name'], 1)
df_2b

#Test the join
pd.merge(df_2a,df_2b,on='drug_code')

#Separate out the doctor and patient details
df_3a=df_1a[['doctor_id','doctor_name']]
df_3a

df_3b=df_1a.drop(['doctor_name'],1)
df_3b

pd.merge(df_3a,df_3b,on='doctor_id')

df_3c=df_3b[['patient_id','patient_name']]
df_3c

df_3d=df_3b.drop(['patient_name'],1)
df_3d

def tableNorming(df,newTableCols,keyCol):
    df1=df[newTableCols].drop_duplicates()
    df2=df.drop(set(newTableCols)-set([keyCol]),1)
    return df1,df2

a,b=tableNorming(df,['patient_id', 'patient_name', 'doctor_id','doctor_name'],'patient_id')

a

b

#Read in the data file
ex1=pd.read_csv('data/normalisation-books.csv')
ex1

#Clean
ex1.fillna(method='ffill', inplace=True)
ex1

#Review the data
for c in ex1.columns:
    print(c,ex1[c].value_counts(),sep='\n',end='\n\n')

#Convert to 1NF
ex1_1a,ex1_1b=tableNorming(ex1,['invoice_no', 'customer_no', 'customer_name','date'],'invoice_no')


ex1_1a

ex1_1b

#Test
pd.merge(ex1_1a,ex1_1b,on='invoice_no')

#Convert to 2NF
#In ex1_1b, the combination of invoice_no and isbn attributes together determine the quantity attribute.
# Only isbn determines cost. cost is removed from the relation, and isbn and cost form a new relation, with isbn as key.

ex1_2a,ex1_2b=tableNorming(ex1_1b,['isbn', 'title', 'cost'],'isbn')

ex1_2a

ex1_2b

#Test
pd.merge(ex1_2a,ex1_2b,on='isbn')

#Convert to 3NF
#In ex1_1a, the date and customer_no attributes are all directly dependent on invoice_no 
#customer_name is directly dependent on customer_no not invoice_no.
#Therefore create a new relation from customer_no and customer_name where customer_no is the primary key.
#The customer_no remains in the original relation as a foreign key, as its value is determined by invoice_no

ex1_3a,ex1_3b=tableNorming(ex1_1a,['customer_no', 'customer_name'],'customer_no')

ex1_3a

ex1_3b

#Test
pd.merge(ex1_3a,ex1_3b,on='customer_no')

ex3=pd.read_csv('data/normalisation-authors.csv')
ex3

#Sort of via http://stackoverflow.com/a/12681217/454773
ex3_authors=pd.concat([pd.DataFrame({'isbn':row['isbn'], 'author':row['authors'].split(',') })             
                    for _, row in ex3.iterrows()])
ex3_authors

ex3_books=ex3.drop('authors',1)
ex3_books