import pandas as pd #Read in the data file df=pd.read_csv('data/normalisation-prescription.csv') df df.fillna(method='ffill', inplace=True) df for c in df.columns: print(c,df[c].value_counts(),sep='\n',end='\n\n') df_1a=df[['patient_id', 'patient_name', 'doctor_id','doctor_name']] df_1a df_1a=df_1a.drop_duplicates() df_1a df_1b=df.drop(['patient_name', 'doctor_id','doctor_name'], 1) df_1b pd.merge(df_1a,df_1b,on='patient_id') #Split out the drug code and drug name df_2a=df_1b[['drug_code','drug_name']].drop_duplicates() df_2a #Remove the drug name from table df_2b=df_1b.drop(['drug_name'], 1) df_2b #Test the join pd.merge(df_2a,df_2b,on='drug_code') #Separate out the doctor and patient details df_3a=df_1a[['doctor_id','doctor_name']] df_3a df_3b=df_1a.drop(['doctor_name'],1) df_3b pd.merge(df_3a,df_3b,on='doctor_id') df_3c=df_3b[['patient_id','patient_name']] df_3c df_3d=df_3b.drop(['patient_name'],1) df_3d def tableNorming(df,newTableCols,keyCol): df1=df[newTableCols].drop_duplicates() df2=df.drop(set(newTableCols)-set([keyCol]),1) return df1,df2 a,b=tableNorming(df,['patient_id', 'patient_name', 'doctor_id','doctor_name'],'patient_id') a b #Read in the data file ex1=pd.read_csv('data/normalisation-books.csv') ex1 #Clean ex1.fillna(method='ffill', inplace=True) ex1 #Review the data for c in ex1.columns: print(c,ex1[c].value_counts(),sep='\n',end='\n\n') #Convert to 1NF ex1_1a,ex1_1b=tableNorming(ex1,['invoice_no', 'customer_no', 'customer_name','date'],'invoice_no') ex1_1a ex1_1b #Test pd.merge(ex1_1a,ex1_1b,on='invoice_no') #Convert to 2NF #In ex1_1b, the combination of invoice_no and isbn attributes together determine the quantity attribute. # Only isbn determines cost. cost is removed from the relation, and isbn and cost form a new relation, with isbn as key. ex1_2a,ex1_2b=tableNorming(ex1_1b,['isbn', 'title', 'cost'],'isbn') ex1_2a ex1_2b #Test pd.merge(ex1_2a,ex1_2b,on='isbn') #Convert to 3NF #In ex1_1a, the date and customer_no attributes are all directly dependent on invoice_no #customer_name is directly dependent on customer_no not invoice_no. #Therefore create a new relation from customer_no and customer_name where customer_no is the primary key. #The customer_no remains in the original relation as a foreign key, as its value is determined by invoice_no ex1_3a,ex1_3b=tableNorming(ex1_1a,['customer_no', 'customer_name'],'customer_no') ex1_3a ex1_3b #Test pd.merge(ex1_3a,ex1_3b,on='customer_no') ex3=pd.read_csv('data/normalisation-authors.csv') ex3 #Sort of via http://stackoverflow.com/a/12681217/454773 ex3_authors=pd.concat([pd.DataFrame({'isbn':row['isbn'], 'author':row['authors'].split(',') }) for _, row in ex3.iterrows()]) ex3_authors ex3_books=ex3.drop('authors',1) ex3_books