Task 1: Loading Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn')  # on matplotlib >= 3.6 this style name was changed to 'seaborn-v0_8'
import time
In [2]:
data=pd.read_csv("data2.csv")
data.head()
Out[2]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns

In [3]:
col = data.columns
print(col)
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

Separate Target from Features

In [5]:
y = data.diagnosis
x = data.drop(["id", "diagnosis"], axis=1)
In [6]:
x.head()
Out[6]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 30 columns

Plot Diagnosis Distributions

In [7]:
ax = sns.countplot(x=y, label="Count")  # newer seaborn versions require the keyword form x=y
In [8]:
B, M = y.value_counts()  # value_counts() sorts descending, so benign (357) comes first
print("No. of Benign Tumors", B)
print("No. of Malignant Tumors", M)
No. of Benign Tumors 357
No. of Malignant Tumors 212
In [9]:
x.describe()
Out[9]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 30 columns

Data Visualization


Visualizing Standardized Data with Seaborn

In [10]:
print(data)
           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]
In [14]:
data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())   # z-score standardization of every feature
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)   # diagnosis plus the first 10 features
data = pd.melt(data, id_vars="diagnosis",
                     var_name="features",
                     value_name='value')
plt.figure(figsize=(5,5))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data, split=True, inner="points")
plt.xticks(rotation=45);
# inner : {"box", "quartile", "point", "stick", None}, optional

Violin Plots and Box Plots

In [15]:
data = pd.concat([y,data_n_2.iloc[:,10:20]],axis=1)
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')
plt.figure(figsize=(10,10))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data,split=False, inner="quartile")
plt.xticks(rotation=45);
In [16]:
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1)
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')
plt.figure(figsize=(10,10))
sns.boxplot(x="features", y="value", hue="diagnosis", data=data, saturation=1, fliersize=5, linewidth=3)
plt.xticks(rotation=45);
In [24]:
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1)
data
Out[24]:
diagnosis radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 M 1.885031 -1.358098 2.301575 1.999478 1.306537 2.614365 2.107672 2.294058 2.748204 1.935312
1 M 1.804340 -0.368879 1.533776 1.888827 -0.375282 -0.430066 -0.146620 1.086129 -0.243675 0.280943
2 M 1.510541 -0.023953 1.346291 1.455004 0.526944 1.081980 0.854222 1.953282 1.151242 0.201214
3 M -0.281217 0.133866 -0.249720 -0.549538 3.391291 3.889975 1.987839 2.173873 6.040726 4.930672
4 M 1.297434 -1.465481 1.337363 1.219651 0.220362 -0.313119 0.612640 0.728618 -0.867590 -0.396751
... ... ... ... ... ... ... ... ... ... ... ...
564 M 1.899514 0.117596 1.751022 2.013529 0.378033 -0.273077 0.663928 1.627719 -1.358963 -0.708467
565 M 1.535369 2.045599 1.420690 1.493644 -0.690623 -0.394473 0.236365 0.733182 -0.531387 -0.973122
566 M 0.560868 1.373645 0.578492 0.427529 -0.808876 0.350427 0.326479 0.413705 -1.103578 -0.318129
567 M 1.959515 2.235958 2.301575 1.651717 1.429169 3.901415 3.194794 2.287972 1.917396 2.217684
568 B -1.409652 0.763518 -1.431475 -1.074867 -1.857384 -1.206491 -1.304683 -1.743529 -0.048096 -0.750546

569 rows × 11 columns

In [25]:
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')
data
Out[25]:
diagnosis features value
0 M radius_worst 1.885031
1 M radius_worst 1.804340
2 M radius_worst 1.510541
3 M radius_worst -0.281217
4 M radius_worst 1.297434
... ... ... ...
5685 M fractal_dimension_worst -0.708467
5686 M fractal_dimension_worst -0.973122
5687 M fractal_dimension_worst -0.318129
5688 M fractal_dimension_worst 2.217684
5689 B fractal_dimension_worst -0.750546

5690 rows × 3 columns

In [27]:
data['features'].unique()
Out[27]:
array(['radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst',
       'fractal_dimension_worst'], dtype=object)
In [29]:
data['features'].value_counts()
Out[29]:
fractal_dimension_worst    569
area_worst                 569
texture_worst              569
smoothness_worst           569
compactness_worst          569
concave points_worst       569
symmetry_worst             569
perimeter_worst            569
concavity_worst            569
radius_worst               569
Name: features, dtype: int64

Using Joint Plots for Feature Comparison

In [ ]:
sns.jointplot(x=x.loc[:, 'concavity_worst'],
              y=x.loc[:, 'concave points_worst'],
              kind="hex",
              color="g");
# kind : { "scatter" | "reg" | "resid" | "kde" | "hex" }

Observing the Distribution of Values and their Variance with Swarm Plots

In [ ]:
#sns.set(style="whitegrid", palette="muted")
data_dia = y
data = x
data_n_2 = (data - data.mean()) / (data.std())  
data = pd.concat([y,data_n_2.iloc[:,0:10]],axis=1)
data = pd.melt(data,id_vars="diagnosis",
                    var_name="features",
                    value_name='value')
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data,size=8)
plt.xticks(rotation=45);

Observing all Pair-wise Correlations

In [ ]:
#correlation map
f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(x.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax);
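
As a complement to the heatmap, the sketch below lists the most strongly correlated feature pairs explicitly. This cell is not part of the original notebook, and the 0.9 cutoff is an arbitrary choice for illustration.

In [ ]:
# Keep only the upper triangle of the absolute correlation matrix, then list
# every feature pair whose correlation exceeds the (assumed) 0.9 threshold.
corr = x.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack().sort_values(ascending=False)
print(pairs[pairs > 0.9])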

Task 2: Dropping Correlated Columns from Feature Matrix


Note: If you are starting the notebook from this task, you can run cells from all the previous tasks in the kernel by going to the top menu and selecting Kernel > Restart and Run All.


In [ ]:
# Candidate columns to drop, as suggested by the correlation analysis above
drop_list = ['perimeter_mean', 'radius_mean', 'compactness_mean',
             'concave points_mean', 'radius_se', 'perimeter_se',
             'radius_worst', 'perimeter_worst', 'compactness_worst',
             'concave points_worst', 'compactness_se', 'concave points_se',
             'texture_worst', 'area_worst']
In [ ]:
 
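
The actual column-dropping is left to the learner; one minimal way to complete it, assuming the drop_list defined above (x_reduced is a name introduced here, not one used elsewhere in the notebook), is:

In [ ]:
# Drop the correlated columns from the feature matrix; errors="ignore" keeps this
# cell safe to re-run even if a column has already been removed.
x_reduced = x.drop(columns=drop_list, errors="ignore")
print(x_reduced.shape)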

Task 3: Classification using XGBoost (minimal feature selection)

In [ ]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score
In [ ]:
sns.set(style="white")
df = x.loc[:,['radius_worst','perimeter_worst','area_worst']]
g = sns.PairGrid(df, diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3);
In [ ]:
 
In [ ]:
 
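
The imports and the pair grid above set the stage; the classification itself is left open. The following is only a minimal sketch of one possible approach, not the notebook's reference solution. It assumes the x_reduced matrix from the Task 2 sketch, and the variable names (y_enc, x_train, clf, ...) are introduced here.

In [ ]:
# Encode the target as 0/1 (XGBoost expects numeric labels), hold out a test set,
# fit an XGBoost classifier with mostly default settings, and report the metrics.
y_enc = (y == "M").astype(int)          # 1 = malignant, 0 = benign
x_train, x_test, y_train, y_test = train_test_split(
    x_reduced, y_enc, test_size=0.3, random_state=42, stratify=y_enc)

clf = xgb.XGBClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))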

Task 4: Univariate Feature Selection and XGBoost

In [ ]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
In [ ]:
 
In [ ]:
 
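
Again only the imports are given; a minimal sketch of univariate selection with chi-squared scoring, reusing the train/test split from the Task 3 sketch, could look like the cell below. k=5 is an arbitrary choice, and chi2 requires non-negative inputs, which holds for these raw measurements.

In [ ]:
# Keep the k best features by chi-squared score, then refit XGBoost on the reduced set.
select = SelectKBest(chi2, k=5)
x_train_k = select.fit_transform(x_train, y_train)
x_test_k = select.transform(x_test)
print("Selected features:", x_train.columns[select.get_support()].tolist())

clf_k = xgb.XGBClassifier(n_estimators=100, random_state=42)
clf_k.fit(x_train_k, y_train)
print("Accuracy:", accuracy_score(y_test, clf_k.predict(x_test_k)))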

Task 5: Recursive Feature Elimination with Cross-Validation

In [ ]:
 
In [ ]:
 
In [ ]:
 
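
The task cells are left empty; one way to sketch it, again reusing the split from the Task 3 sketch, is recursive feature elimination with cross-validation wrapped around an XGBoost estimator (RFECV relies on the estimator's feature_importances_):

In [ ]:
from sklearn.feature_selection import RFECV

# 5-fold cross-validated recursive feature elimination, removing one feature per step.
rfecv = RFECV(estimator=xgb.XGBClassifier(n_estimators=100, random_state=42),
              step=1, cv=5, scoring="accuracy")
rfecv.fit(x_train, y_train)
print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", x_train.columns[rfecv.support_].tolist())
print("Test accuracy:", accuracy_score(y_test, rfecv.predict(x_test)))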

Task 6: Feature Extraction using Principal Component Analysis

In [ ]:
 
In [ ]:
 
In [ ]:
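
Finally, a minimal PCA sketch (not the notebook's reference solution): standardize the full feature matrix, project it onto principal components, and plot the cumulative explained variance to judge how many components are worth keeping.

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# PCA is scale-sensitive, so standardize before fitting.
x_scaled = StandardScaler().fit_transform(x)
pca = PCA()
pca.fit(x_scaled)

plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker="o")
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative explained variance")
plt.show()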