print(__doc__)
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn import metrics
import pandas as pd
import seaborn as sns
import numpy as np
Automatically created module for IPython interactive environment
# Load the raw welcome-poll responses and preview the first rows.
poll_csv = './Data/MUSA-650WelcomePoll.csv'
dfInit = pd.read_csv(poll_csv)
dfInit.head()
Timestamp | Q1_General background in data analysis? | Q2_Hands-on experience in data analysis using Python? | Q3_Experience in programming in general? | Q4_General background in machine learning? | Q5_Hands-on experience in running machine learning applications? | Q6_Which one would you prefer on a Sunday afternoon? | Q7_Hands-on experience in image analysis using satellite images? | Q8_Level of interest in mathematics? | Q9_Level of interest in reading? | Q10_Level of stress about this class? | Q11_Your overall motivation about this class? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020/01/14 5:11:10 PM EST | 8 | 5 | 4 | 6 | 7 | Running | 5 | 3 | 5 | 7 | 3 |
1 | 2020/01/14 5:15:45 PM EST | 8 | 8 | 5 | 5 | 6 | Reading | 7 | 7 | 6 | 7 | 8 |
2 | 2020/01/14 10:10:14 PM EST | 6 | 6 | 6 | 6 | 5 | Watching a movie | 7 | 7 | 7 | 7 | 7 |
3 | 2020/01/15 10:02:48 AM EST | 5 | 3 | 6 | 4 | 4 | Watching a movie | 3 | 8 | 8 | 5 | 10 |
4 | 2020/01/15 10:03:20 AM EST | 6 | 6 | 5 | 4 | 3 | Reading | 4 | 5 | 4 | 10 | 8 |
# Parse the submission time (the literal "EST" suffix is matched by the format
# string) and express it as seconds elapsed since the earliest response.
dfInit['Timestamp'] = pd.to_datetime(
    dfInit['Timestamp'], format='%Y/%m/%d %I:%M:%S %p EST'
)
first_response = dfInit['Timestamp'].min()
dfInit['tsRel'] = (dfInit['Timestamp'] - first_response).dt.total_seconds()
dfInit[['Timestamp', 'tsRel']].head(10)
Timestamp | tsRel | |
---|---|---|
0 | 2020-01-14 17:11:10 | 0.0 |
1 | 2020-01-14 17:15:45 | 275.0 |
2 | 2020-01-14 22:10:14 | 17944.0 |
3 | 2020-01-15 10:02:48 | 60698.0 |
4 | 2020-01-15 10:03:20 | 60730.0 |
5 | 2020-01-15 10:03:43 | 60753.0 |
6 | 2020-01-15 10:03:50 | 60760.0 |
7 | 2020-01-15 10:03:53 | 60763.0 |
8 | 2020-01-15 10:03:59 | 60769.0 |
9 | 2020-01-15 10:04:03 | 60773.0 |
# Keep every column except the first one (Timestamp) and remember the original
# long column names for later reference.
kept_cols = dfInit.columns[1:]
df = dfInit[kept_cols]
initCol = list(df.columns)
initCol
['Q1_General background in data analysis?', 'Q2_Hands-on experience in data analysis using Python?', 'Q3_Experience in programming in general?', 'Q4_General background in machine learning?', 'Q5_Hands-on experience in running machine learning applications?', 'Q6_Which one would you prefer on a Sunday afternoon?', 'Q7_Hands-on experience in image analysis using satellite images?', 'Q8_Level of interest in mathematics?', 'Q9_Level of interest in reading?', 'Q10_Level of stress about this class?', 'Q11_Your overall motivation about this class?', 'tsRel']
# Shorten each column name to the part before the first underscore
# (e.g. "Q1_General background ..." -> "Q1"; "tsRel" is unchanged).
# `n` is passed by keyword: newer pandas no longer accepts it positionally.
df.columns = df.columns.str.split('_', n=1).str[0].tolist()
df.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 5 | 4 | 6 | 7 | Running | 5 | 3 | 5 | 7 | 3 | 0.0 |
1 | 8 | 8 | 5 | 5 | 6 | Reading | 7 | 7 | 6 | 7 | 8 | 275.0 |
2 | 6 | 6 | 6 | 6 | 5 | Watching a movie | 7 | 7 | 7 | 7 | 7 | 17944.0 |
3 | 5 | 3 | 6 | 4 | 4 | Watching a movie | 3 | 8 | 8 | 5 | 10 | 60698.0 |
4 | 6 | 6 | 5 | 4 | 3 | Reading | 4 | 5 | 4 | 10 | 8 | 60730.0 |
#sns.pairplot(df, kind = 'reg')
# Pairwise correlations between the numeric answers. Q6 holds text, so the
# numeric columns are selected explicitly instead of relying on corr() to
# drop non-numeric columns silently (newer pandas raises on them).
df.select_dtypes(include='number').corr()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | |
---|---|---|---|---|---|---|---|---|---|---|---|
Q1 | 1.000000 | 0.766725 | 0.713877 | 0.624063 | 0.738857 | 0.676150 | 0.405270 | -0.244388 | -0.441883 | 0.042206 | 0.313668 |
Q2 | 0.766725 | 1.000000 | 0.639008 | 0.481836 | 0.568244 | 0.692275 | 0.399874 | -0.321784 | -0.329991 | 0.099348 | 0.383485 |
Q3 | 0.713877 | 0.639008 | 1.000000 | 0.564825 | 0.544057 | 0.616467 | 0.596732 | -0.083087 | -0.631657 | 0.400580 | 0.457261 |
Q4 | 0.624063 | 0.481836 | 0.564825 | 1.000000 | 0.945541 | 0.450752 | 0.426714 | -0.461877 | -0.168868 | 0.036739 | 0.440323 |
Q5 | 0.738857 | 0.568244 | 0.544057 | 0.945541 | 1.000000 | 0.444940 | 0.467669 | -0.463201 | -0.263556 | -0.023930 | 0.407507 |
Q7 | 0.676150 | 0.692275 | 0.616467 | 0.450752 | 0.444940 | 1.000000 | 0.184545 | -0.198770 | -0.284418 | 0.045276 | 0.152171 |
Q8 | 0.405270 | 0.399874 | 0.596732 | 0.426714 | 0.467669 | 0.184545 | 1.000000 | -0.126656 | -0.205698 | 0.501068 | 0.036620 |
Q9 | -0.244388 | -0.321784 | -0.083087 | -0.461877 | -0.463201 | -0.198770 | -0.126656 | 1.000000 | -0.217410 | 0.228420 | -0.193531 |
Q10 | -0.441883 | -0.329991 | -0.631657 | -0.168868 | -0.263556 | -0.284418 | -0.205698 | -0.217410 | 1.000000 | -0.133846 | -0.311850 |
Q11 | 0.042206 | 0.099348 | 0.400580 | 0.036739 | -0.023930 | 0.045276 | 0.501068 | 0.228420 | -0.133846 | 1.000000 | 0.430875 |
tsRel | 0.313668 | 0.383485 | 0.457261 | 0.440323 | 0.407507 | 0.152171 | 0.036620 | -0.193531 | -0.311850 | 0.430875 | 1.000000 |
# Heatmap of the correlation matrix of the numeric columns. Q6 (text) is
# excluded explicitly so corr() does not depend on implicit column dropping.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
<AxesSubplot:>
# Show the original long column names as a legend for the short Q-codes.
initCol
['Q1_General background in data analysis?', 'Q2_Hands-on experience in data analysis using Python?', 'Q3_Experience in programming in general?', 'Q4_General background in machine learning?', 'Q5_Hands-on experience in running machine learning applications?', 'Q6_Which one would you prefer on a Sunday afternoon?', 'Q7_Hands-on experience in image analysis using satellite images?', 'Q8_Level of interest in mathematics?', 'Q9_Level of interest in reading?', 'Q10_Level of stress about this class?', 'Q11_Your overall motivation about this class?', 'tsRel']
# Plot Q11 against the categorical answer Q6.
sns.catplot(data=df, x="Q6", y="Q11")

# One-hot encode Q6 into indicator columns (one per distinct answer).
df2 = pd.get_dummies(data=df, columns=['Q6'])
df2
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 5 | 4 | 6 | 7 | 5 | 3 | 5 | 7 | 3 | 0.0 | 0 | 1 | 0 |
1 | 8 | 8 | 5 | 5 | 6 | 7 | 7 | 6 | 7 | 8 | 275.0 | 1 | 0 | 0 |
2 | 6 | 6 | 6 | 6 | 5 | 7 | 7 | 7 | 7 | 7 | 17944.0 | 0 | 0 | 1 |
3 | 5 | 3 | 6 | 4 | 4 | 3 | 8 | 8 | 5 | 10 | 60698.0 | 0 | 0 | 1 |
4 | 6 | 6 | 5 | 4 | 3 | 4 | 5 | 4 | 10 | 8 | 60730.0 | 1 | 0 | 0 |
5 | 8 | 7 | 8 | 3 | 3 | 8 | 4 | 10 | 2 | 8 | 60753.0 | 0 | 1 | 0 |
6 | 4 | 3 | 1 | 1 | 1 | 1 | 1 | 10 | 8 | 8 | 60760.0 | 1 | 0 | 0 |
7 | 7 | 3 | 7 | 6 | 5 | 4 | 6 | 8 | 6 | 9 | 60763.0 | 1 | 0 | 0 |
8 | 5 | 5 | 5 | 4 | 4 | 4 | 4 | 5 | 5 | 7 | 60769.0 | 0 | 0 | 1 |
9 | 6 | 6 | 6 | 6 | 6 | 4 | 6 | 6 | 5 | 6 | 60773.0 | 0 | 0 | 1 |
10 | 4 | 4 | 4 | 5 | 3 | 5 | 2 | 7 | 8 | 7 | 60783.0 | 0 | 0 | 1 |
11 | 7 | 7 | 7 | 2 | 2 | 7 | 6 | 7 | 5 | 8 | 60790.0 | 0 | 1 | 0 |
12 | 8 | 8 | 8 | 6 | 6 | 8 | 7 | 8 | 6 | 8 | 60800.0 | 0 | 0 | 1 |
13 | 4 | 4 | 4 | 1 | 1 | 1 | 5 | 7 | 7 | 8 | 60801.0 | 0 | 0 | 1 |
14 | 8 | 7 | 7 | 7 | 7 | 7 | 10 | 5 | 6 | 10 | 60812.0 | 0 | 1 | 0 |
15 | 7 | 7 | 6 | 6 | 6 | 6 | 6 | 6 | 5 | 7 | 60817.0 | 0 | 0 | 1 |
16 | 7 | 6 | 6 | 5 | 5 | 1 | 7 | 7 | 4 | 7 | 60823.0 | 1 | 0 | 0 |
17 | 6 | 6 | 6 | 5 | 5 | 2 | 9 | 9 | 7 | 9 | 60939.0 | 0 | 0 | 1 |
18 | 9 | 9 | 9 | 9 | 9 | 7 | 6 | 5 | 4 | 10 | 443956.0 | 0 | 0 | 1 |
# Subset: Q8-Q11 together with the one-hot encoded Q6 answers.
dfTmp = df2[['Q8', 'Q9', 'Q10', 'Q11', 'Q6_Reading', 'Q6_Running', 'Q6_Watching a movie',]].copy()
#sns.pairplot(dfTmp, kind = 'reg')
# Correlate the subset that was just built. The original correlated `df`
# here, which ignored dfTmp and simply redrew the earlier heatmap.
corr = dfTmp.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
<AxesSubplot:>
# Show the original long column names again for reference.
initCol
['Q1_General background in data analysis?', 'Q2_Hands-on experience in data analysis using Python?', 'Q3_Experience in programming in general?', 'Q4_General background in machine learning?', 'Q5_Hands-on experience in running machine learning applications?', 'Q6_Which one would you prefer on a Sunday afternoon?', 'Q7_Hands-on experience in image analysis using satellite images?', 'Q8_Level of interest in mathematics?', 'Q9_Level of interest in reading?', 'Q10_Level of stress about this class?', 'Q11_Your overall motivation about this class?', 'tsRel']
# Correlation of the relative submission time with Q3, Q10 and Q11.
ts_cols = ['tsRel', 'Q3', 'Q10', 'Q11']
dfTmp = df.loc[:, ts_cols].copy()
corr = dfTmp.corr()
tick_labels = corr.columns
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)
print(corr.loc[:, 'tsRel'])
tsRel 1.000000 Q3 0.457261 Q10 -0.311850 Q11 0.430875 Name: tsRel, dtype: float64
# Scatter plot with a fitted regression line: Q3 versus relative submission time.
sns.regplot(data=dfTmp, x='tsRel', y='Q3', color="g")
<AxesSubplot:xlabel='tsRel', ylabel='Q3'>
# Distribution of tsRel: density-scaled histogram with a KDE curve and a rug.
# `distplot` is deprecated; histplot (stat="density", kde=True) plus rugplot
# is the replacement recommended by seaborn.
ax = sns.histplot(dfTmp.tsRel, kde=True, stat="density",
                  kde_kws=dict(cut=3), color="g")
sns.rugplot(dfTmp.tsRel, color="g", ax=ax)
/home/guraylab/anaconda3/envs/musa-650/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /home/guraylab/anaconda3/envs/musa-650/lib/python3.9/site-packages/seaborn/distributions.py:2103: FutureWarning: The `axis` variable is no longer used and will be removed. Instead, assign variables directly to `x` or `y`. warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='tsRel', ylabel='Density'>
# Same distribution, zoomed to the first 100000 seconds on the x axis.
# histplot replaces the deprecated distplot.
ax = sns.histplot(dfTmp.tsRel, kde=True, stat="density",
                  kde_kws=dict(cut=3), color="g")
ax.set_xlim(0, 100000)
#ax.set_ylim(0, 0.008)
/home/guraylab/anaconda3/envs/musa-650/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
(0.0, 100000.0)
# Median relative submission time (seconds since the first response).
dfTmp.tsRel.median()
60773.00000000001
# Offset of every response from the median submission time.
dfTmp.tsRel - dfTmp.tsRel.median()
0 -60773.0 1 -60498.0 2 -42829.0 3 -75.0 4 -43.0 5 -20.0 6 -13.0 7 -10.0 8 -4.0 9 0.0 10 10.0 11 17.0 12 27.0 13 28.0 14 39.0 15 44.0 16 50.0 17 166.0 18 383183.0 Name: tsRel, dtype: float64
# Show the parsed timestamps alongside the offsets above.
dfInit.Timestamp
0 2020-01-14 17:11:10 1 2020-01-14 17:15:45 2 2020-01-14 22:10:14 3 2020-01-15 10:02:48 4 2020-01-15 10:03:20 5 2020-01-15 10:03:43 6 2020-01-15 10:03:50 7 2020-01-15 10:03:53 8 2020-01-15 10:03:59 9 2020-01-15 10:04:03 10 2020-01-15 10:04:13 11 2020-01-15 10:04:20 12 2020-01-15 10:04:30 13 2020-01-15 10:04:31 14 2020-01-15 10:04:42 15 2020-01-15 10:04:47 16 2020-01-15 10:04:53 17 2020-01-15 10:06:49 18 2020-01-19 20:30:26 Name: Timestamp, dtype: datetime64[ns]
# Standardise tsRel to z-scores (pandas' default sample standard deviation).
ts = dfTmp['tsRel']
tsRel_z = (ts - ts.mean()) / ts.std()
tsRel_z
0 -0.783125 1 -0.780147 2 -0.588803 3 -0.125805 4 -0.125459 5 -0.125209 6 -0.125134 7 -0.125101 8 -0.125036 9 -0.124993 10 -0.124885 11 -0.124809 12 -0.124700 13 -0.124690 14 -0.124571 15 -0.124516 16 -0.124451 17 -0.123195 18 4.024628 Name: tsRel, dtype: float64
from sklearn.neighbors import LocalOutlierFactor

# Score each response time with Local Outlier Factor on the single tsRel
# feature, arranged as an (n_samples, 1) column.
X = dfTmp[['tsRel']].to_numpy()
clf = LocalOutlierFactor(n_neighbors=5, contamination=0.1)
clf.fit(X)

# Keep the fitted per-sample scores next to the data for inspection.
dfTmp['outScore'] = list(clf.negative_outlier_factor_)
dfTmp
tsRel | Q3 | Q10 | Q11 | outScore | |
---|---|---|---|---|---|
0 | 0.0 | 4 | 7 | 3 | -971.093118 |
1 | 275.0 | 5 | 7 | 8 | -969.221765 |
2 | 17944.0 | 6 | 7 | 7 | -848.985675 |
3 | 60698.0 | 6 | 5 | 10 | -2.550890 |
4 | 60730.0 | 5 | 10 | 8 | -1.646488 |
5 | 60753.0 | 8 | 2 | 8 | -1.071656 |
6 | 60760.0 | 1 | 8 | 8 | -0.988349 |
7 | 60763.0 | 7 | 6 | 9 | -0.988349 |
8 | 60769.0 | 5 | 5 | 7 | -1.012356 |
9 | 60773.0 | 6 | 5 | 6 | -1.025628 |
10 | 60783.0 | 4 | 8 | 7 | -0.890923 |
11 | 60790.0 | 7 | 5 | 8 | -0.915438 |
12 | 60800.0 | 8 | 6 | 8 | -1.035467 |
13 | 60801.0 | 4 | 7 | 8 | -1.023812 |
14 | 60812.0 | 7 | 6 | 10 | -1.085007 |
15 | 60817.0 | 6 | 5 | 7 | -1.085007 |
16 | 60823.0 | 6 | 4 | 7 | -1.194836 |
17 | 60939.0 | 6 | 7 | 9 | -5.640073 |
18 | 443956.0 | 9 | 4 | 10 | -13811.188253 |
# Outlier score against relative submission time.
dfTmp.plot(kind='scatter', x='tsRel', y='outScore', c='DarkBlue')
<AxesSubplot:xlabel='tsRel', ylabel='outScore'>
# Keep only rows whose outlier score lies strictly between -5 and 5.
score = dfTmp['outScore']
within_band = (score > -5) & (score < 5)
dfTmpFil = dfTmp[within_band]
dfTmpFil
tsRel | Q3 | Q10 | Q11 | outScore | |
---|---|---|---|---|---|
3 | 60698.0 | 6 | 5 | 10 | -2.550890 |
4 | 60730.0 | 5 | 10 | 8 | -1.646488 |
5 | 60753.0 | 8 | 2 | 8 | -1.071656 |
6 | 60760.0 | 1 | 8 | 8 | -0.988349 |
7 | 60763.0 | 7 | 6 | 9 | -0.988349 |
8 | 60769.0 | 5 | 5 | 7 | -1.012356 |
9 | 60773.0 | 6 | 5 | 6 | -1.025628 |
10 | 60783.0 | 4 | 8 | 7 | -0.890923 |
11 | 60790.0 | 7 | 5 | 8 | -0.915438 |
12 | 60800.0 | 8 | 6 | 8 | -1.035467 |
13 | 60801.0 | 4 | 7 | 8 | -1.023812 |
14 | 60812.0 | 7 | 6 | 10 | -1.085007 |
15 | 60817.0 | 6 | 5 | 7 | -1.085007 |
16 | 60823.0 | 6 | 4 | 7 | -1.194836 |
# Distribution of tsRel after removing the outlying responses.
# histplot replaces the deprecated distplot.
sns.histplot(dfTmpFil.tsRel, kde=True, stat="density",
             kde_kws=dict(cut=3), color="g")
/home/guraylab/anaconda3/envs/musa-650/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='tsRel', ylabel='Density'>
# Correlations recomputed on the outlier-filtered rows.
corr = dfTmpFil.corr()
tick_labels = corr.columns
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)
print(corr.loc[:, 'tsRel'])
tsRel 1.000000 Q3 0.130838 Q10 -0.163241 Q11 -0.344576 outScore 0.702471 Name: tsRel, dtype: float64
# Regression of Q3 on tsRel, this time on the outlier-filtered rows.
sns.regplot(data=dfTmpFil, x='tsRel', y='Q3', color="g")
<AxesSubplot:xlabel='tsRel', ylabel='Q3'>
# Display the full table with short column names.
df
Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 5 | 4 | 6 | 7 | Running | 5 | 3 | 5 | 7 | 3 | 0.0 |
1 | 8 | 8 | 5 | 5 | 6 | Reading | 7 | 7 | 6 | 7 | 8 | 275.0 |
2 | 6 | 6 | 6 | 6 | 5 | Watching a movie | 7 | 7 | 7 | 7 | 7 | 17944.0 |
3 | 5 | 3 | 6 | 4 | 4 | Watching a movie | 3 | 8 | 8 | 5 | 10 | 60698.0 |
4 | 6 | 6 | 5 | 4 | 3 | Reading | 4 | 5 | 4 | 10 | 8 | 60730.0 |
5 | 8 | 7 | 8 | 3 | 3 | Running | 8 | 4 | 10 | 2 | 8 | 60753.0 |
6 | 4 | 3 | 1 | 1 | 1 | Reading | 1 | 1 | 10 | 8 | 8 | 60760.0 |
7 | 7 | 3 | 7 | 6 | 5 | Reading | 4 | 6 | 8 | 6 | 9 | 60763.0 |
8 | 5 | 5 | 5 | 4 | 4 | Watching a movie | 4 | 4 | 5 | 5 | 7 | 60769.0 |
9 | 6 | 6 | 6 | 6 | 6 | Watching a movie | 4 | 6 | 6 | 5 | 6 | 60773.0 |
10 | 4 | 4 | 4 | 5 | 3 | Watching a movie | 5 | 2 | 7 | 8 | 7 | 60783.0 |
11 | 7 | 7 | 7 | 2 | 2 | Running | 7 | 6 | 7 | 5 | 8 | 60790.0 |
12 | 8 | 8 | 8 | 6 | 6 | Watching a movie | 8 | 7 | 8 | 6 | 8 | 60800.0 |
13 | 4 | 4 | 4 | 1 | 1 | Watching a movie | 1 | 5 | 7 | 7 | 8 | 60801.0 |
14 | 8 | 7 | 7 | 7 | 7 | Running | 7 | 10 | 5 | 6 | 10 | 60812.0 |
15 | 7 | 7 | 6 | 6 | 6 | Watching a movie | 6 | 6 | 6 | 5 | 7 | 60817.0 |
16 | 7 | 6 | 6 | 5 | 5 | Reading | 1 | 7 | 7 | 4 | 7 | 60823.0 |
17 | 6 | 6 | 6 | 5 | 5 | Watching a movie | 2 | 9 | 9 | 7 | 9 | 60939.0 |
18 | 9 | 9 | 9 | 9 | 9 | Watching a movie | 7 | 6 | 5 | 4 | 10 | 443956.0 |
# PCA with five components on df2 as-is (no feature scaling applied here).
mdlPCA = PCA(n_components=5)
XPCA = mdlPCA.fit_transform(df2)

# Per-component share of variance, their total, and a scree-style plot.
var_ratio = mdlPCA.explained_variance_ratio_
print(var_ratio)
print(var_ratio.sum())
plt.plot(np.arange(len(var_ratio)), var_ratio, '-ro')
[9.99999996e-01 1.81226177e-09 6.72112392e-10 5.91733094e-10 3.30434976e-10] 0.9999999994658001
[<matplotlib.lines.Line2D at 0x7f8fb3069bb0>]
# Loadings of the first principal component, printed and plotted per feature.
first_pc = mdlPCA.components_[0]
print(first_pc)
plt.plot(np.arange(first_pc.shape[0]), first_pc, '-ro')
[ 5.23403869e-06 7.40036105e-06 8.97709992e-06 9.61319140e-06 9.32793474e-06 3.94503126e-06 8.95009702e-07 -3.58017421e-06 -6.06213801e-06 7.55257058e-06 1.00000000e+00 -7.70261980e-07 -6.96515391e-07 1.46677737e-06]
[<matplotlib.lines.Line2D at 0x7f8fb2fd4e80>]
# Summary statistics per column of the dummy-encoded table.
df2.describe()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 |
mean | 6.473684 | 5.789474 | 5.789474 | 4.789474 | 4.631579 | 4.789474 | 5.736842 | 6.842105 | 6.000000 | 7.789474 | 72315.052632 | 0.263158 | 0.210526 | 0.526316 |
std | 1.540866 | 1.781976 | 1.812884 | 2.016018 | 2.113726 | 2.393949 | 2.256893 | 1.708253 | 1.795055 | 1.618605 | 92341.691330 | 0.452414 | 0.418854 | 0.512989 |
min | 4.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 2.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 5.500000 | 4.500000 | 5.000000 | 4.000000 | 3.000000 | 3.500000 | 4.500000 | 5.500000 | 5.000000 | 7.000000 | 60741.500000 | 0.000000 | 0.000000 | 0.000000 |
50% | 7.000000 | 6.000000 | 6.000000 | 5.000000 | 5.000000 | 5.000000 | 6.000000 | 7.000000 | 6.000000 | 8.000000 | 60773.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 8.000000 | 7.000000 | 7.000000 | 6.000000 | 6.000000 | 7.000000 | 7.000000 | 8.000000 | 7.000000 | 8.500000 | 60806.500000 | 0.500000 | 0.000000 | 1.000000 |
max | 9.000000 | 9.000000 | 9.000000 | 9.000000 | 9.000000 | 8.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 443956.000000 | 1.000000 | 1.000000 | 1.000000 |
# Min-max scale every column of df2 to the [0, 1] range.
col_min = df2.min()
col_span = df2.max() - col_min
df2_norm = (df2 - col_min) / col_span
df2_norm.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.8 | 0.333333 | 0.375 | 0.625 | 0.750 | 0.571429 | 0.222222 | 0.166667 | 0.625 | 0.000000 | 0.000000 | 0.0 | 1.0 | 0.0 |
1 | 0.8 | 0.833333 | 0.500 | 0.500 | 0.625 | 0.857143 | 0.666667 | 0.333333 | 0.625 | 0.714286 | 0.000619 | 1.0 | 0.0 | 0.0 |
2 | 0.4 | 0.500000 | 0.625 | 0.625 | 0.500 | 0.857143 | 0.666667 | 0.500000 | 0.625 | 0.571429 | 0.040418 | 0.0 | 0.0 | 1.0 |
3 | 0.2 | 0.000000 | 0.625 | 0.375 | 0.375 | 0.285714 | 0.777778 | 0.666667 | 0.375 | 1.000000 | 0.136721 | 0.0 | 0.0 | 1.0 |
4 | 0.4 | 0.500000 | 0.500 | 0.375 | 0.250 | 0.428571 | 0.444444 | 0.000000 | 1.000 | 0.714286 | 0.136793 | 1.0 | 0.0 | 0.0 |
# Summary statistics of the min-max scaled table.
df2_norm.describe()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 | 19.000000 |
mean | 0.494737 | 0.464912 | 0.598684 | 0.473684 | 0.453947 | 0.541353 | 0.526316 | 0.473684 | 0.500000 | 0.684211 | 0.162888 | 0.263158 | 0.210526 | 0.526316 |
std | 0.308173 | 0.296996 | 0.226611 | 0.252002 | 0.264216 | 0.341993 | 0.250766 | 0.284709 | 0.224382 | 0.231229 | 0.207997 | 0.452414 | 0.418854 | 0.512989 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.300000 | 0.250000 | 0.500000 | 0.375000 | 0.250000 | 0.357143 | 0.388889 | 0.250000 | 0.375000 | 0.571429 | 0.136819 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.600000 | 0.500000 | 0.625000 | 0.500000 | 0.500000 | 0.571429 | 0.555556 | 0.500000 | 0.500000 | 0.714286 | 0.136890 | 0.000000 | 0.000000 | 1.000000 |
75% | 0.800000 | 0.666667 | 0.750000 | 0.625000 | 0.625000 | 0.857143 | 0.666667 | 0.666667 | 0.625000 | 0.785714 | 0.136965 | 0.500000 | 0.000000 | 1.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# PCA with five components on the min-max scaled table.
mdlPCA = PCA(n_components=5)
XPCA = mdlPCA.fit_transform(df2_norm)

# Per-component share of variance, their total, and a scree-style plot.
var_ratio = mdlPCA.explained_variance_ratio_
print(var_ratio)
print(var_ratio.sum())
plt.plot(np.arange(len(var_ratio)), var_ratio, '-ro')
[0.31431827 0.29012089 0.15348486 0.07788586 0.04541328] 0.8812231522564272
[<matplotlib.lines.Line2D at 0x7f8fb2f5d070>]
# Loadings of the first principal component, printed and plotted per feature.
first_pc = mdlPCA.components_[0]
print(first_pc)
plt.plot(np.arange(first_pc.shape[0]), first_pc, '-ro')
[-0.28489905 -0.3141605 -0.26107415 -0.25405393 -0.27500032 -0.36871538 -0.17301488 0.12875075 0.16485422 -0.03296608 -0.13459097 0.4957779 -0.16340968 -0.33236822]
[<matplotlib.lines.Line2D at 0x7f8fb2f3cac0>]
# Drop every column whose name contains "tsRel", using a boolean column mask.
is_ts_col = df2_norm.columns.str.contains('tsRel')
df2_noTs = df2_norm[df2_norm.columns[~is_ts_col]]
df2_noTs.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.8 | 0.333333 | 0.375 | 0.625 | 0.750 | 0.571429 | 0.222222 | 0.166667 | 0.625 | 0.000000 | 0.0 | 1.0 | 0.0 |
1 | 0.8 | 0.833333 | 0.500 | 0.500 | 0.625 | 0.857143 | 0.666667 | 0.333333 | 0.625 | 0.714286 | 1.0 | 0.0 | 0.0 |
2 | 0.4 | 0.500000 | 0.625 | 0.625 | 0.500 | 0.857143 | 0.666667 | 0.500000 | 0.625 | 0.571429 | 0.0 | 0.0 | 1.0 |
3 | 0.2 | 0.000000 | 0.625 | 0.375 | 0.375 | 0.285714 | 0.777778 | 0.666667 | 0.375 | 1.000000 | 0.0 | 0.0 | 1.0 |
4 | 0.4 | 0.500000 | 0.500 | 0.375 | 0.250 | 0.428571 | 0.444444 | 0.000000 | 1.000 | 0.714286 | 1.0 | 0.0 | 0.0 |
# Same result as the mask-based version, via an explicit column drop.
df2_noTs = df2_norm.drop('tsRel', axis=1)
df2_noTs.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.8 | 0.333333 | 0.375 | 0.625 | 0.750 | 0.571429 | 0.222222 | 0.166667 | 0.625 | 0.000000 | 0.0 | 1.0 | 0.0 |
1 | 0.8 | 0.833333 | 0.500 | 0.500 | 0.625 | 0.857143 | 0.666667 | 0.333333 | 0.625 | 0.714286 | 1.0 | 0.0 | 0.0 |
2 | 0.4 | 0.500000 | 0.625 | 0.625 | 0.500 | 0.857143 | 0.666667 | 0.500000 | 0.625 | 0.571429 | 0.0 | 0.0 | 1.0 |
3 | 0.2 | 0.000000 | 0.625 | 0.375 | 0.375 | 0.285714 | 0.777778 | 0.666667 | 0.375 | 1.000000 | 0.0 | 0.0 | 1.0 |
4 | 0.4 | 0.500000 | 0.500 | 0.375 | 0.250 | 0.428571 | 0.444444 | 0.000000 | 1.000 | 0.714286 | 1.0 | 0.0 | 0.0 |
# PCA with five components on the scaled table without the tsRel column.
mdlPCA = PCA(n_components=5)
XPCA = mdlPCA.fit_transform(df2_noTs)

# Per-component share of variance, their total, and a scree-style plot.
var_ratio = mdlPCA.explained_variance_ratio_
print(var_ratio)
print(var_ratio.sum())
plt.plot(np.arange(len(var_ratio)), var_ratio, '-ro')
[0.31894549 0.29885772 0.1547158 0.07966809 0.04678632] 0.898973407637008
[<matplotlib.lines.Line2D at 0x7f8fb2ea4910>]
# Loadings of the first principal component, printed and plotted per feature.
first_pc = mdlPCA.components_[0]
print(first_pc)
plt.plot(np.arange(first_pc.shape[0]), first_pc, '-ro')
[-0.30244657 -0.32214438 -0.26410732 -0.25227651 -0.27659662 -0.3869891 -0.17634825 0.12991123 0.1677439 -0.02453888 0.49300779 -0.20299601 -0.29001178]
[<matplotlib.lines.Line2D at 0x7f8fb2e8f640>]
# Keep responses with tsRel strictly between 60000 and 61000 seconds,
# then min-max scale the remaining rows column by column.
in_window = (df2.tsRel > 60000) & (df2.tsRel < 61000)
df2Fil = df2[in_window]
fil_min = df2Fil.min()
fil_span = df2Fil.max() - fil_min
df2Fil_norm = (df2Fil - fil_min) / fil_span
df2Fil_norm.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | tsRel | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0.25 | 0.0 | 0.714286 | 0.500000 | 0.500000 | 0.285714 | 0.777778 | 0.666667 | 0.375 | 1.00 | 0.000000 | 0.0 | 0.0 | 1.0 |
4 | 0.50 | 0.6 | 0.571429 | 0.500000 | 0.333333 | 0.428571 | 0.444444 | 0.000000 | 1.000 | 0.50 | 0.132780 | 1.0 | 0.0 | 0.0 |
5 | 1.00 | 0.8 | 1.000000 | 0.333333 | 0.333333 | 1.000000 | 0.333333 | 1.000000 | 0.000 | 0.50 | 0.228216 | 0.0 | 1.0 | 0.0 |
6 | 0.00 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.750 | 0.50 | 0.257261 | 1.0 | 0.0 | 0.0 |
7 | 0.75 | 0.0 | 0.857143 | 0.833333 | 0.666667 | 0.428571 | 0.555556 | 0.666667 | 0.500 | 0.75 | 0.269710 | 1.0 | 0.0 | 0.0 |
# PCA with five components on the time-filtered, min-max scaled table.
mdlPCA = PCA(n_components=5)
XPCA = mdlPCA.fit_transform(df2Fil_norm)

# Per-component share of variance, their total, and a scree-style plot.
var_ratio = mdlPCA.explained_variance_ratio_
print(var_ratio)
print(var_ratio.sum())
plt.plot(np.arange(len(var_ratio)), var_ratio, '-ro')
[0.34631596 0.27017825 0.14469026 0.07712654 0.05616314] 0.8944741480656596
[<matplotlib.lines.Line2D at 0x7f8fb2dfd2b0>]
# Loadings of the first principal component, printed and plotted per feature.
first_pc = mdlPCA.components_[0]
print(first_pc)
plt.plot(np.arange(first_pc.shape[0]), first_pc, '-ro')
[-0.43656834 -0.36520933 -0.30003171 -0.22317841 -0.26032745 -0.38821589 -0.21312891 0.05576004 0.17168435 -0.07017853 -0.05979342 0.27181642 -0.3812503 0.10943387]
[<matplotlib.lines.Line2D at 0x7f8fb2d60880>]
# Table without the timestamp feature, plus its min-max scaled version.
df3 = df2.drop('tsRel', axis=1)
df3_min = df3.min()
df3_span = df3.max() - df3_min
df3_norm = (df3 - df3_min) / df3_span
df3_norm.head()
Q1 | Q2 | Q3 | Q4 | Q5 | Q7 | Q8 | Q9 | Q10 | Q11 | Q6_Reading | Q6_Running | Q6_Watching a movie | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.8 | 0.333333 | 0.375 | 0.625 | 0.750 | 0.571429 | 0.222222 | 0.166667 | 0.625 | 0.000000 | 0.0 | 1.0 | 0.0 |
1 | 0.8 | 0.833333 | 0.500 | 0.500 | 0.625 | 0.857143 | 0.666667 | 0.333333 | 0.625 | 0.714286 | 1.0 | 0.0 | 0.0 |
2 | 0.4 | 0.500000 | 0.625 | 0.625 | 0.500 | 0.857143 | 0.666667 | 0.500000 | 0.625 | 0.571429 | 0.0 | 0.0 | 1.0 |
3 | 0.2 | 0.000000 | 0.625 | 0.375 | 0.375 | 0.285714 | 0.777778 | 0.666667 | 0.375 | 1.000000 | 0.0 | 0.0 | 1.0 |
4 | 0.4 | 0.500000 | 0.500 | 0.375 | 0.250 | 0.428571 | 0.444444 | 0.000000 | 1.000 | 0.714286 | 1.0 | 0.0 | 0.0 |
# Predictor columns: positions 0-7 and 10-12 of df3; positions 8 and 9 are
# left out. Uses numpy directly: the `pd.np` alias is deprecated in pandas.
colX = df3.columns[np.r_[0:8, 10:13]]
colX.tolist()
/tmp/ipykernel_1809247/2650173780.py:1: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead colX = df3.columns[pd.np.r_[0:8,10:13]]
['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q7', 'Q8', 'Q9', 'Q6_Reading', 'Q6_Running', 'Q6_Watching a movie']
# Candidate regression targets. Each assignment overwrites the previous one,
# so only the last assignment (Q11) is in effect afterwards.
colY = ['Q10'] # level of stress;
colY = ['Q11'] # level of motivation;
colY
['Q11']
# Feature matrix as a plain NumPy array (rows = responses, columns = colX).
X = df3[colX].to_numpy()
print(X.shape)
X
(19, 11)
array([[ 8, 5, 4, 6, 7, 5, 3, 5, 0, 1, 0], [ 8, 8, 5, 5, 6, 7, 7, 6, 1, 0, 0], [ 6, 6, 6, 6, 5, 7, 7, 7, 0, 0, 1], [ 5, 3, 6, 4, 4, 3, 8, 8, 0, 0, 1], [ 6, 6, 5, 4, 3, 4, 5, 4, 1, 0, 0], [ 8, 7, 8, 3, 3, 8, 4, 10, 0, 1, 0], [ 4, 3, 1, 1, 1, 1, 1, 10, 1, 0, 0], [ 7, 3, 7, 6, 5, 4, 6, 8, 1, 0, 0], [ 5, 5, 5, 4, 4, 4, 4, 5, 0, 0, 1], [ 6, 6, 6, 6, 6, 4, 6, 6, 0, 0, 1], [ 4, 4, 4, 5, 3, 5, 2, 7, 0, 0, 1], [ 7, 7, 7, 2, 2, 7, 6, 7, 0, 1, 0], [ 8, 8, 8, 6, 6, 8, 7, 8, 0, 0, 1], [ 4, 4, 4, 1, 1, 1, 5, 7, 0, 0, 1], [ 8, 7, 7, 7, 7, 7, 10, 5, 0, 1, 0], [ 7, 7, 6, 6, 6, 6, 6, 6, 0, 0, 1], [ 7, 6, 6, 5, 5, 1, 7, 7, 1, 0, 0], [ 6, 6, 6, 5, 5, 2, 9, 9, 0, 0, 1], [ 9, 9, 9, 9, 9, 7, 6, 5, 0, 0, 1]])
# Target as a 2-D column array, because colY is a one-element list.
y = df3[colY].to_numpy()
print(y.shape)
y
(19, 1)
array([[ 3], [ 8], [ 7], [10], [ 8], [ 8], [ 8], [ 9], [ 7], [ 6], [ 7], [ 8], [ 8], [ 8], [10], [ 7], [ 7], [ 9], [10]])
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
#scaler = StandardScaler()
# Fit a min-max scaler on all rows of X and apply it to the same rows.
scaler = MinMaxScaler().fit(X)
Xnorm = scaler.transform(X)
# Per-feature maximum of the scaled matrix.
np.max(Xnorm, axis=0)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
# Show the train/test index split produced by leave-one-out for every fold.
loo = LeaveOneOut()
for train_index, test_index in loo.split(Xnorm):
    print(f'TRAIN: {train_index} TEST: {test_index}')
TRAIN: [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [0] TRAIN: [ 0 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [1] TRAIN: [ 0 1 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [2] TRAIN: [ 0 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [3] TRAIN: [ 0 1 2 3 5 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [4] TRAIN: [ 0 1 2 3 4 6 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [5] TRAIN: [ 0 1 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 18] TEST: [6] TRAIN: [ 0 1 2 3 4 5 6 8 9 10 11 12 13 14 15 16 17 18] TEST: [7] TRAIN: [ 0 1 2 3 4 5 6 7 9 10 11 12 13 14 15 16 17 18] TEST: [8] TRAIN: [ 0 1 2 3 4 5 6 7 8 10 11 12 13 14 15 16 17 18] TEST: [9] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 11 12 13 14 15 16 17 18] TEST: [10] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 12 13 14 15 16 17 18] TEST: [11] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 17 18] TEST: [12] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 14 15 16 17 18] TEST: [13] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 17 18] TEST: [14] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 16 17 18] TEST: [15] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 18] TEST: [16] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18] TEST: [17] TRAIN: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] TEST: [18]
import warnings

from sklearn.svm import LinearSVR

# All warnings are silenced for the rest of the session (kept as in the original).
warnings.filterwarnings('ignore')

# Leave-one-out evaluation: train a LinearSVR on all rows but one and predict
# the held-out row; predAll collects the held-out prediction of every row.
# NOTE(review): Xnorm comes from a scaler fitted on all rows of X, so each
# held-out row influenced the scaling used in its own fold.
loo = LeaveOneOut()
predAll = np.zeros([y.shape[0], 1])
for train_index, test_index in loo.split(Xnorm):
    X_train, X_test = Xnorm[train_index], Xnorm[test_index]
    y_train, y_test = y[train_index], y[test_index]
    regr = LinearSVR(random_state=0, tol=1e-5)
    # y is an (n, 1) column; flatten it to the 1-D target the estimator expects.
    regr.fit(X_train, y_train.ravel())  # Train the model
    ypred = regr.predict(X_test)  # Apply the model
    # Store by the held-out row's index instead of a manually kept counter.
    predAll[test_index, 0] = ypred
    print('Y : ' + str(y_test) + ' , pred as: ' + str(ypred))
Y : [[3]] , pred as: [6.71770473] Y : [[8]] , pred as: [8.44358673] Y : [[7]] , pred as: [8.57175498] Y : [[10]] , pred as: [7.62213136] Y : [[8]] , pred as: [6.70446542] Y : [[8]] , pred as: [8.9269527] Y : [[8]] , pred as: [5.14777884] Y : [[9]] , pred as: [8.02397316] Y : [[7]] , pred as: [6.22595715] Y : [[6]] , pred as: [7.94496256] Y : [[7]] , pred as: [6.21621235] Y : [[8]] , pred as: [7.00076291] Y : [[8]] , pred as: [9.45348764] Y : [[8]] , pred as: [6.27163448] Y : [[10]] , pred as: [6.519619] Y : [[7]] , pred as: [8.10618284] Y : [[7]] , pred as: [8.58400087] Y : [[9]] , pred as: [7.92971992] Y : [[10]] , pred as: [6.78948034]
# Correlation matrix between the true targets and the leave-one-out predictions.
np.corrcoef(y.T, predAll.T)
array([[1. , 0.01086243], [0.01086243, 1. ]])
# True target values (x axis) against leave-one-out predictions (y axis).
plt.scatter(y.T, predAll.T)
<matplotlib.collections.PathCollection at 0x7f8fb2cd9040>