%useLatestDescriptors
%use dataframe, kandy
// Load the Titanic dataset. The file is ';'-delimited and uses French-style
// decimal separators, hence the French locale in the parser options.
val df = DataFrame.readCSV(
fileOrUrl = "../../idea-examples/titanic/src/main/resources/titanic.csv",
delimiter = ';',
parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)
// Preview the first rows
df.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | null | 29.000000 | null | null | 24160 | 211.337500 | B5 | null | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1 | 2 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
We have a dataset that uses an alternative pattern for decimal numbers. This is why the French locale is used in this example.
But before converting the data, we should handle the null values.
// Per-column summary statistics, including the null counts we need to handle
df.describe()
name | type | count | unique | nulls | top | freq | mean | std | min | median | max |
---|---|---|---|---|---|---|---|---|---|---|---|
pclass | Int | 1309 | 3 | 0 | 3 | 709 | 2.294882 | 0.837836 | 1 | 3 | 3 |
survived | Int | 1309 | 2 | 0 | 0 | 809 | 0.381971 | 0.486055 | 0 | 0 | 1 |
name | String | 1309 | 1307 | 0 | Connolly, Miss. Kate | 2 | null | null | Abbing, Mr. Anthony | Kink, Mr. Vincenz | van Melkebeke, Mr. Philemon |
sex | String? | 1309 | 3 | 1 | male | 843 | null | null | female | male | male |
age | Double? | 1309 | 99 | 263 | 24.000000 | 47 | 29.881135 | 14.413500 | 0.166700 | 28.000000 | 80.000000 |
sibsp | Int? | 1309 | 8 | 1 | 0 | 890 | 0.499235 | 1.041965 | 0 | 0 | 8 |
parch | Int? | 1309 | 9 | 1 | 0 | 1001 | 0.385321 | 0.865826 | 0 | 0 | 9 |
ticket | String | 1309 | 929 | 0 | CA. 2343 | 11 | null | null | 110152 | 347082 | WE/P 5735 |
fare | Double? | 1309 | 282 | 1 | 8.050000 | 60 | 33.295479 | 51.758668 | 0.000000 | 14.454200 | 512.329200 |
cabin | String? | 1309 | 187 | 1014 | C23 C25 C27 | 6 | null | null | A10 | C62 C64 | T |
embarked | String? | 1309 | 5 | 3 | S | 912 | null | null | AA | S | S |
boat | String? | 1309 | 28 | 823 | 13 | 39 | null | null | 1 | 3 | D |
body | Int? | 1309 | 122 | 1188 | 135 | 1 | 160.809917 | 97.696922 | 1 | 155 | 328 |
homedest | String? | 1309 | 370 | 564 | New York, NY | 64 | null | null | ?Havana, Cuba | Lyndhurst, England | Zurich, Switzerland |
// Render the raw dataframe
df
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | null | 29.000000 | null | null | 24160 | 211.337500 | B5 | null | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1 | 2 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 1 | Anderson, Mr. Harry | male | 48.000000 | 0 | 0 | 19952 | 26.550000 | E12 | S | 3 | null | New York, NY |
1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.000000 | 1 | 0 | 13502 | 77.958300 | D7 | S | 10 | null | Hudson, NY |
1 | 0 | Andrews, Mr. Thomas Jr | male | 39.000000 | 0 | 0 | 112050 | 0.000000 | A36 | S | null | null | Belfast, NI |
1 | 1 | Appleton, Mrs. Edward Dale (Charlotte... | female | 53.000000 | 2 | 0 | 11769 | 51.479200 | C101 | S | D | null | Bayside, Queens, NY |
1 | 0 | Artagaveytia, Mr. Ramon | male | 71.000000 | 0 | 0 | PC 17609 | 49.504200 | null | C | null | 22 | Montevideo, Uruguay |
1 | 0 | Astor, Col. John Jacob | male | 47.000000 | 1 | 0 | PC 17757 | 227.525000 | C62 C64 | C | null | 124 | New York, NY |
1 | 1 | Astor, Mrs. John Jacob (Madeleine Tal... | female | 18.000000 | 1 | 0 | PC 17757 | 227.525000 | C62 C64 | C | 4 | null | New York, NY |
1 | 1 | Aubart, Mme. Leontine Pauline | female | 24.000000 | 0 | 0 | PC 17477 | 69.300000 | B35 | C | 9 | null | Paris, France |
1 | 1 | Barber, Miss. Ellen "Nellie" | female | 26.000000 | 0 | 0 | 19877 | 78.850000 | null | S | 6 | null | null |
1 | 1 | Barkworth, Mr. Algernon Henry Wilson | male | 80.000000 | 0 | 0 | 27042 | 30.000000 | A23 | S | B | null | Hessle, Yorks |
1 | 0 | Baumann, Mr. John D | male | null | 0 | 0 | PC 17318 | 25.925000 | null | S | null | null | New York, NY |
1 | 0 | Baxter, Mr. Quigg Edmond | male | 24.000000 | 0 | 1 | PC 17558 | 247.520800 | B58 B60 | C | null | null | Montreal, PQ |
1 | 1 | Baxter, Mrs. James (Helene DeLaudenie... | female | 50.000000 | 0 | 1 | PC 17558 | 247.520800 | B58 B60 | C | 6 | null | Montreal, PQ |
1 | 1 | Bazzani, Miss. Albina | female | 32.000000 | 0 | 0 | 11813 | 76.291700 | D15 | C | 8 | null | null |
1 | 0 | Beattie, Mr. Thomson | male | 36.000000 | 0 | 0 | 13050 | 75.241700 | C6 | C | A | null | Winnipeg, MN |
Let's convert all columns of our dataset to non-nullable and impute null values based on mean values.
// Impute nulls so every feature column becomes non-nullable:
// numeric columns get their per-column mean; sex/embarked get fixed fallbacks.
val df1 = df
// imputing
.fillNulls { sibsp and parch and age and fare }.perCol { mean() }
.fillNulls { sex }.with { "other" }
.fillNulls { embarked }.with { "S" }
// unify the numeric feature columns to Double
.convert { sibsp and parch and age and fare }.toDouble()
// preview the imputed frame
df1.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | other | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | S | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
// Confirm the resulting column types are non-nullable after imputation
df1.schema()
pclass: Int survived: Int name: String sex: String age: Double sibsp: Double parch: Double ticket: String fare: Double cabin: String? embarked: String boat: String? body: Int? homedest: String?
// Pairwise correlations between all numeric columns
df1.corr()
column | pclass | survived | age | sibsp | parch | fare |
---|---|---|---|---|---|---|
pclass | 1.000000 | -0.312469 | -0.366370 | 0.060832 | 0.018322 | -0.558477 |
survived | -0.312469 | 1.000000 | -0.050199 | -0.027825 | 0.082660 | 0.244208 |
age | -0.366370 | -0.050199 | 1.000000 | -0.190747 | -0.130872 | 0.171521 |
sibsp | 0.060832 | -0.027825 | -0.190747 | 1.000000 | 0.373587 | 0.160224 |
parch | 0.018322 | 0.082660 | -0.130872 | 0.373587 | 1.000000 | 0.221522 |
fare | -0.558477 | 0.244208 | 0.171521 | 0.160224 | 0.221522 | 1.000000 |
// Correlate every column with survived and order by correlation strength
val correlations = df1
.corr { all() }.with { survived }
.sortBy { survived }
correlations
column | survived |
---|---|
pclass | -0.312469 |
age | -0.050199 |
sibsp | -0.027825 |
parch | 0.082660 |
fare | 0.244208 |
survived | 1.000000 |
Great, at this moment we have 5 numerical features available for numerical analysis: pclass, age, sibsp, parch, fare.
To confirm some of our observations and assumptions, we can quickly analyze our feature correlations by pivoting features against each other. We can only do so at this stage for features which do not have any empty values. It also makes sense doing so only for features which are categorical (Sex), ordinal (Pclass) or discrete (SibSp, Parch) type.
Pclass: We observe significant correlation (>0.5) among Pclass=1 and Survived.
Sex: We confirm the observation during problem definition that Sex=female had a very high survival rate at 74%.
SibSp and Parch: These features have zero correlation for certain values. It may be best to derive a feature or a set of features from these individual features.
// Mean survival rate per passenger class
df1.groupBy { pclass }.mean { survived }.sortBy { pclass }
pclass | survived |
---|---|
1 | 0.619195 |
2 | 0.429603 |
3 | 0.255289 |
// Mean survival rate per sex (includes the imputed "other" bucket)
df1.groupBy { sex }.mean { survived }.sortBy { survived }
sex | survived |
---|---|
male | 0.190985 |
female | 0.726882 |
other | 1.000000 |
// Mean survival rate grouped by sibsp
df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }
sibsp | survived |
---|---|
0.000000 | 0.346801 |
1.000000 | 0.510972 |
2.000000 | 0.452381 |
3.000000 | 0.300000 |
4.000000 | 0.136364 |
5.000000 | 0.000000 |
8.000000 | 0.000000 |
// Mean survival rate grouped by parch
df1.groupBy { parch }.mean { survived }.sortBy { parch }
parch | survived |
---|---|
0.000000 | 0.335329 |
1.000000 | 0.588235 |
2.000000 | 0.504425 |
3.000000 | 0.625000 |
4.000000 | 0.166667 |
5.000000 | 0.166667 |
6.000000 | 0.000000 |
9.000000 | 0.000000 |
It's interesting to discover both age distributions: among survived and not survived passengers.
// Passenger count per distinct age value, ordered by age
val byAge = df1.valueCounts { age }.sortBy { age }
byAge
age | count |
---|---|
0.166700 | 1 |
0.333300 | 1 |
0.416700 | 1 |
0.666700 | 1 |
0.750000 | 3 |
0.833300 | 3 |
0.916700 | 2 |
1.000000 | 10 |
2.000000 | 12 |
3.000000 | 7 |
4.000000 | 10 |
5.000000 | 5 |
6.000000 | 6 |
7.000000 | 4 |
8.000000 | 6 |
9.000000 | 10 |
10.000000 | 4 |
11.000000 | 4 |
11.500000 | 1 |
12.000000 | 3 |
// JetBrains color palette
// Shared brand colors reused by every plot in this notebook.
object JetBrainsColors {
val lightOrange = Color.hex("#ffb59e")
val orange = Color.hex("#ff6632")
val lightGrey = Color.hex("#a6a6a6")
val darkGrey = Color.hex("#4c4c4c")
}
// Scatter plot: passenger count per age value
byAge.plot {
points {
x(age)
y(count)
size = 5.0
color = JetBrainsColors.lightGrey
}
layout {
size = 850 to 500
}
}
// Age distribution as a histogram with 5-year bins.
// NOTE: this `val age` intentionally reuses the column name for the
// single-column frame of non-null ages.
val age = df.select { age }.dropNulls().sortBy { age }
age.plot {
histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
fillColor = JetBrainsColors.orange
}
layout {
size = 850 to 500
}
}
// For each age, count passengers per survived value (pivoted into columns)
df1.groupBy { age }.pivotCounts { survived }.sortBy { age }
age | survived | |
---|---|---|
1 | 0 | |
0.166700 | 1 | 0 |
0.333300 | 0 | 1 |
0.416700 | 1 | 0 |
0.666700 | 1 | 0 |
0.750000 | 2 | 1 |
0.833300 | 3 | 0 |
0.916700 | 2 | 0 |
1.000000 | 7 | 3 |
2.000000 | 4 | 8 |
3.000000 | 5 | 2 |
4.000000 | 7 | 3 |
5.000000 | 4 | 1 |
6.000000 | 3 | 3 |
7.000000 | 2 | 2 |
8.000000 | 4 | 2 |
9.000000 | 4 | 6 |
10.000000 | 0 | 4 |
11.000000 | 1 | 3 |
11.500000 | 0 | 1 |
12.000000 | 3 | 0 |
// Keep only survived/age, order by age, and replace the 0/1 flag with
// human-readable labels for plotting legends.
val survivedByAge = df1
.select { survived and age }
.sortBy { age }
.convert { survived }.with { if (it == 1) "Survived" else "Died" }
// preview
survivedByAge
survived | age |
---|---|
Survived | 0.166700 |
Died | 0.333300 |
Survived | 0.416700 |
Survived | 0.666700 |
Survived | 0.750000 |
Survived | 0.750000 |
Died | 0.750000 |
Survived | 0.833300 |
Survived | 0.833300 |
Survived | 0.833300 |
Survived | 0.916700 |
Survived | 0.916700 |
Survived | 1.000000 |
Survived | 1.000000 |
Survived | 1.000000 |
Survived | 1.000000 |
Died | 1.000000 |
Survived | 1.000000 |
Died | 1.000000 |
Survived | 1.000000 |
// Group rows by the survival label
survivedByAge.groupBy { survived }
survived | group | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Survived | DataFrame [500 x 2]
... showing only top 5 of 500 rows | ||||||||||||
Died | DataFrame [809 x 2]
... showing only top 5 of 809 rows |
// Side-by-side (dodged) age histograms for survivors vs. casualties
survivedByAge.groupBy { survived }.plot {
histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
fillColor(key.survived) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
alpha = 0.7
position = Position.dodge()
}
layout {
size = 850 to 500
}
}
// Density plot
// Age density per survival outcome; outcome is encoded in the border line color.
survivedByAge.groupBy { survived }.plot {
densityPlot(x = age) {
fillColor = Color.GREY
alpha = 0.3
borderLine {
color(key.survived) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
}
}
layout {
size = 850 to 250
}
}
// Group rows by the survival label
survivedByAge.groupBy { survived }
survived | group | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Survived | DataFrame [500 x 2]
... showing only top 5 of 500 rows | ||||||||||||
Died | DataFrame [809 x 2]
... showing only top 5 of 809 rows |
// A basic box plot
// Age distribution per survival outcome, one colored box per category.
survivedByAge.plot {
boxplot(x = survived, y = age) {
boxes {
fillColor(Stat.x) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
}
}
layout {
size = 500 to 400
}
}
Seems like we have the same age distribution among survived and not survived passengers.
To prepare data for the ML algorithms, we should replace all String values in categorical features with numbers. There are a few ways to preprocess categorical features, and One Hot Encoding is one of them. We will use the pivotMatches
operation to convert categorical columns into sets of nested Boolean
columns per every unique value.
// One-hot encode pclass, sex and embarked into nested Boolean columns
val pivoted = df1.pivotMatches { pclass and sex and embarked }
pivoted.head()
survived | name | age | sibsp | parch | ticket | fare | cabin | boat | body | homedest | pclass | sex | embarked | |||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 3 | other | male | female | S | AA | C | Q | |||||||||||
1 | Allen, Miss. Elisabeth Walton | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | 2 | null | St Louis, MO | true | false | false | true | false | false | true | false | false | false |
1 | Allison, Master. Hudson Trevor | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | 11 | null | Montreal, PQ / Chesterville, ON | true | false | false | false | true | false | false | true | false | false |
0 | Allison, Miss. Helen Loraine | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | null | Montreal, PQ / Chesterville, ON | true | false | false | false | false | true | true | false | false | false |
0 | Allison, Mr. Hudson Joshua Creighton | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | 135 | Montreal, PQ / Chesterville, ON | true | false | false | false | true | false | true | false | false | false |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | null | Montreal, PQ / Chesterville, ON | true | false | false | false | false | true | true | false | false | false |
// Select the model features and convert every value column to Double
val df2 = pivoted
// feature extraction
.select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
.convert { valueCols() }.toDouble()
df2.head()
survived | pclass | sibsp | parch | age | fare | sex | embarked | |||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 3 | other | male | female | S | AA | C | Q | |||||
1.000000 | true | false | false | 0.000000 | 0.000000 | 29.000000 | 211.337500 | true | false | false | true | false | false | false |
1.000000 | true | false | false | 1.000000 | 2.000000 | 0.916700 | 151.550000 | false | true | false | false | true | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 2.000000 | 151.550000 | false | false | true | true | false | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 30.000000 | 151.550000 | false | true | false | true | false | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 25.000000 | 151.550000 | false | false | true | true | false | false | false |
// Correlation matrix of the numeric features
df2.corr { survived and sibsp and parch and age and fare }.withItself()
column | survived | sibsp | parch | age | fare |
---|---|---|---|---|---|
survived | 1.000000 | -0.027825 | 0.082660 | -0.050199 | 0.244208 |
sibsp | -0.027825 | 1.000000 | 0.373587 | -0.190747 | 0.160224 |
parch | 0.082660 | 0.373587 | 1.000000 | -0.130872 | 0.221522 |
age | -0.050199 | -0.190747 | -0.130872 | 1.000000 | 0.171521 |
fare | 0.244208 | 0.160224 | 0.221522 | 0.171521 | 1.000000 |
// Long-format correlation table — one (column, row, value) triple per pair —
// convenient for heatmap-style plotting.
val correlationTable = df2
.corr { survived and sibsp and parch and age and fare }.withItself()
.gather { allAfter("column") }.into("row", "value")
correlationTable
column | row | value |
---|---|---|
survived | survived | 1.000000 |
survived | sibsp | -0.027825 |
survived | parch | 0.082660 |
survived | age | -0.050199 |
survived | fare | 0.244208 |
sibsp | survived | -0.027825 |
sibsp | sibsp | 1.000000 |
sibsp | parch | 0.373587 |
sibsp | age | -0.190747 |
sibsp | fare | 0.160224 |
parch | survived | 0.082660 |
parch | sibsp | 0.373587 |
parch | parch | 1.000000 |
parch | age | -0.130872 |
parch | fare | 0.221522 |
age | survived | -0.050199 |
age | sibsp | -0.190747 |
age | parch | -0.130872 |
age | age | 1.000000 |
age | fare | 0.171521 |
// Diverging color scale spanning the full correlation range [-1, 1].
fun scaleContinuousColorGradientN() = continuousColorGradientN(
gradientColors = listOf(
JetBrainsColors.orange,
JetBrainsColors.lightGrey,
JetBrainsColors.darkGrey,
),
domainMin = -1.0,
domainMax = 1.0,
)
// Correlation heatmap: tiles colored by correlation value
correlationTable.plot {
tiles {
x(row) { axis.name = "" }
y(column) { axis.name = "" }
fillColor(value) { scale = scaleContinuousColorGradientN() }
}
}
// Correlation matrix rendered as squares sized and colored by the
// correlation value; the size legend and major gridlines are hidden.
correlationTable.plot {
points {
size(value) {
legend {
breaks(emptyList())
}
}
symbol = Symbol.SQUARE
x(row) {
axis.name = ""
}
y(column) {
axis.name = ""
}
color(value) { scale = scaleContinuousColorGradientN() }
}
layout {
style {
panel.grid {
majorLine {
blank = true
}
}
}
size = 500 to 350
}
}
We suggest combining the sibsp and parch features into a single new feature named familyNumber, computed as a simple sum of sibsp and parch.
// Derive familyNumber as the total number of relatives aboard (sibsp + parch)
val familyDF = df1
.add("familyNumber") { sibsp + parch }
familyDF.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest | familyNumber |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | other | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | S | 2 | null | St Louis, MO | 0.000000 |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON | 3.000000 |
// Correlation of the new feature with survival
familyDF.corr { familyNumber }.with { survived }
column | survived |
---|---|
familyNumber | 0.026876 |
// Correlation of the new feature with age
familyDF.corr { familyNumber }.with { age }
column | age |
---|---|
familyNumber | -0.196996 |
Looks like the new feature has no influence on the survived column, but it has a strong negative correlation with age.
Let's try to extract something from the names. Many strings in the name column contain special titles, like Don, Mr, Mrs, and so on.
// Extract the honorific title (Mr, Mrs, Miss, ...) from each passenger name.
// Names look like "Surname, Title. Given names", so the title is the text
// between the first comma and the first period.
val titledDF = df
    .select { survived and name }
    .add("title") {
        val beforeFirstDot = name.split(".").first()
        beforeFirstDot.split(",")[1].trim()
    }
titledDF.head(100)
survived | name | title |
---|---|---|
1 | Allen, Miss. Elisabeth Walton | Miss |
1 | Allison, Master. Hudson Trevor | Master |
0 | Allison, Miss. Helen Loraine | Miss |
0 | Allison, Mr. Hudson Joshua Creighton | Mr |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | Mrs |
1 | Anderson, Mr. Harry | Mr |
1 | Andrews, Miss. Kornelia Theodosia | Miss |
0 | Andrews, Mr. Thomas Jr | Mr |
1 | Appleton, Mrs. Edward Dale (Charlotte... | Mrs |
0 | Artagaveytia, Mr. Ramon | Mr |
0 | Astor, Col. John Jacob | Col |
1 | Astor, Mrs. John Jacob (Madeleine Tal... | Mrs |
1 | Aubart, Mme. Leontine Pauline | Mme |
1 | Barber, Miss. Ellen "Nellie" | Miss |
1 | Barkworth, Mr. Algernon Henry Wilson | Mr |
0 | Baumann, Mr. John D | Mr |
0 | Baxter, Mr. Quigg Edmond | Mr |
1 | Baxter, Mrs. James (Helene DeLaudenie... | Mrs |
1 | Bazzani, Miss. Albina | Miss |
0 | Beattie, Mr. Thomson | Mr |
// Frequency of each extracted title
titledDF.valueCounts { title }
title | count |
---|---|
Mr | 757 |
Miss | 260 |
Mrs | 197 |
Master | 61 |
Dr | 8 |
Rev | 8 |
Col | 4 |
Major | 2 |
Mlle | 2 |
Ms | 2 |
Mme | 1 |
Capt | 1 |
Lady | 1 |
Sir | 1 |
Dona | 1 |
Jonkheer | 1 |
the Countess | 1 |
Don | 1 |
The new title column contains some rare titles and some titles with typos. Let's clean the data and merge the rare titles into one category.
// Titles that occur too rarely to keep as separate categories
val rareTitles = listOf(
"Dona", "Lady", "the Countess", "Capt", "Col", "Don",
"Dr", "Major", "Rev", "Sir", "Jonkheer",
)
// Normalize the title column: fold French/alternative spellings into their
// common equivalents and bucket all rare titles into one category.
val cleanedTitledDF = titledDF.update { title }.with {
    when (it) {
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> it
    }
}
cleanedTitledDF.valueCounts { title }
title | count |
---|---|
Mr | 757 |
Miss | 264 |
Mrs | 198 |
Master | 61 |
Rare Title | 29 |
Now it looks much better: we have only 5 different titles and can see how they correlate with survival.
// One-hot encode the titles and correlate each title indicator with survival
val correlations = cleanedTitledDF
.pivotMatches { title }
.corr { title }.with { survived }
correlations
title | survived |
---|---|
Miss | 0.306069 |
Master | 0.057318 |
Mr | -0.528775 |
Mrs | 0.352536 |
Rare Title | -0.000915 |
// Strip any pivot prefix from the title names and drop the survived self-row
correlations
.update { title }.with { it.substringAfter('_') }
.filter { title != "survived" }
title | survived |
---|---|
Miss | 0.306069 |
Master | 0.057318 |
Mr | -0.528775 |
Mrs | 0.352536 |
Rare Title | -0.000915 |
Women with the titles Miss and Mrs have similar chances to survive, but the same is not true for men. If your title is Mr, your odds on the Titanic are bad.
Rare title is really rare and doesn't play a big role.
// Survivor/non-survivor counts per title
val groupedCleanedTitledDF = cleanedTitledDF
.valueCounts { title and survived }
.sortBy { title and survived }
groupedCleanedTitledDF
title | survived | count |
---|---|---|
Master | 0 | 30 |
Master | 1 | 31 |
Miss | 0 | 85 |
Miss | 1 | 179 |
Mr | 0 | 634 |
Mr | 1 | 123 |
Mrs | 0 | 42 |
Mrs | 1 | 156 |
Rare Title | 0 | 18 |
Rare Title | 1 | 11 |
It's very interesting to dig deeper into families and home destinations, and we could start this analysis with surnames, which can be easily extracted from the name feature.
// Extract the surname: the text before the first comma (bounded by the
// first period), e.g. "Allison, Master. Hudson Trevor" -> "Allison".
val surnameDF = df1
    .select { survived and name }
    .add("surname") {
        name.substringBefore(".").substringBefore(",").trim()
    }
surnameDF.head()
survived | name | surname |
---|---|---|
1 | Allen, Miss. Elisabeth Walton | Allen |
1 | Allison, Master. Hudson Trevor | Allison |
0 | Allison, Miss. Helen Loraine | Allison |
0 | Allison, Mr. Hudson Joshua Creighton | Allison |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | Allison |
// Most frequent surnames
surnameDF.valueCounts { surname }
surname | count |
---|---|
Andersson | 11 |
Sage | 11 |
Asplund | 8 |
Goodwin | 8 |
Davies | 7 |
Brown | 6 |
Carter | 6 |
Fortune | 6 |
Smith | 6 |
Ford | 6 |
Johnson | 6 |
Panula | 6 |
Rice | 6 |
Skoog | 6 |
Ryerson | 5 |
Williams | 5 |
Kelly | 5 |
Lefebre | 5 |
Palsson | 5 |
Thomas | 5 |
// Number of distinct surnames aboard
surnameDF.surname.countDistinct()
875
// Correlate the first letter of the surname with survival:
// one-hot encode the letters, then compute per-letter correlations.
val firstSymbol by column<String>()
df1
.add(firstSymbol) {
name.split(".")[0].split(",")[0].trim().first().toString()
}
.pivotMatches(firstSymbol)
.corr { firstSymbol }.with { survived }
firstSymbol | survived |
---|---|
A | -0.017914 |
B | 0.050554 |
C | 0.009037 |
D | 0.051711 |
E | -0.034629 |
F | 0.000400 |
G | -0.044483 |
H | 0.042187 |
I | -0.008329 |
J | -0.026790 |
K | -0.014219 |
L | -0.021071 |
M | 0.019041 |
N | 0.028698 |
O | 0.000128 |
P | -0.058996 |
R | -0.021941 |
S | -0.020043 |
T | 0.052264 |
U | -0.021737 |