%use dataframe
%useLatestDescriptors
%use kandy
// Load the Titanic dataset. The file is ';'-delimited and (per the note
// below) uses a comma as the decimal separator, so it is parsed with the
// French locale.
val df = DataFrame.readCsv(
fileOrUrl = "titanic.csv",
delimiter = ';',
parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),
)
// Preview the first rows.
df.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | null | 29.000000 | null | null | 24160 | 211.337500 | B5 | null | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1 | 2 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
We have a dataset that uses an alternative pattern for decimal numbers. This is why the French locale is used in this example.
But before data conversion, we should handle null values.
df.describe()
name | type | count | unique | nulls | top | freq | mean | std | min | p25 | median | p75 | max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
pclass | Int | 1309 | 3 | 0 | 3 | 709 | 2.294882 | 0.837836 | 1 | 2.000000 | 3.000000 | 3.000000 | 3 |
survived | Int | 1309 | 2 | 0 | 0 | 809 | 0.381971 | 0.486055 | 0 | 0.000000 | 0.000000 | 1.000000 | 1 |
name | String | 1309 | 1307 | 0 | Connolly, Miss. Kate | 2 | null | null | Abbing, Mr. Anthony | Devaney, Miss. Margaret Delia | Kink, Miss. Maria | Quick, Mrs. Frederick Charles (Jane R... | van Melkebeke, Mr. Philemon |
sex | String? | 1309 | 3 | 1 | male | 843 | null | null | female | female | male | male | male |
age | Double? | 1309 | 99 | 263 | 24.000000 | 47 | 29.881135 | 14.413500 | 0.166700 | 21.000000 | 28.000000 | 39.000000 | 80.000000 |
sibsp | Int? | 1309 | 8 | 1 | 0 | 890 | 0.499235 | 1.041965 | 0 | 0.000000 | 0.000000 | 1.000000 | 8 |
parch | Int? | 1309 | 9 | 1 | 0 | 1001 | 0.385321 | 0.865826 | 0 | 0.000000 | 0.000000 | 0.000000 | 9 |
ticket | String | 1309 | 929 | 0 | CA. 2343 | 11 | null | null | 110152 | 248738 | 347082 | A/5 3536 | WE/P 5735 |
fare | Double? | 1309 | 282 | 1 | 8.050000 | 60 | 33.295479 | 51.758668 | 0.000000 | 7.895800 | 14.454200 | 31.275000 | 512.329200 |
cabin | String? | 1309 | 187 | 1014 | C23 C25 C27 | 6 | null | null | A10 | B73 | C62 C64 | D48 | T |
embarked | String? | 1309 | 5 | 3 | S | 912 | null | null | AA | Q | S | S | S |
boat | String? | 1309 | 28 | 823 | 13 | 39 | null | null | 1 | 14 | 3 | 8 | D |
body | Int? | 1309 | 122 | 1188 | 135 | 1 | 160.809917 | 97.696922 | 1 | 71.333333 | 155.000000 | 256.666667 | 328 |
homedest | String? | 1309 | 370 | 564 | New York, NY | 64 | null | null | ?Havana, Cuba | England | Lucca, Italy / California | Provo, UT | Zurich, Switzerland |
df
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | null | 29.000000 | null | null | 24160 | 211.337500 | B5 | null | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1 | 2 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1 | 2 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 1 | Anderson, Mr. Harry | male | 48.000000 | 0 | 0 | 19952 | 26.550000 | E12 | S | 3 | null | New York, NY |
1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.000000 | 1 | 0 | 13502 | 77.958300 | D7 | S | 10 | null | Hudson, NY |
1 | 0 | Andrews, Mr. Thomas Jr | male | 39.000000 | 0 | 0 | 112050 | 0.000000 | A36 | S | null | null | Belfast, NI |
1 | 1 | Appleton, Mrs. Edward Dale (Charlotte... | female | 53.000000 | 2 | 0 | 11769 | 51.479200 | C101 | S | D | null | Bayside, Queens, NY |
1 | 0 | Artagaveytia, Mr. Ramon | male | 71.000000 | 0 | 0 | PC 17609 | 49.504200 | null | C | null | 22 | Montevideo, Uruguay |
1 | 0 | Astor, Col. John Jacob | male | 47.000000 | 1 | 0 | PC 17757 | 227.525000 | C62 C64 | C | null | 124 | New York, NY |
1 | 1 | Astor, Mrs. John Jacob (Madeleine Tal... | female | 18.000000 | 1 | 0 | PC 17757 | 227.525000 | C62 C64 | C | 4 | null | New York, NY |
1 | 1 | Aubart, Mme. Leontine Pauline | female | 24.000000 | 0 | 0 | PC 17477 | 69.300000 | B35 | C | 9 | null | Paris, France |
1 | 1 | Barber, Miss. Ellen "Nellie" | female | 26.000000 | 0 | 0 | 19877 | 78.850000 | null | S | 6 | null | null |
1 | 1 | Barkworth, Mr. Algernon Henry Wilson | male | 80.000000 | 0 | 0 | 27042 | 30.000000 | A23 | S | B | null | Hessle, Yorks |
1 | 0 | Baumann, Mr. John D | male | null | 0 | 0 | PC 17318 | 25.925000 | null | S | null | null | New York, NY |
1 | 0 | Baxter, Mr. Quigg Edmond | male | 24.000000 | 0 | 1 | PC 17558 | 247.520800 | B58 B60 | C | null | null | Montreal, PQ |
1 | 1 | Baxter, Mrs. James (Helene DeLaudenie... | female | 50.000000 | 0 | 1 | PC 17558 | 247.520800 | B58 B60 | C | 6 | null | Montreal, PQ |
1 | 1 | Bazzani, Miss. Albina | female | 32.000000 | 0 | 0 | 11813 | 76.291700 | D15 | C | 8 | null | null |
1 | 0 | Beattie, Mr. Thomson | male | 36.000000 | 0 | 0 | 13050 | 75.241700 | C6 | C | A | null | Winnipeg, MN |
Let's convert all columns of our dataset to non-nullable and impute null values based on mean values.
// Clean-up pass: impute missing values, then widen the numeric columns to
// non-nullable Double so they can be used in numeric analysis.
val df1 = df
// imputing
// numeric columns: replace nulls with the per-column mean
.fillNulls { sibsp and parch and age and fare }.perCol { mean() }
// the single missing sex value (see describe()): bucket as "other"
.fillNulls { sex }.with { "other" }
// missing embarkation port: "S" is the most frequent value (see describe())
.fillNulls { embarked }.with { "S" }
.convert { sibsp and parch and age and fare }.toDouble()
df1.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | other | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | S | 2 | null | St Louis, MO |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON |
df1.schema()
pclass: Int survived: Int name: String sex: String age: Double sibsp: Double parch: Double ticket: String fare: Double cabin: String? embarked: String boat: String? body: Int? homedest: String?
df1.corr()
column | pclass | survived | age | sibsp | parch | fare |
---|---|---|---|---|---|---|
pclass | 1.000000 | -0.312469 | -0.366370 | 0.060832 | 0.018322 | -0.558477 |
survived | -0.312469 | 1.000000 | -0.050199 | -0.027825 | 0.082660 | 0.244208 |
age | -0.366370 | -0.050199 | 1.000000 | -0.190747 | -0.130872 | 0.171521 |
sibsp | 0.060832 | -0.027825 | -0.190747 | 1.000000 | 0.373587 | 0.160224 |
parch | 0.018322 | 0.082660 | -0.130872 | 0.373587 | 1.000000 | 0.221522 |
fare | -0.558477 | 0.244208 | 0.171521 | 0.160224 | 0.221522 | 1.000000 |
// Correlation of every numeric column with the target, sorted ascending.
val correlations = df1
.corr { all() }.with { survived }
.sortBy { survived }
correlations
column | survived |
---|---|
pclass | -0.312469 |
age | -0.050199 |
sibsp | -0.027825 |
parch | 0.082660 |
fare | 0.244208 |
survived | 1.000000 |
Great, at this moment we have five numerical features available for numerical analysis: pclass, age, sibsp, parch, fare.
To confirm some of our observations and assumptions, we can quickly analyze our feature correlations by pivoting features against each other. We can only do so at this stage for features that do not have any empty values. It also makes sense to do so only for features that are categorical (Sex), ordinal (Pclass), or discrete (SibSp, Parch).
Pclass: We observe a significant correlation (>0.5) between Pclass=1 and Survived.
Sex: We confirm the observation during problem definition that Sex=female had a very high survival rate at 74%.
SibSp and Parch: These features have zero correlation for certain values. It may be best to derive a feature or a set of features from these individual features.
df1.groupBy { pclass }.mean { survived }.sortBy { pclass }
pclass | survived |
---|---|
1 | 0.619195 |
2 | 0.429603 |
3 | 0.255289 |
df1.groupBy { sex }.mean { survived }.sortBy { survived }
sex | survived |
---|---|
male | 0.190985 |
female | 0.726882 |
other | 1.000000 |
df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }
sibsp | survived |
---|---|
0.000000 | 0.346801 |
1.000000 | 0.510972 |
2.000000 | 0.452381 |
3.000000 | 0.300000 |
4.000000 | 0.136364 |
5.000000 | 0.000000 |
8.000000 | 0.000000 |
df1.groupBy { parch }.mean { survived }.sortBy { parch }
parch | survived |
---|---|
0.000000 | 0.335329 |
1.000000 | 0.588235 |
2.000000 | 0.504425 |
3.000000 | 0.625000 |
4.000000 | 0.166667 |
5.000000 | 0.166667 |
6.000000 | 0.000000 |
9.000000 | 0.000000 |
It's interesting to discover both age distributions: among the survived and perished passengers.
// Number of passengers per distinct age value, sorted by age.
val byAge = df1.valueCounts { age }.sortBy { age }
byAge
age | count |
---|---|
0.166700 | 1 |
0.333300 | 1 |
0.416700 | 1 |
0.666700 | 1 |
0.750000 | 3 |
0.833300 | 3 |
0.916700 | 2 |
1.000000 | 10 |
2.000000 | 12 |
3.000000 | 7 |
4.000000 | 10 |
5.000000 | 5 |
6.000000 | 6 |
7.000000 | 4 |
8.000000 | 6 |
9.000000 | 10 |
10.000000 | 4 |
11.000000 | 4 |
11.500000 | 1 |
12.000000 | 3 |
// JetBrains color palette
// Fixed palette used consistently across every plot in this notebook.
object JetBrainsColors {
val lightOrange = Color.hex("#ffb59e")
val orange = Color.hex("#ff6632")
val lightGrey = Color.hex("#a6a6a6")
val darkGrey = Color.hex("#4c4c4c")
}
// Scatter plot: passenger count per distinct age value.
byAge.plot {
points {
x(age)
y(count)
size = 5.0
color = JetBrainsColors.lightGrey
}
layout {
size = 850 to 500
}
}
// Age distribution as a histogram with 5-year-wide bins, over non-null ages.
// NOTE(review): this local `age` frame shadows the `age` column accessor;
// inside `plot` the bare `age` resolves against the receiver frame — confirm.
val age = df.select { age }.dropNulls().sortBy { age }
age.plot {
histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
fillColor = JetBrainsColors.orange
}
layout {
size = 850 to 500
}
}
df1.groupBy { age }.pivotCounts { survived }.sortBy { age }
age | survived | |
---|---|---|
1 | 0 | |
0.166700 | 1 | 0 |
0.333300 | 0 | 1 |
0.416700 | 1 | 0 |
0.666700 | 1 | 0 |
0.750000 | 2 | 1 |
0.833300 | 3 | 0 |
0.916700 | 2 | 0 |
1.000000 | 7 | 3 |
2.000000 | 4 | 8 |
3.000000 | 5 | 2 |
4.000000 | 7 | 3 |
5.000000 | 4 | 1 |
6.000000 | 3 | 3 |
7.000000 | 2 | 2 |
8.000000 | 4 | 2 |
9.000000 | 4 | 6 |
10.000000 | 0 | 4 |
11.000000 | 1 | 3 |
11.500000 | 0 | 1 |
12.000000 | 3 | 0 |
// Two-column view for plotting: a human-readable survival label plus age,
// ordered by age ascending.
val survivedByAge = df1
    .select { survived and age }
    .convert { survived }.with {
        when (it) {
            1 -> "Survived"
            else -> "Died"
        }
    }
    .sortBy { age }
survivedByAge
survived | age |
---|---|
Survived | 0.166700 |
Died | 0.333300 |
Survived | 0.416700 |
Survived | 0.666700 |
Survived | 0.750000 |
Survived | 0.750000 |
Died | 0.750000 |
Survived | 0.833300 |
Survived | 0.833300 |
Survived | 0.833300 |
Survived | 0.916700 |
Survived | 0.916700 |
Survived | 1.000000 |
Survived | 1.000000 |
Survived | 1.000000 |
Survived | 1.000000 |
Died | 1.000000 |
Survived | 1.000000 |
Died | 1.000000 |
Survived | 1.000000 |
survivedByAge.groupBy { survived }
survived | group | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Survived | DataFrame [500 x 2]
... showing only top 5 of 500 rows | ||||||||||||
Died | DataFrame [809 x 2]
... showing only top 5 of 809 rows |
// Side-by-side (dodged) histograms comparing the age distribution of the
// "Survived" and "Died" groups, 5-year bins.
survivedByAge.groupBy { survived }.plot {
histogram(x = age, binsOption = BinsOption.byWidth(5.0)) {
// color the bars by the grouping key
fillColor(key.survived) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
alpha = 0.7
position = Position.dodge()
}
layout {
size = 850 to 500
}
}
// Density plot
// Overlaid age-density curves, one per survival group; the outline color
// distinguishes the groups while the fill stays a shared translucent grey.
survivedByAge.groupBy { survived }.plot {
densityPlot(x = age) {
fillColor = Color.GREY
alpha = 0.3
borderLine {
color(key.survived) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
}
}
layout {
size = 850 to 250
}
}
survivedByAge.groupBy { survived }
survived | group | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Survived | DataFrame [500 x 2]
... showing only top 5 of 500 rows | ||||||||||||
Died | DataFrame [809 x 2]
... showing only top 5 of 809 rows |
// A basic box plot
// Age distribution per survival label, with each box colored by its category.
survivedByAge.plot {
boxplot(x = survived, y = age) {
boxes {
fillColor(Stat.x) {
scale = categorical(
"Survived" to JetBrainsColors.orange,
"Died" to JetBrainsColors.darkGrey,
)
}
}
}
layout {
size = 500 to 400
}
}
Seems like we have the same age distribution among survived and perished passengers.
To prepare data for ML algorithms, we should replace all String values in categorical features with numbers.
There are a few ways of preprocessing categorical features, One Hot Encoding being one of them.
We will use the pivotMatches
operation to convert categorical columns into sets of nested Boolean
columns per every unique value.
// One-hot encoding: pivotMatches turns each unique value of pclass, sex and
// embarked into a nested Boolean column.
val pivoted = df1.pivotMatches { pclass and sex and embarked }
pivoted.head()
survived | name | age | sibsp | parch | ticket | fare | cabin | boat | body | homedest | pclass | sex | embarked | |||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 3 | other | male | female | S | AA | C | Q | |||||||||||
1 | Allen, Miss. Elisabeth Walton | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | 2 | null | St Louis, MO | true | false | false | true | false | false | true | false | false | false |
1 | Allison, Master. Hudson Trevor | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | 11 | null | Montreal, PQ / Chesterville, ON | true | false | false | false | true | false | false | true | false | false |
0 | Allison, Miss. Helen Loraine | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | null | Montreal, PQ / Chesterville, ON | true | false | false | false | false | true | true | false | false | false |
0 | Allison, Mr. Hudson Joshua Creighton | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | 135 | Montreal, PQ / Chesterville, ON | true | false | false | false | true | false | true | false | false | false |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | null | null | Montreal, PQ / Chesterville, ON | true | false | false | false | false | true | true | false | false | false |
// Feature selection: keep the target plus the numeric and one-hot features,
// converting every value column to Double for ML-style processing.
val df2 = pivoted
// feature extraction
.select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }
.convert { valueCols() }.toDouble()
df2.head()
survived | pclass | sibsp | parch | age | fare | sex | embarked | |||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 3 | other | male | female | S | AA | C | Q | |||||
1.000000 | true | false | false | 0.000000 | 0.000000 | 29.000000 | 211.337500 | true | false | false | true | false | false | false |
1.000000 | true | false | false | 1.000000 | 2.000000 | 0.916700 | 151.550000 | false | true | false | false | true | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 2.000000 | 151.550000 | false | false | true | true | false | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 30.000000 | 151.550000 | false | true | false | true | false | false | false |
0.000000 | true | false | false | 1.000000 | 2.000000 | 25.000000 | 151.550000 | false | false | true | true | false | false | false |
df2.corr { survived and sibsp and parch and age and fare }.withItself()
column | survived | sibsp | parch | age | fare |
---|---|---|---|---|---|
survived | 1.000000 | -0.027825 | 0.082660 | -0.050199 | 0.244208 |
sibsp | -0.027825 | 1.000000 | 0.373587 | -0.190747 | 0.160224 |
parch | 0.082660 | 0.373587 | 1.000000 | -0.130872 | 0.221522 |
age | -0.050199 | -0.190747 | -0.130872 | 1.000000 | 0.171521 |
fare | 0.244208 | 0.160224 | 0.221522 | 0.171521 | 1.000000 |
// Long-format correlation table: gather the matrix into one
// (column, row, value) triple per pair, ready for heatmap plotting.
val correlationTable = df2
.corr { survived and sibsp and parch and age and fare }.withItself()
.gather { allAfter("column") }.into("row", "value")
correlationTable
column | row | value |
---|---|---|
survived | survived | 1.000000 |
survived | sibsp | -0.027825 |
survived | parch | 0.082660 |
survived | age | -0.050199 |
survived | fare | 0.244208 |
sibsp | survived | -0.027825 |
sibsp | sibsp | 1.000000 |
sibsp | parch | 0.373587 |
sibsp | age | -0.190747 |
sibsp | fare | 0.160224 |
parch | survived | 0.082660 |
parch | sibsp | 0.373587 |
parch | parch | 1.000000 |
parch | age | -0.130872 |
parch | fare | 0.221522 |
age | survived | -0.050199 |
age | sibsp | -0.190747 |
age | parch | -0.130872 |
age | age | 1.000000 |
age | fare | 0.171521 |
// Shared diverging color scale over the correlation domain [-1, 1]:
// orange at -1, light grey in the middle, dark grey at +1.
fun scaleContinuousColorGradientN() =
continuousColorGradientN(
gradientColors = listOf(
JetBrainsColors.orange,
JetBrainsColors.lightGrey,
JetBrainsColors.darkGrey,
),
domainMin = -1.0,
domainMax = 1.0,
)
// Correlation heatmap: one tile per feature pair, colored by correlation value.
correlationTable.plot {
tiles {
x(row) { axis.name = "" }
y(column) { axis.name = "" }
fillColor(value) { scale = scaleContinuousColorGradientN() }
}
}
// Alternative correlation view: square markers sized and colored by the
// correlation value, with the size legend labels and major grid lines hidden.
correlationTable.plot {
points {
size(value) {
legend {
// suppress legend break labels for the size aesthetic
breaks(emptyList())
}
}
symbol = Symbol.SQUARE
x(row) {
axis.name = ""
}
y(column) {
axis.name = ""
}
color(value) { scale = scaleContinuousColorGradientN() }
}
layout {
style {
panel.grid {
majorLine {
blank = true
}
}
}
size = 500 to 350
}
}
We suggest combining the sibsp and parch features into a single new feature named familyNumber by simply summing them up.
// Derived feature: familyNumber = sibsp + parch.
val familyDF = df1
.add("familyNumber") { sibsp + parch }
familyDF.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | homedest | familyNumber |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | Allen, Miss. Elisabeth Walton | other | 29.000000 | 0.000000 | 0.000000 | 24160 | 211.337500 | B5 | S | 2 | null | St Louis, MO | 0.000000 |
1 | 1 | Allison, Master. Hudson Trevor | male | 0.916700 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | AA | 11 | null | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Miss. Helen Loraine | female | 2.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | 135 | Montreal, PQ / Chesterville, ON | 3.000000 |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Wald... | female | 25.000000 | 1.000000 | 2.000000 | 113781 | 151.550000 | C22 C26 | S | null | null | Montreal, PQ / Chesterville, ON | 3.000000 |
familyDF.corr { familyNumber }.with { survived }
column | survived |
---|---|
familyNumber | 0.026876 |
familyDF.corr { familyNumber }.with { age }
column | age |
---|---|
familyNumber | -0.196996 |
It looks like the new feature has no influence on the survived column, but it has a modest negative correlation with age.
Let's try to extract something from the names. A lot of strings in the name column contain special titles, like Don, Mr, Mrs and so on.
// Derive each passenger's title ("Mr", "Mrs", ...) from the name, which
// follows the pattern "Surname, Title. Given names".
val titledDF = df
.select { survived and name }
.add("title") {
// Take the text between the first ',' and the following '.'.
// substringAfter/substringBefore return the whole string when the
// delimiter is missing, so malformed names no longer throw the
// IndexOutOfBoundsException that split(".")[0].split(",")[1] would.
name.substringAfter(",").substringBefore(".").trim()
}
titledDF.head(100)
survived | name | title |
---|---|---|
1 | Allen, Miss. Elisabeth Walton | Miss |
1 | Allison, Master. Hudson Trevor | Master |
0 | Allison, Miss. Helen Loraine | Miss |
0 | Allison, Mr. Hudson Joshua Creighton | Mr |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | Mrs |
1 | Anderson, Mr. Harry | Mr |
1 | Andrews, Miss. Kornelia Theodosia | Miss |
0 | Andrews, Mr. Thomas Jr | Mr |
1 | Appleton, Mrs. Edward Dale (Charlotte... | Mrs |
0 | Artagaveytia, Mr. Ramon | Mr |
0 | Astor, Col. John Jacob | Col |
1 | Astor, Mrs. John Jacob (Madeleine Tal... | Mrs |
1 | Aubart, Mme. Leontine Pauline | Mme |
1 | Barber, Miss. Ellen "Nellie" | Miss |
1 | Barkworth, Mr. Algernon Henry Wilson | Mr |
0 | Baumann, Mr. John D | Mr |
0 | Baxter, Mr. Quigg Edmond | Mr |
1 | Baxter, Mrs. James (Helene DeLaudenie... | Mrs |
1 | Bazzani, Miss. Albina | Miss |
0 | Beattie, Mr. Thomson | Mr |
titledDF.valueCounts { title }
title | count |
---|---|
Mr | 757 |
Miss | 260 |
Mrs | 197 |
Master | 61 |
Dr | 8 |
Rev | 8 |
Col | 4 |
Major | 2 |
Mlle | 2 |
Ms | 2 |
Mme | 1 |
Capt | 1 |
Lady | 1 |
Sir | 1 |
Dona | 1 |
Jonkheer | 1 |
the Countess | 1 |
Don | 1 |
The new Title column contains some rare titles and some titles with typos. Let's clean the data and merge all rare titles into one category.
// Titles that occur only a handful of times in the data; they are merged
// into a single "Rare Title" category in the next cell.
val rareTitles = listOf(
"Dona", "Lady", "the Countess", "Capt", "Col", "Don",
"Dr", "Major", "Rev", "Sir", "Jonkheer",
)
// Normalize titles: fold alternative/French spellings into the standard
// forms and collapse all rare titles into a single "Rare Title" bucket.
val cleanedTitledDF = titledDF.update { title }.with {
    when (it) {
        // merged branches instead of repeated `it == ...` comparisons
        "Mlle", "Ms" -> "Miss"
        "Mme" -> "Mrs"
        in rareTitles -> "Rare Title"
        else -> it
    }
}
cleanedTitledDF.valueCounts { title }
title | count |
---|---|
Mr | 757 |
Miss | 264 |
Mrs | 198 |
Master | 61 |
Rare Title | 29 |
Now it looks awesome, and we have only five different titles! Let's see how it correlates with the survival rate.
// One-hot encode the titles and correlate each title indicator with survival.
val correlations = cleanedTitledDF
.pivotMatches { title }
.corr { title }.with { survived }
correlations
title | survived |
---|---|
Miss | 0.306069 |
Master | 0.057318 |
Mr | -0.528775 |
Mrs | 0.352536 |
Rare Title | -0.000915 |
// Tidy the correlation table for display.
// NOTE(review): the titles shown above contain no '_' and no "survived" row,
// so both operations appear to be no-ops on this data — verify they are
// needed for other inputs.
correlations
.update { title }.with { it.substringAfter('_') }
.filter { title != "survived" }
title | survived |
---|---|
Miss | 0.306069 |
Master | 0.057318 |
Mr | -0.528775 |
Mrs | 0.352536 |
Rare Title | -0.000915 |
The women with the titles Miss and Mrs had similarly high chances of survival, but this cannot be said for the men.
If you had the title Mr, you were in bad luck on Titanic.
Rare title is really rare and doesn't play a big role.
// Survivor / non-survivor counts per title.
val groupedCleanedTitledDF = cleanedTitledDF
.valueCounts { title and survived }
.sortBy { title and survived }
groupedCleanedTitledDF
title | survived | count |
---|---|---|
Master | 0 | 30 |
Master | 1 | 31 |
Miss | 0 | 85 |
Miss | 1 | 179 |
Mr | 0 | 634 |
Mr | 1 | 123 |
Mrs | 0 | 42 |
Mrs | 1 | 156 |
Rare Title | 0 | 18 |
Rare Title | 1 | 11 |
It's tempting to dig deeper into families and home destinations, and we could start by analyzing the surnames, which can be easily extracted from the name feature.
// Extract the surname: everything before the first comma in
// "Surname, Title. Given names".
val surnameDF = df1
.select { survived and name }
.add("surname") {
// substringBefore(",") keeps the whole surname even when it contains
// a period (e.g. "St. Clair"), which the previous
// split(".")[0].split(",")[0] step would have truncated to "St".
name.substringBefore(",").trim()
}
surnameDF.head()
survived | name | surname |
---|---|---|
1 | Allen, Miss. Elisabeth Walton | Allen |
1 | Allison, Master. Hudson Trevor | Allison |
0 | Allison, Miss. Helen Loraine | Allison |
0 | Allison, Mr. Hudson Joshua Creighton | Allison |
0 | Allison, Mrs. Hudson J C (Bessie Wald... | Allison |
surnameDF.valueCounts { surname }
surname | count |
---|---|
Andersson | 11 |
Sage | 11 |
Asplund | 8 |
Goodwin | 8 |
Davies | 7 |
Brown | 6 |
Carter | 6 |
Fortune | 6 |
Smith | 6 |
Ford | 6 |
Johnson | 6 |
Panula | 6 |
Rice | 6 |
Skoog | 6 |
Ryerson | 5 |
Williams | 5 |
Kelly | 5 |
Lefebre | 5 |
Palsson | 5 |
Thomas | 5 |
surnameDF.surname.countDistinct()
875
// Sanity check: does the first letter of the surname correlate with survival?
// (The resulting correlations below are all near zero, as expected.)
val firstSymbol by column<String>()
df1
.add(firstSymbol) {
// first character of the surname (text before the first ',')
name.split(".")[0].split(",")[0].trim().first().toString()
}
.pivotMatches(firstSymbol)
.corr { firstSymbol }.with { survived }
firstSymbol | survived |
---|---|
A | -0.017914 |
B | 0.050554 |
C | 0.009037 |
D | 0.051711 |
E | -0.034629 |
F | 0.000400 |
G | -0.044483 |
H | 0.042187 |
I | -0.008329 |
J | -0.026790 |
K | -0.014219 |
L | -0.021071 |
M | 0.019041 |
N | 0.028698 |
O | 0.000128 |
P | -0.058996 |
R | -0.021941 |
S | -0.020043 |
T | 0.052264 |
U | -0.021737 |