# Show the working directory so a failure to find the relative path below is
# easy to diagnose.
getwd()

# Load the county-level cancer data set.
# stringsAsFactors = TRUE is spelled out because the rest of the analysis
# works with the text columns (binnedInc, Geography) as factors, and since
# R 4.0.0 read.csv() no longer converts strings to factors by default.
Cancer <- read.csv("cancer.csv", stringsAsFactors = TRUE)

# List the objects now in the workspace, then preview the first rows.
objects()
head(Cancer)
X | avgAnnCount | medIncome | popEst2015 | povertyPercent | binnedInc | MedianAge | MedianAgeMale | MedianAgeFemale | Geography | ⋯ | PctPrivateCoverage | PctEmpPrivCoverage | PctPublicCoverage | PctWhite | PctBlack | PctAsian | PctOtherRace | PctMarriedHouseholds | BirthRate | deathRate |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1397 | 61898 | 260131 | 11.2 | (61494.5, 125635] | 39.3 | 36.9 | 41.7 | Kitsap County, Washington | ⋯ | 75.1 | 41.6 | 32.9 | 81.78053 | 2.5947283 | 4.8218571 | 1.8434785 | 52.85608 | 6.118831 | 164.9 |
2 | 173 | 48127 | 43269 | 18.6 | (48021.6, 51046.4] | 33.0 | 32.2 | 33.7 | Kittitas County, Washington | ⋯ | 70.2 | 43.6 | 31.1 | 89.22851 | 0.9691025 | 2.2462326 | 3.7413515 | 45.37250 | 4.333096 | 161.3 |
3 | 102 | 49348 | 21026 | 14.6 | (48021.6, 51046.4] | 45.0 | 44.0 | 45.8 | Klickitat County, Washington | ⋯ | 63.7 | 34.9 | 42.1 | 90.92219 | 0.7396734 | 0.4658982 | 2.7473583 | 54.44487 | 3.729488 | 174.7 |
4 | 427 | 44243 | 75882 | 17.1 | (42724.4, 45201] | 42.8 | 42.2 | 43.4 | Lewis County, Washington | ⋯ | 58.4 | 35.0 | 45.3 | 91.74469 | 0.7826260 | 1.1613587 | 1.3626432 | 51.02151 | 4.603841 | 194.8 |
5 | 57 | 49955 | 10321 | 12.5 | (48021.6, 51046.4] | 48.3 | 47.8 | 48.9 | Lincoln County, Washington | ⋯ | 61.6 | 35.1 | 44.0 | 94.10402 | 0.2701920 | 0.6658304 | 0.4921355 | 54.02746 | 6.796657 | 144.4 |
6 | 428 | 52313 | 61023 | 15.6 | (51046.4, 54545.6] | 45.4 | 43.5 | 48.0 | Mason County, Washington | ⋯ | 60.0 | 32.6 | 43.2 | 84.88263 | 1.6532052 | 1.5380566 | 3.3146354 | 51.22036 | 4.964476 | 176.0 |
# Spot-check a single value of the binned income column (third row).
Cancer[["binnedInc"]][3]

# Size of the raw data: row count first, then column count.
nrow(x = Cancer)
ncol(x = Cancer)

# Column names, followed by per-column summary statistics.
names(Cancer)
summary(object = Cancer)
X avgAnnCount medIncome popEst2015 Min. : 1.0 Min. : 6.0 Min. : 22640 Min. : 827 1st Qu.: 762.5 1st Qu.: 76.0 1st Qu.: 38882 1st Qu.: 11684 Median :1524.0 Median : 171.0 Median : 45207 Median : 26643 Mean :1524.0 Mean : 606.3 Mean : 47063 Mean : 102637 3rd Qu.:2285.5 3rd Qu.: 518.0 3rd Qu.: 52492 3rd Qu.: 68671 Max. :3047.0 Max. :38150.0 Max. :125635 Max. :10170292 povertyPercent binnedInc MedianAge MedianAgeMale Min. : 3.20 (45201, 48021.6] : 306 Min. : 22.30 Min. :22.40 1st Qu.:12.15 (54545.6, 61494.5]: 306 1st Qu.: 37.70 1st Qu.:36.35 Median :15.90 [22640, 34218.1] : 306 Median : 41.00 Median :39.60 Mean :16.88 (42724.4, 45201] : 305 Mean : 45.27 Mean :39.57 3rd Qu.:20.40 (48021.6, 51046.4]: 305 3rd Qu.: 44.00 3rd Qu.:42.50 Max. :47.40 (51046.4, 54545.6]: 305 Max. :624.00 Max. :64.70 (Other) :1214 MedianAgeFemale Geography AvgHouseholdSize Min. :22.30 Abbeville County, South Carolina: 1 Min. :0.0221 1st Qu.:39.10 Acadia Parish, Louisiana : 1 1st Qu.:2.3700 Median :42.40 Accomack County, Virginia : 1 Median :2.5000 Mean :42.15 Ada County, Idaho : 1 Mean :2.4797 3rd Qu.:45.30 Adair County, Iowa : 1 3rd Qu.:2.6300 Max. :65.70 Adair County, Kentucky : 1 Max. :3.9700 (Other) :3041 PercentMarried PctNoHS18_24 PctHS18_24 PctSomeCol18_24 Min. :23.10 Min. : 0.00 Min. : 0.0 Min. : 7.10 1st Qu.:47.75 1st Qu.:12.80 1st Qu.:29.2 1st Qu.:34.00 Median :52.40 Median :17.10 Median :34.7 Median :40.40 Mean :51.77 Mean :18.22 Mean :35.0 Mean :40.98 3rd Qu.:56.40 3rd Qu.:22.70 3rd Qu.:40.7 3rd Qu.:46.40 Max. :72.50 Max. :64.10 Max. :72.5 Max. :79.00 NA's :2285 PctBachDeg18_24 PctHS25_Over PctBachDeg25_Over PctEmployed16_Over Min. : 0.000 Min. : 7.50 Min. : 2.50 Min. :17.60 1st Qu.: 3.100 1st Qu.:30.40 1st Qu.: 9.40 1st Qu.:48.60 Median : 5.400 Median :35.30 Median :12.30 Median :54.50 Mean : 6.158 Mean :34.80 Mean :13.28 Mean :54.15 3rd Qu.: 8.200 3rd Qu.:39.65 3rd Qu.:16.10 3rd Qu.:60.30 Max. :51.800 Max. :54.80 Max. :42.20 Max. 
:80.10 NA's :152 PctUnemployed16_Over PctPrivateCoverage PctEmpPrivCoverage PctPublicCoverage Min. : 0.400 Min. :22.30 Min. :13.5 Min. :11.20 1st Qu.: 5.500 1st Qu.:57.20 1st Qu.:34.5 1st Qu.:30.90 Median : 7.600 Median :65.10 Median :41.1 Median :36.30 Mean : 7.852 Mean :64.35 Mean :41.2 Mean :36.25 3rd Qu.: 9.700 3rd Qu.:72.10 3rd Qu.:47.7 3rd Qu.:41.55 Max. :29.400 Max. :92.30 Max. :70.7 Max. :65.10 PctWhite PctBlack PctAsian PctOtherRace Min. : 10.20 Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 1st Qu.: 77.30 1st Qu.: 0.6207 1st Qu.: 0.2542 1st Qu.: 0.2952 Median : 90.06 Median : 2.2476 Median : 0.5498 Median : 0.8262 Mean : 83.65 Mean : 9.1080 Mean : 1.2540 Mean : 1.9835 3rd Qu.: 95.45 3rd Qu.:10.5097 3rd Qu.: 1.2210 3rd Qu.: 2.1780 Max. :100.00 Max. :85.9478 Max. :42.6194 Max. :41.9303 PctMarriedHouseholds BirthRate deathRate Min. :22.99 Min. : 0.000 Min. : 59.7 1st Qu.:47.76 1st Qu.: 4.521 1st Qu.:161.2 Median :51.67 Median : 5.381 Median :178.1 Mean :51.24 Mean : 5.640 Mean :178.7 3rd Qu.:55.40 3rd Qu.: 6.494 3rd Qu.:195.2 Max. :78.08 Max. :21.326 Max. :362.8
# Sanity checks on the raw data ----

# A median age above 100 years is implausible; collect and count those rows.
age_error <- subset(Cancer, MedianAge > 100)
nrow(age_error)
# age_error

# Does a mean household size below 1 make sense? Collect and count those rows.
# The check runs against Cancer, the only data frame defined at this point.
household_error <- subset(Cancer, AvgHouseholdSize < 1)
nrow(household_error)

# Alternative clean-up, kept commented out for reference.
# NOTE(review): as written the `|` condition keeps every row whose
# AvgHouseholdSize is below 1, so it needs revisiting before being enabled.
# Cancer2 = subset(Cancer, MedianAge < 100 | AvgHouseholdSize < 1)
# nrow(Cancer2)

# Number of distinct Geography values.
length(unique(Cancer[["Geography"]]))
library(dplyr)
library(tidyr)
# Split Geography ("<county>, <state>") into separate County and State
# columns; remove = FALSE keeps the original Geography column as well.
# NOTE(review): splitting on "," leaves a leading space in State
# (e.g. " Washington"); apply trimws() before matching State against clean
# labels.
Cancer2 <- Cancer %>%
  separate(Geography, c("County", "State"), sep = ",", remove = FALSE)
colnames(Cancer2)

# Does a mean household size below 1 make sense? Count the affected rows in
# Cancer2 (the data frame being cleaned) before they are blanked out below.
household_error <- subset(Cancer2, AvgHouseholdSize < 1)
nrow(household_error)
# age_error

# Treat the implausible values as missing instead of dropping whole rows, so
# the other columns of those rows remain available.
Cancer2$MedianAge[Cancer2$MedianAge > 100] <- NA
Cancer2$AvgHouseholdSize[Cancer2$AvgHouseholdSize < 1] <- NA
summary(Cancer2)
X avgAnnCount medIncome popEst2015 Min. : 1.0 Min. : 6.0 Min. : 22640 Min. : 827 1st Qu.: 762.5 1st Qu.: 76.0 1st Qu.: 38882 1st Qu.: 11684 Median :1524.0 Median : 171.0 Median : 45207 Median : 26643 Mean :1524.0 Mean : 606.3 Mean : 47063 Mean : 102637 3rd Qu.:2285.5 3rd Qu.: 518.0 3rd Qu.: 52492 3rd Qu.: 68671 Max. :3047.0 Max. :38150.0 Max. :125635 Max. :10170292 povertyPercent binnedInc MedianAge MedianAgeMale Min. : 3.20 (45201, 48021.6] : 306 Min. :22.30 Min. :22.40 1st Qu.:12.15 (54545.6, 61494.5]: 306 1st Qu.:37.70 1st Qu.:36.35 Median :15.90 [22640, 34218.1] : 306 Median :40.90 Median :39.60 Mean :16.88 (42724.4, 45201] : 305 Mean :40.82 Mean :39.57 3rd Qu.:20.40 (48021.6, 51046.4]: 305 3rd Qu.:43.80 3rd Qu.:42.50 Max. :47.40 (51046.4, 54545.6]: 305 Max. :65.30 Max. :64.70 (Other) :1214 NA's :30 MedianAgeFemale Geography County Min. :22.30 Abbeville County, South Carolina: 1 Length:3047 1st Qu.:39.10 Acadia Parish, Louisiana : 1 Class :character Median :42.40 Accomack County, Virginia : 1 Mode :character Mean :42.15 Ada County, Idaho : 1 3rd Qu.:45.30 Adair County, Iowa : 1 Max. :65.70 Adair County, Kentucky : 1 (Other) :3041 State AvgHouseholdSize PercentMarried PctNoHS18_24 Length:3047 Min. :1.86 Min. :23.10 Min. : 0.00 Class :character 1st Qu.:2.37 1st Qu.:47.75 1st Qu.:12.80 Mode :character Median :2.50 Median :52.40 Median :17.10 Mean :2.53 Mean :51.77 Mean :18.22 3rd Qu.:2.64 3rd Qu.:56.40 3rd Qu.:22.70 Max. :3.97 Max. :72.50 Max. :64.10 NA's :61 PctHS18_24 PctSomeCol18_24 PctBachDeg18_24 PctHS25_Over Min. : 0.0 Min. : 7.10 Min. : 0.000 Min. : 7.50 1st Qu.:29.2 1st Qu.:34.00 1st Qu.: 3.100 1st Qu.:30.40 Median :34.7 Median :40.40 Median : 5.400 Median :35.30 Mean :35.0 Mean :40.98 Mean : 6.158 Mean :34.80 3rd Qu.:40.7 3rd Qu.:46.40 3rd Qu.: 8.200 3rd Qu.:39.65 Max. :72.5 Max. :79.00 Max. :51.800 Max. :54.80 NA's :2285 PctBachDeg25_Over PctEmployed16_Over PctUnemployed16_Over PctPrivateCoverage Min. : 2.50 Min. :17.60 Min. : 0.400 Min. 
:22.30 1st Qu.: 9.40 1st Qu.:48.60 1st Qu.: 5.500 1st Qu.:57.20 Median :12.30 Median :54.50 Median : 7.600 Median :65.10 Mean :13.28 Mean :54.15 Mean : 7.852 Mean :64.35 3rd Qu.:16.10 3rd Qu.:60.30 3rd Qu.: 9.700 3rd Qu.:72.10 Max. :42.20 Max. :80.10 Max. :29.400 Max. :92.30 NA's :152 PctEmpPrivCoverage PctPublicCoverage PctWhite PctBlack Min. :13.5 Min. :11.20 Min. : 10.20 Min. : 0.0000 1st Qu.:34.5 1st Qu.:30.90 1st Qu.: 77.30 1st Qu.: 0.6207 Median :41.1 Median :36.30 Median : 90.06 Median : 2.2476 Mean :41.2 Mean :36.25 Mean : 83.65 Mean : 9.1080 3rd Qu.:47.7 3rd Qu.:41.55 3rd Qu.: 95.45 3rd Qu.:10.5097 Max. :70.7 Max. :65.10 Max. :100.00 Max. :85.9478 PctAsian PctOtherRace PctMarriedHouseholds BirthRate Min. : 0.0000 Min. : 0.0000 Min. :22.99 Min. : 0.000 1st Qu.: 0.2542 1st Qu.: 0.2952 1st Qu.:47.76 1st Qu.: 4.521 Median : 0.5498 Median : 0.8262 Median :51.67 Median : 5.381 Mean : 1.2540 Mean : 1.9835 Mean :51.24 Mean : 5.640 3rd Qu.: 1.2210 3rd Qu.: 2.1780 3rd Qu.:55.40 3rd Qu.: 6.494 Max. :42.6194 Max. :41.9303 Max. :78.08 Max. :21.326 deathRate Min. : 59.7 1st Qu.:161.2 Median :178.1 Mean :178.7 3rd Qu.:195.2 Max. :362.8
# Compact structural overview of the cleaned data frame: one entry per column
# with its type and first few values.
str(object = Cancer2)
'data.frame': 3047 obs. of 32 variables: $ X : int 1 2 3 4 5 6 7 8 9 10 ... $ avgAnnCount : num 1397 173 102 427 57 ... $ medIncome : int 61898 48127 49348 44243 49955 52313 37782 40189 42579 60397 ... $ popEst2015 : int 260131 43269 21026 75882 10321 61023 41516 20848 13088 843954 ... $ povertyPercent : num 11.2 18.6 14.6 17.1 12.5 15.6 23.2 17.8 22.3 13.1 ... $ binnedInc : Factor w/ 10 levels "(34218.1, 37413.8]",..: 9 6 6 4 6 7 2 2 3 8 ... $ MedianAge : num 39.3 33 45 42.8 48.3 45.4 42.6 51.7 49.3 35.8 ... $ MedianAgeMale : num 36.9 32.2 44 42.2 47.8 43.5 42.2 50.8 48.4 34.7 ... $ MedianAgeFemale : num 41.7 33.7 45.8 43.4 48.9 48 43.5 52.5 49.8 37 ... $ Geography : Factor w/ 3047 levels "Abbeville County, South Carolina",..: 1459 1460 1464 1589 1618 1766 2051 2112 2143 2185 ... $ County : chr "Kitsap County" "Kittitas County" "Klickitat County" "Lewis County" ... $ State : chr " Washington" " Washington" " Washington" " Washington" ... $ AvgHouseholdSize : num 2.54 2.34 2.62 2.52 2.34 2.58 2.42 2.24 2.38 2.65 ... $ PercentMarried : num 52.5 44.5 54.2 52.7 57.8 50.4 54.1 52.7 55.9 50 ... $ PctNoHS18_24 : num 11.5 6.1 24 20.2 14.9 29.9 26.1 27.3 34.7 15.6 ... $ PctHS18_24 : num 39.5 22.4 36.6 41.2 43 35.1 41.4 33.9 39.4 36.3 ... $ PctSomeCol18_24 : num 42.1 64 NA 36.1 40 NA NA 36.5 NA NA ... $ PctBachDeg18_24 : num 6.9 7.5 9.5 2.5 2 4.5 5.8 2.2 1.4 7.1 ... $ PctHS25_Over : num 23.2 26 29 31.6 33.4 30.4 29.8 31.6 32.2 28.8 ... $ PctBachDeg25_Over : num 19.6 22.7 16 9.3 15 11.9 11.9 11.3 12 16.2 ... $ PctEmployed16_Over : num 51.9 55.9 45.9 48.3 48.2 44.1 51.8 40.9 39.5 56.6 ... $ PctUnemployed16_Over: num 8 7.8 7 12.1 4.8 12.9 8.9 8.9 10.3 9.2 ... $ PctPrivateCoverage : num 75.1 70.2 63.7 58.4 61.6 60 49.5 55.8 55.5 69.9 ... $ PctEmpPrivCoverage : num 41.6 43.6 34.9 35 35.1 32.6 28.3 25.9 29.9 44.4 ... $ PctPublicCoverage : num 32.9 31.1 42.1 45.3 44 43.2 46.4 50.9 48.1 31.4 ... $ PctWhite : num 81.8 89.2 90.9 91.7 94.1 ... 
$ PctBlack : num 2.595 0.969 0.74 0.783 0.27 ... $ PctAsian : num 4.822 2.246 0.466 1.161 0.666 ... $ PctOtherRace : num 1.843 3.741 2.747 1.363 0.492 ... $ PctMarriedHouseholds: num 52.9 45.4 54.4 51 54 ... $ BirthRate : num 6.12 4.33 3.73 4.6 6.8 ... $ deathRate : num 165 161 175 195 144 ...
library(car)
Loading required package: carData Attaching package: ‘car’ The following object is masked from ‘package:dplyr’: recode
# Pairwise scatterplots of the key variables, with histograms on the diagonal.
scatterplotMatrix(
  ~ avgAnnCount + medIncome + povertyPercent + binnedInc + MedianAge +
    PctPrivateCoverage + PctEmpPrivCoverage + PctPublicCoverage,
  data = Cancer2,
  diagonal = list(method = "histogram"),
  main = "Scatterplot Matrix for Key Cancer Data Variables"
)

# Correlation matrix for the numeric key variables, computed only from rows
# that are complete across all of the selected columns.
cor(
  Cancer2[c(
    "avgAnnCount",
    "medIncome",
    "povertyPercent",
    "MedianAge",
    "PctPrivateCoverage",
    "PctEmpPrivCoverage",
    "PctPublicCoverage"
  )],
  use = "complete.obs"
)
avgAnnCount | medIncome | povertyPercent | MedianAge | PctPrivateCoverage | PctEmpPrivCoverage | PctPublicCoverage | |
---|---|---|---|---|---|---|---|
avgAnnCount | 1.0000000 | 0.2692593 | -0.1354088 | -0.1224273 | 0.1318993 | 0.2021700 | -0.1726959 |
medIncome | 0.2692593 | 1.0000000 | -0.7882737 | -0.1174798 | 0.7236568 | 0.7464407 | -0.7543571 |
povertyPercent | -0.1354088 | -0.7882737 | 1.0000000 | -0.1937799 | -0.8223484 | -0.6820074 | 0.6511874 |
MedianAge | -0.1224273 | -0.1174798 | -0.1937799 | 1.0000000 | 0.0691826 | -0.2299898 | 0.4268144 |
PctPrivateCoverage | 0.1318993 | 0.7236568 | -0.8223484 | 0.0691826 | 1.0000000 | 0.8263302 | -0.7207873 |
PctEmpPrivCoverage | 0.2021700 | 0.7464407 | -0.6820074 | -0.2299898 | 0.8263302 | 1.0000000 | -0.7784563 |
PctPublicCoverage | -0.1726959 | -0.7543571 | 0.6511874 | 0.4268144 | -0.7207873 | -0.7784563 | 1.0000000 |