PS 2 Solutions
library(tidyverse)
#> ── Attaching core tidyverse packages ──── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
#> ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
#> ✔ purrr 1.0.2
#> ── Conflicts ────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Dataset_1 <- as_tibble(read.csv("C:/Users/alexp/OneDrive/Desktop/R Bootcamp/R_bootcamp/Dataset_1.csv"))
Dataset_2 <- as_tibble(read.csv("C:/Users/alexp/OneDrive/Desktop/R Bootcamp/R_bootcamp/Dataset_2.csv"))
Dataset_3 <- as_tibble(read.csv("C:/Users/alexp/OneDrive/Desktop/R Bootcamp/R_bootcamp/Dataset_3.csv"))
- In Dataset_1 the
area
variable is spread across multiple columns that represent values of thestate
variable. Transform/Tidy the dataset as follows: move the column names (IL-WI) to thestate
variable and the column values to thearea
variable. Then, remove all the missing value from the dataset. Name the new datasetdata1
.
data1 <- Dataset_1 %>%
pivot_longer(IL:WI, names_to="state", values_to="area", values_drop_na=T)
data1
#> # A tibble: 457 × 5
#> PID county poptotal state area
#> <int> <chr> <int> <chr> <dbl>
#> 1 561 ADAMS 66090 IL 0.052
#> 2 562 ALEXANDER 10626 IL 0.014
#> 3 563 BOND 14991 IL 0.022
#> 4 564 BOONE 30806 IL 0.017
#> 5 565 BROWN 5836 IL 0.018
#> 6 566 BUREAU 35688 IL 0.05
#> 7 567 CALHOUN 5322 IL 0.017
#> 8 568 CARROLL 16805 IL 0.027
#> 9 569 CASS 13437 IL 0.024
#> 10 570 CHAMPAIGN 173025 IL 0.058
#> # ℹ 447 more rows
- In Dataset_2 the
race
variable combines information about the total population for each racial group, given in following order: White, Black, American Indians, Asians, Other. Pull apart this column into multiple columns, by splitting wherever a separator character appers. Name these variablespopwhite
,popblack
,popamerindian
,popasian
, andpopother
, respectively. Name the new datasetdata2
.
data2 <- Dataset_2 %>%
separate(race, into=c("popwhite", "popblack", "popamerindian", "popasian", "popother"), sep="/", convert=T)
data2
#> # A tibble: 487 × 9
#> county state popwhite popblack popamerindian popasian
#> <chr> <chr> <int> <int> <int> <int>
#> 1 ADAMS IL 63917 1702 98 249
#> 2 ALEXANDER IL 7054 3496 19 48
#> 3 BOND IL 14477 429 35 16
#> 4 BOONE IL 29344 127 46 150
#> 5 BROWN IL 5264 547 14 5
#> 6 BUREAU IL 35157 50 65 195
#> 7 CALHOUN IL 5298 1 8 15
#> 8 CARROLL IL 16519 111 30 61
#> 9 CASS IL 13384 16 8 23
#> 10 CHAMPAIGN IL 146506 16559 331 8033
#> # ℹ 477 more rows
#> # ℹ 3 more variables: popother <int>, inmetro <int>,
#> # category <chr>
-
Merge
data1
anddata2
: they should be joined by thestate
variable and the output result should contain all the observation fromdata1
, regardless of whether they match or not. What are the dimensions of the output? How many missing values does it contain?Repeat the above procedure again by joining the datasets by the
state
andcounty
variables.
out3_1 <- data1 %>%
left_join(data2, by="state")
#> Warning in left_join(., data2, by = "state"): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out3_1
#> # A tibble: 46,825 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <chr> <int> <chr> <dbl> <chr> <int>
#> 1 561 ADAMS 66090 IL 0.052 ADAMS 63917
#> 2 561 ADAMS 66090 IL 0.052 ALEXANDER 7054
#> 3 561 ADAMS 66090 IL 0.052 BOND 14477
#> 4 561 ADAMS 66090 IL 0.052 BOONE 29344
#> 5 561 ADAMS 66090 IL 0.052 BROWN 5264
#> 6 561 ADAMS 66090 IL 0.052 BUREAU 35157
#> 7 561 ADAMS 66090 IL 0.052 CALHOUN 5298
#> 8 561 ADAMS 66090 IL 0.052 CARROLL 16519
#> 9 561 ADAMS 66090 IL 0.052 CASS 13384
#> 10 561 ADAMS 66090 IL 0.052 CHAMPAIGN 146506
#> # ℹ 46,815 more rows
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <chr>
dim_desc(out3_1)
#> [1] "[46,825 x 13]"
out3_1 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <int>
out3_2 <- data1 %>%
left_join(data2, by=c("state", "county"))
#> Warning in left_join(., data2, by = c("state", "county")): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out3_2
#> # A tibble: 527 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <chr> <int> <chr> <dbl> <int> <int>
#> 1 561 ADAMS 66090 IL 0.052 63917 1702
#> 2 561 ADAMS 66090 IL 0.052 63917 1702
#> 3 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 4 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 5 563 BOND 14991 IL 0.022 14477 429
#> 6 563 BOND 14991 IL 0.022 14477 429
#> 7 564 BOONE 30806 IL 0.017 29344 127
#> 8 564 BOONE 30806 IL 0.017 29344 127
#> 9 565 BROWN 5836 IL 0.018 5264 547
#> 10 565 BROWN 5836 IL 0.018 5264 547
#> # ℹ 517 more rows
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <chr>
dim_desc(out3_2)
#> [1] "[527 x 12]"
out3_2 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <int>
-
Merge
data1
anddata2
: they should be joined by thestate
variable and the output result should contain all the observation fromdata2
, regardless of whether they match or not. What are the dimensions of the output? How many missing values does it contain?Repeat the above procedure again by joining the datasets by the
state
andcounty
variables.
out4_1 <- data1 %>%
right_join(data2, by="state")
#> Warning in right_join(., data2, by = "state"): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out4_1
#> # A tibble: 46,825 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <chr> <int> <chr> <dbl> <chr> <int>
#> 1 561 ADAMS 66090 IL 0.052 ADAMS 63917
#> 2 561 ADAMS 66090 IL 0.052 ALEXANDER 7054
#> 3 561 ADAMS 66090 IL 0.052 BOND 14477
#> 4 561 ADAMS 66090 IL 0.052 BOONE 29344
#> 5 561 ADAMS 66090 IL 0.052 BROWN 5264
#> 6 561 ADAMS 66090 IL 0.052 BUREAU 35157
#> 7 561 ADAMS 66090 IL 0.052 CALHOUN 5298
#> 8 561 ADAMS 66090 IL 0.052 CARROLL 16519
#> 9 561 ADAMS 66090 IL 0.052 CASS 13384
#> 10 561 ADAMS 66090 IL 0.052 CHAMPAIGN 146506
#> # ℹ 46,815 more rows
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <chr>
dim_desc(out4_1)
#> [1] "[46,825 x 13]"
out4_1 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <int>
out4_2 <- data1 %>%
right_join(data2, by=c("state","county"))
#> Warning in right_join(., data2, by = c("state", "county")): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out4_2
#> # A tibble: 527 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <chr> <int> <chr> <dbl> <int> <int>
#> 1 561 ADAMS 66090 IL 0.052 63917 1702
#> 2 561 ADAMS 66090 IL 0.052 63917 1702
#> 3 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 4 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 5 563 BOND 14991 IL 0.022 14477 429
#> 6 563 BOND 14991 IL 0.022 14477 429
#> 7 564 BOONE 30806 IL 0.017 29344 127
#> 8 564 BOONE 30806 IL 0.017 29344 127
#> 9 565 BROWN 5836 IL 0.018 5264 547
#> 10 565 BROWN 5836 IL 0.018 5264 547
#> # ℹ 517 more rows
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <chr>
dim_desc(out4_2)
#> [1] "[527 x 12]"
out4_2 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <int>
-
Merge
data1
anddata2
: they should be joined by thestate
variable and the output result should contain all the observation from bothdata1
anddata2
, regardless of whether they match or not. What are the dimensions of the output? How many missing values does it contain?Repeat the above procedure again by joining the datasets by the
state
andcounty
variables.
out5_1 <- data1 %>%
full_join(data2, by="state")
#> Warning in full_join(., data2, by = "state"): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out5_1
#> # A tibble: 46,825 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <chr> <int> <chr> <dbl> <chr> <int>
#> 1 561 ADAMS 66090 IL 0.052 ADAMS 63917
#> 2 561 ADAMS 66090 IL 0.052 ALEXANDER 7054
#> 3 561 ADAMS 66090 IL 0.052 BOND 14477
#> 4 561 ADAMS 66090 IL 0.052 BOONE 29344
#> 5 561 ADAMS 66090 IL 0.052 BROWN 5264
#> 6 561 ADAMS 66090 IL 0.052 BUREAU 35157
#> 7 561 ADAMS 66090 IL 0.052 CALHOUN 5298
#> 8 561 ADAMS 66090 IL 0.052 CARROLL 16519
#> 9 561 ADAMS 66090 IL 0.052 CASS 13384
#> 10 561 ADAMS 66090 IL 0.052 CHAMPAIGN 146506
#> # ℹ 46,815 more rows
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <chr>
dim_desc(out5_1)
#> [1] "[46,825 x 13]"
out5_1 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 13
#> PID county.x poptotal state area county.y popwhite
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 6 more variables: popblack <int>, popamerindian <int>,
#> # popasian <int>, popother <int>, inmetro <int>,
#> # category <int>
out5_2 <- data1 %>%
full_join(data2, by=c("state","county"))
#> Warning in full_join(., data2, by = c("state", "county")): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
out5_2
#> # A tibble: 527 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <chr> <int> <chr> <dbl> <int> <int>
#> 1 561 ADAMS 66090 IL 0.052 63917 1702
#> 2 561 ADAMS 66090 IL 0.052 63917 1702
#> 3 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 4 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 5 563 BOND 14991 IL 0.022 14477 429
#> 6 563 BOND 14991 IL 0.022 14477 429
#> 7 564 BOONE 30806 IL 0.017 29344 127
#> 8 564 BOONE 30806 IL 0.017 29344 127
#> 9 565 BROWN 5836 IL 0.018 5264 547
#> 10 565 BROWN 5836 IL 0.018 5264 547
#> # ℹ 517 more rows
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <chr>
dim_desc(out5_2)
#> [1] "[527 x 12]"
out5_2 %>%
summarize(across(everything(), ~sum(is.na(.))))
#> # A tibble: 1 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 0 0 0 0 0 0
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <int>
- Join
data1
anddata2
by thestate
andcounty
variables such that the unmatched rows in either input dataset are not included in the result. Name the output datasetdata
.
data <- data1 %>%
inner_join(data2, by=c("state","county"))
#> Warning in inner_join(., data2, by = c("state", "county")): Detected an unexpected many-to-many relationship between
#> `x` and `y`.
#> ℹ Row 1 of `x` matches multiple rows in `y`.
#> ℹ Row 1 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set
#> `relationship = "many-to-many"` to silence this warning.
data
#> # A tibble: 527 × 12
#> PID county poptotal state area popwhite popblack
#> <int> <chr> <int> <chr> <dbl> <int> <int>
#> 1 561 ADAMS 66090 IL 0.052 63917 1702
#> 2 561 ADAMS 66090 IL 0.052 63917 1702
#> 3 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 4 562 ALEXANDER 10626 IL 0.014 7054 3496
#> 5 563 BOND 14991 IL 0.022 14477 429
#> 6 563 BOND 14991 IL 0.022 14477 429
#> 7 564 BOONE 30806 IL 0.017 29344 127
#> 8 564 BOONE 30806 IL 0.017 29344 127
#> 9 565 BROWN 5836 IL 0.018 5264 547
#> 10 565 BROWN 5836 IL 0.018 5264 547
#> # ℹ 517 more rows
#> # ℹ 5 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, category <chr>
- Remove
PID
andcategory
fromdata
by selecting all variables except these two. Then, remove all duplicates from the dataset.
data <- data %>%
select(!c(PID, category)) %>%
distinct()
data
#> # A tibble: 437 × 10
#> county poptotal state area popwhite popblack
#> <chr> <int> <chr> <dbl> <int> <int>
#> 1 ADAMS 66090 IL 0.052 63917 1702
#> 2 ALEXANDER 10626 IL 0.014 7054 3496
#> 3 BOND 14991 IL 0.022 14477 429
#> 4 BOONE 30806 IL 0.017 29344 127
#> 5 BROWN 5836 IL 0.018 5264 547
#> 6 BUREAU 35688 IL 0.05 35157 50
#> 7 CALHOUN 5322 IL 0.017 5298 1
#> 8 CARROLL 16805 IL 0.027 16519 111
#> 9 CASS 13437 IL 0.024 13384 16
#> 10 CHAMPAIGN 173025 IL 0.058 146506 16559
#> # ℹ 427 more rows
#> # ℹ 4 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>
-
Add a new variable to the dataset (name it
popdensity
), which represents a population density in each county (in order to compute a population density you need to divide the total population by the corresponding area).Then, for each racial group, add a new variable to the dataset that will represent the percentage of the total population that belongs to this group (name them
percwhite
,percblack
,percamerindan
,percasian
, andpercother
, respectively).
data <- data %>%
mutate(popdensity=poptotal/area,
percwhite=100*popwhite/poptotal,
percblack=100*popblack/poptotal,
percamerindian=100*popamerindian/poptotal,
percasian=100*popasian/poptotal,
percother=100*popother/poptotal)
data[8:13]
#> # A tibble: 437 × 6
#> popasian popother inmetro popdensity percwhite percblack
#> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 249 124 0 1270962. 96.7 2.58
#> 2 48 9 0 759000 66.4 32.9
#> 3 16 34 0 681409. 96.6 2.86
#> 4 150 1139 1 1812118. 95.3 0.412
#> 5 5 6 0 324222. 90.2 9.37
#> 6 195 221 0 713760 98.5 0.140
#> 7 15 0 0 313059. 99.5 0.0188
#> 8 61 84 0 622407. 98.3 0.661
#> 9 23 6 0 559875 99.6 0.119
#> 10 8033 1596 1 2983190. 84.7 9.57
#> # ℹ 427 more rows
- Join
data
andDataset_3
by thestate
andcounty
variables such that the unmatched rows in either input dataset are not included in the result.
data3 <- Dataset_3
data3
#> # A tibble: 437 × 10
#> county state popadults perchsd percollege percprof
#> <chr> <chr> <int> <dbl> <dbl> <dbl>
#> 1 ADAMS IL 43298 75.1 19.6 4.36
#> 2 ALEXANDER IL 6724 59.7 11.2 2.87
#> 3 BOND IL 9669 69.3 17.0 4.49
#> 4 BOONE IL 19272 75.5 17.3 4.20
#> 5 BROWN IL 3979 68.9 14.5 3.37
#> 6 BUREAU IL 23444 76.6 18.9 3.28
#> 7 CALHOUN IL 3583 62.8 11.9 3.21
#> 8 CARROLL IL 11323 76.0 16.2 3.06
#> 9 CASS IL 8825 72.3 14.1 3.21
#> 10 CHAMPAIGN IL 95971 87.5 41.3 17.8
#> # ℹ 427 more rows
#> # ℹ 4 more variables: percbelowpoverty <dbl>,
#> # percchildbelowpovert <dbl>, percadultpoverty <dbl>,
#> # percelderlypoverty <dbl>
data <- data %>%
inner_join(data3, by=c("state","county"))
data
#> # A tibble: 437 × 24
#> county poptotal state area popwhite popblack
#> <chr> <int> <chr> <dbl> <int> <int>
#> 1 ADAMS 66090 IL 0.052 63917 1702
#> 2 ALEXANDER 10626 IL 0.014 7054 3496
#> 3 BOND 14991 IL 0.022 14477 429
#> 4 BOONE 30806 IL 0.017 29344 127
#> 5 BROWN 5836 IL 0.018 5264 547
#> 6 BUREAU 35688 IL 0.05 35157 50
#> 7 CALHOUN 5322 IL 0.017 5298 1
#> 8 CARROLL 16805 IL 0.027 16519 111
#> 9 CASS 13437 IL 0.024 13384 16
#> 10 CHAMPAIGN 173025 IL 0.058 146506 16559
#> # ℹ 427 more rows
#> # ℹ 18 more variables: popamerindian <int>, popasian <int>,
#> # popother <int>, inmetro <int>, popdensity <dbl>,
#> # percwhite <dbl>, percblack <dbl>, percamerindian <dbl>,
#> # percasian <dbl>, percother <dbl>, popadults <int>,
#> # perchsd <dbl>, percollege <dbl>, percprof <dbl>,
#> # percbelowpoverty <dbl>, percchildbelowpovert <dbl>, …
- Add new variables to the dataset that correspond to the
percbelowpoverty
andpercadultpoverty
variables and represent the actual number of people that belong to these groups. Name these variablespopcbelowpoverty
, andpopcadultpoverty
, respectively.
data <- data %>%
mutate(popcbelowpoverty=poptotal*percbelowpoverty,
popcadultpoverty=popadults*percadultpoverty)
data[24:26]
#> # A tibble: 437 × 3
#> percelderlypoverty popcbelowpoverty popcadultpoverty
#> <dbl> <dbl> <dbl>
#> 1 12.4 869179. 476701.
#> 2 25.2 342628. 184141.
#> 3 12.7 180924. 104929.
#> 4 6.22 222081. 106690.
#> 5 19.2 78904. 44339.
#> 6 11.0 371142. 191755.
#> 7 21.1 80627. 46337.
#> 8 9.53 196799. 113536.
#> 9 13.7 186440. 105144.
#> 10 8.11 2694421. 1685513.
#> # ℹ 427 more rows
- Move the
inmetro
variable to the end of the dataset; change the position of thepoptotal
variable so that it comes after thearea
variables; change the position of thepopdensity
variable so that it comes after thepoptotal
variables.
data <- data %>%
relocate(inmetro, .after=last_col()) %>%
relocate(poptotal, .after=area) %>%
relocate(popdensity, .after=poptotal)
data[3:8]
#> # A tibble: 437 × 6
#> area poptotal popdensity popwhite popblack popamerindian
#> <dbl> <int> <dbl> <int> <int> <int>
#> 1 0.052 66090 1270962. 63917 1702 98
#> 2 0.014 10626 759000 7054 3496 19
#> 3 0.022 14991 681409. 14477 429 35
#> 4 0.017 30806 1812118. 29344 127 46
#> 5 0.018 5836 324222. 5264 547 14
#> 6 0.05 35688 713760 35157 50 65
#> 7 0.017 5322 313059. 5298 1 8
#> 8 0.027 16805 622407. 16519 111 30
#> 9 0.024 13437 559875 13384 16 8
#> 10 0.058 173025 2983190. 146506 16559 331
#> # ℹ 427 more rows
data[23:26]
#> # A tibble: 437 × 4
#> percelderlypoverty popcbelowpoverty popcadultpoverty
#> <dbl> <dbl> <dbl>
#> 1 12.4 869179. 476701.
#> 2 25.2 342628. 184141.
#> 3 12.7 180924. 104929.
#> 4 6.22 222081. 106690.
#> 5 19.2 78904. 44339.
#> 6 11.0 371142. 191755.
#> 7 21.1 80627. 46337.
#> 8 9.53 196799. 113536.
#> 9 13.7 186440. 105144.
#> 10 8.11 2694421. 1685513.
#> # ℹ 427 more rows
#> # ℹ 1 more variable: inmetro <int>
- What is the biggest/smallest state by the total population? What is the biggest/smallest state by the total area? What is the average population density in each state? In each state, compare the population densities of counties that are in metro area to counties that are not. Hint: you might want to use the grouping functions.
The state with minimum total population is “WI”, the state with maximum total population is “IL”.
data %>%
group_by(state) %>%
summarise(tot_pop = sum(poptotal)) %>%
arrange(tot_pop)
#> # A tibble: 5 × 2
#> state tot_pop
#> <chr> <int>
#> 1 WI 4891769
#> 2 IN 5544159
#> 3 MI 9295297
#> 4 OH 10847115
#> 5 IL 11430602
The state with minimum total area is “IN”, the state with maximum total area is “MI”.
data %>%
group_by(state) %>%
summarise(tot_area = sum(area)) %>%
arrange(tot_area)
#> # A tibble: 5 × 2
#> state tot_area
#> <chr> <dbl>
#> 1 IN 2.13
#> 2 OH 2.42
#> 3 WI 3.29
#> 4 IL 3.30
#> 5 MI 3.36
data %>%
group_by(state) %>%
summarise(avg_popdensity = mean(popdensity))
#> # A tibble: 5 × 2
#> state avg_popdensity
#> <chr> <dbl>
#> 1 IL 2823731.
#> 2 IN 2573130.
#> 3 MI 3010809.
#> 4 OH 4639024.
#> 5 WI 2372694.
data %>%
group_by(state, inmetro) %>%
summarise(avg_popdensity = mean(popdensity))
#> `summarise()` has grouped output by 'state'. You can
#> override using the `.groups` argument.
#> # A tibble: 10 × 3
#> # Groups: state [5]
#> state inmetro avg_popdensity
#> <chr> <int> <dbl>
#> 1 IL 0 773783.
#> 2 IL 1 8241451.
#> 3 IN 0 1183804.
#> 4 IN 1 4638344.
#> 5 MI 0 743790.
#> 6 MI 1 8270292.
#> 7 OH 0 1506462.
#> 8 OH 1 8398098.
#> 9 WI 0 674527.
#> 10 WI 1 6787929.
- In all states combined, what is the average percentage of each racial group? In each state separately, what is the average percentage of each racial group? In each state separately, compare the average percentage of each racial group in metro areas to non-metro areas. Discuss the results you obtained.
data %>%
summarise(across(percwhite:percother, ~mean(.x)))
#> # A tibble: 1 × 5
#> percwhite percblack percamerindian percasian percother
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 95.6 2.68 0.799 0.487 0.479
data %>%
group_by(state) %>%
summarise(across(percwhite:percother, ~mean(.x)))
#> # A tibble: 5 × 6
#> state percwhite percblack percamerindian percasian
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 IL 95.0 3.65 0.174 0.564
#> 2 IN 97.2 1.89 0.222 0.383
#> 3 MI 94.4 3.07 1.37 0.507
#> 4 OH 95.4 3.51 0.184 0.433
#> 5 WI 95.8 0.822 2.52 0.556
#> # ℹ 1 more variable: percother <dbl>
Percentages of white are higher in non-metro area. Percentages of black are higher in metro area. Percentages of American indian are higher in non-metro area. Percentages of Asian are higher in metro area. Percentages of other race groups are higher in metro area.
data %>%
group_by(state, inmetro) %>%
summarise(across(percwhite:percother, ~mean(.x)))
#> `summarise()` has grouped output by 'state'. You can
#> override using the `.groups` argument.
#> # A tibble: 10 × 7
#> # Groups: state [5]
#> state inmetro percwhite percblack percamerindian
#> <chr> <int> <dbl> <dbl> <dbl>
#> 1 IL 0 96.5 2.70 0.169
#> 2 IL 1 90.9 6.17 0.186
#> 3 IN 0 98.4 0.881 0.233
#> 4 IN 1 95.4 3.39 0.206
#> 5 MI 0 96.5 1.06 1.73
#> 6 MI 1 89.6 7.71 0.508
#> 7 OH 0 97.4 1.61 0.188
#> 8 OH 1 93.0 5.80 0.179
#> 9 WI 0 95.9 0.248 3.27
#> 10 WI 1 95.4 2.31 0.564
#> # ℹ 2 more variables: percasian <dbl>, percother <dbl>
- For each, state calculate the average percentage of the total population who are below poverty. Now using the same measurement, compare the metro areas to non-metro areas in each state. What do you observe? Now filter out counties with less than 50000 population and repeat the previous step. What do you observe?
data %>%
group_by(state) %>%
summarise(mean(percbelowpoverty))
#> # A tibble: 5 × 2
#> state `mean(percbelowpoverty)`
#> <chr> <dbl>
#> 1 IL 13.1
#> 2 IN 10.3
#> 3 MI 14.2
#> 4 OH 13.0
#> 5 WI 11.9
Percentages below poverty are higher in non-metro area.
data %>%
group_by(state, inmetro) %>%
summarise(mean(percbelowpoverty))
#> `summarise()` has grouped output by 'state'. You can
#> override using the `.groups` argument.
#> # A tibble: 10 × 3
#> # Groups: state [5]
#> state inmetro `mean(percbelowpoverty)`
#> <chr> <int> <dbl>
#> 1 IL 0 14.4
#> 2 IL 1 9.56
#> 3 IN 0 10.8
#> 4 IN 1 9.59
#> 5 MI 0 15.5
#> 6 MI 1 11.3
#> 7 OH 0 14.3
#> 8 OH 1 11.5
#> 9 WI 0 13.0
#> 10 WI 1 9.01
The significance of the percentages of below-poverty in non-metro area higher than that of metro area becomes less.
data %>%
filter(poptotal >= 50000) %>%
group_by(state, inmetro) %>%
summarise(mean(percbelowpoverty))
#> `summarise()` has grouped output by 'state'. You can
#> override using the `.groups` argument.
#> # A tibble: 10 × 3
#> # Groups: state [5]
#> state inmetro `mean(percbelowpoverty)`
#> <chr> <int> <dbl>
#> 1 IL 0 15.7
#> 2 IL 1 10.6
#> 3 IN 0 10.6
#> 4 IN 1 10.6
#> 5 MI 0 13.0
#> 6 MI 1 11.3
#> 7 OH 0 13.8
#> 8 OH 1 11.8
#> 9 WI 0 8.70
#> 10 WI 1 8.82