library(tidyverse)
<- modeldata::ames
housing_dat
housing_dat## # A tibble: 2,930 × 74
## MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape
## * <fct> <fct> <dbl> <int> <fct> <fct> <fct>
## 1 One_Story_1946_and_Ne… Resident… 141 31770 Pave No_A… Slightly…
## 2 One_Story_1946_and_Ne… Resident… 80 11622 Pave No_A… Regular
## 3 One_Story_1946_and_Ne… Resident… 81 14267 Pave No_A… Slightly…
## 4 One_Story_1946_and_Ne… Resident… 93 11160 Pave No_A… Regular
## 5 Two_Story_1946_and_Ne… Resident… 74 13830 Pave No_A… Slightly…
## 6 Two_Story_1946_and_Ne… Resident… 78 9978 Pave No_A… Slightly…
## 7 One_Story_PUD_1946_an… Resident… 41 4920 Pave No_A… Regular
## 8 One_Story_PUD_1946_an… Resident… 43 5005 Pave No_A… Slightly…
## 9 One_Story_PUD_1946_an… Resident… 39 5389 Pave No_A… Slightly…
## 10 Two_Story_1946_and_Ne… Resident… 60 7500 Pave No_A… Regular
## # ℹ 2,920 more rows
## # ℹ 67 more variables: Land_Contour <fct>, Utilities <fct>, Lot_Config <fct>,
## # Land_Slope <fct>, Neighborhood <fct>, Condition_1 <fct>, Condition_2 <fct>,
## # Bldg_Type <fct>, House_Style <fct>, Overall_Cond <fct>, Year_Built <int>,
## # Year_Remod_Add <int>, Roof_Style <fct>, Roof_Matl <fct>,
## # Exterior_1st <fct>, Exterior_2nd <fct>, Mas_Vnr_Type <fct>,
## # Mas_Vnr_Area <dbl>, Exter_Cond <fct>, Foundation <fct>, Bsmt_Cond <fct>, …
5 Levels of Data Wrangling Every R User Must Master
In today’s blog post, I’ll show you how to go from {dplyr}
beginner to pro in 5 simple stages. As always there’s also a video version available to this blog post:
On this endeavor, let’s work with the Ames housing data set.
Level 1: Do everything manually
Let’s run a little analysis of comparing area sizes of different parts of the house by neighborhood. All of the columns that contain sizes are indicated by “SF” (as in square fee).
In our analysis, we’ll grab the required columns, transform them to square meters and then calculate the mean size by neighborhood. In Level 1, this will all be done manually:
|>
housing_dat select(
Neighborhood,
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF|>
) mutate(
BsmtFin_SF_1 = BsmtFin_SF_1 / 10.7639,
BsmtFin_SF_2 = BsmtFin_SF_2 / 10.7639,
Bsmt_Unf_SF = Bsmt_Unf_SF / 10.7639,
Total_Bsmt_SF = Total_Bsmt_SF / 10.7639,
First_Flr_SF = First_Flr_SF / 10.7639,
Second_Flr_SF = Second_Flr_SF / 10.7639,
Wood_Deck_SF = Wood_Deck_SF / 10.7639,
Open_Porch_SF = Open_Porch_SF / 10.7639
|>
) summarize(
BsmtFin_SF_1 = mean(BsmtFin_SF_1),
BsmtFin_SF_2 = mean(BsmtFin_SF_2),
Bsmt_Unf_SF = mean(Bsmt_Unf_SF),
Total_Bsmt_SF = mean(Total_Bsmt_SF),
First_Flr_SF = mean(First_Flr_SF),
Second_Flr_SF = mean(Second_Flr_SF),
Wood_Deck_SF = mean(Wood_Deck_SF),
Open_Porch_SF = mean(Open_Porch_SF),
.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
Level 2: Use tidyselect helpers in select()
In Level 2, we can replace the long list of names by a tidyselect helper inside the select()
call.
|>
housing_dat select(Neighborhood, contains('SF')) |>
mutate(
BsmtFin_SF_1 = BsmtFin_SF_1 / 10.7639,
BsmtFin_SF_2 = BsmtFin_SF_2 / 10.7639,
Bsmt_Unf_SF = Bsmt_Unf_SF / 10.7639,
Total_Bsmt_SF = Total_Bsmt_SF / 10.7639,
First_Flr_SF = First_Flr_SF / 10.7639,
Second_Flr_SF = Second_Flr_SF / 10.7639,
Wood_Deck_SF = Wood_Deck_SF / 10.7639,
Open_Porch_SF = Open_Porch_SF / 10.7639
|>
) summarize(
BsmtFin_SF_1 = mean(BsmtFin_SF_1),
BsmtFin_SF_2 = mean(BsmtFin_SF_2),
Bsmt_Unf_SF = mean(Bsmt_Unf_SF),
Total_Bsmt_SF = mean(Total_Bsmt_SF),
First_Flr_SF = mean(First_Flr_SF),
Second_Flr_SF = mean(Second_Flr_SF),
Wood_Deck_SF = mean(Wood_Deck_SF),
Open_Porch_SF = mean(Open_Porch_SF),
.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
Level 3: Use across with built-in functions
Now, we can remove a lot of duplicate code inside of summarize()
with help from the across()
function. Inside this helper function, we specify which columns (.cols
) we want to iterate over and what function .fns
we want to apply each time.
|>
housing_dat select(Neighborhood, contains('SF')) |>
mutate(
BsmtFin_SF_1 = BsmtFin_SF_1 / 10.7639,
BsmtFin_SF_2 = BsmtFin_SF_2 / 10.7639,
Bsmt_Unf_SF = Bsmt_Unf_SF / 10.7639,
Total_Bsmt_SF = Total_Bsmt_SF / 10.7639,
First_Flr_SF = First_Flr_SF / 10.7639,
Second_Flr_SF = Second_Flr_SF / 10.7639,
Wood_Deck_SF = Wood_Deck_SF / 10.7639,
Open_Porch_SF = Open_Porch_SF / 10.7639
|>
) summarize(
across(
.cols = c(
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF
),.fns = mean
),.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
Level 4: Use across with custom function
Similarly, we can use across()
also inside of mutate()
. But this time we also have to define a function that transforms square feet to square meters.
|>
housing_dat select(Neighborhood, contains('SF')) |>
mutate(
across(
.cols = c(
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF
),.fns = \(sqft) sqft / 10.7639
)|>
) summarize(
across(
.cols = c(
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF
),.fns = mean
),.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
And of course you can use this technique to use custom function arguments as well.
|>
housing_dat select(Neighborhood, contains('SF')) |>
mutate(
across(
.cols = c(
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF
),.fns = \(sqft) sqft / 10.7639
)|>
) summarize(
across(
.cols = c(
BsmtFin_SF_1,
BsmtFin_SF_2,
Bsmt_Unf_SF,
Total_Bsmt_SF,
First_Flr_SF,
Second_Flr_SF,
Wood_Deck_SF,
Open_Porch_SF
),.fns = \(sqft) mean(sqft, na.rm = TRUE)
),.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
Level 5: Use tidyselect helpers in across()
Finally, we can also use the tidyselect helpers inside of acrouss()
. This is incredibly convenient and makes the code much shorter.
|>
housing_dat select(Neighborhood, contains('SF')) |>
mutate(
across(
.cols = contains('SF'),
.fns = \(sqft) sqft / 10.7639
)|>
) summarize(
across(
.cols = contains('SF'),
.fns = mean
),.by = Neighborhood
)## # A tibble: 28 × 9
## Neighborhood BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8 109.
## 2 Gilbert 0.480 0.0636 58.6 81.3 87.4
## 3 Stone_Brook 0.341 2.76 70.8 145. 147.
## 4 Northwest_A… 0.277 5.81 50.6 109. 122.
## 5 Somerset 0.427 0.776 72.6 110. 110.
## 6 Briardale 0.356 0.910 27.4 52.3 52.3
## 7 Northpark_V… 0.137 8.34 41.2 85.8 85.3
## 8 Northridge_… 0.371 0.847 80.8 151. 150.
## 9 Bloomington… 0.392 0 91.0 121. 131.
## 10 Northridge 0.335 8.40 56.4 133. 137.
## # ℹ 18 more rows
## # ℹ 3 more variables: Second_Flr_SF <dbl>, Wood_Deck_SF <dbl>,
## # Open_Porch_SF <dbl>
Bonus-Level: Use clean names
And to throw in a little bonus level for you, let me mention the clean_names()
function from the {janitor}
package. It transforms all column names into snake_case. That way, you’ll have much more programming friendly column names.
And I know: Sounds mundane but it will save you lots of time in the long run.
|>
housing_dat ::clean_names() |>
janitorselect(neighborhood, contains('_sf')) |>
mutate(
across(
.cols = contains('_sf'),
.fns = \(sqft) sqft / 10.7639
)|>
) summarize(
across(
.cols = contains('_sf'),
.fns = mean
),.by = neighborhood
)## # A tibble: 28 × 9
## neighborhood bsmt_fin_sf_1 bsmt_fin_sf_2 bsmt_unf_sf total_bsmt_sf
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 0.340 7.77 43.0 95.8
## 2 Gilbert 0.480 0.0636 58.6 81.3
## 3 Stone_Brook 0.341 2.76 70.8 145.
## 4 Northwest_Ames 0.277 5.81 50.6 109.
## 5 Somerset 0.427 0.776 72.6 110.
## 6 Briardale 0.356 0.910 27.4 52.3
## 7 Northpark_Villa 0.137 8.34 41.2 85.8
## 8 Northridge_Heights 0.371 0.847 80.8 151.
## 9 Bloomington_Heights 0.392 0 91.0 121.
## 10 Northridge 0.335 8.40 56.4 133.
## # ℹ 18 more rows
## # ℹ 4 more variables: first_flr_sf <dbl>, second_flr_sf <dbl>,
## # wood_deck_sf <dbl>, open_porch_sf <dbl>