library(dplyr)
library(data.table)
<- modeldata::ames |>
ames ::clean_names() |>
janitoras_tibble() # this is already the case but for comparability done here again
ames## # A tibble: 2,930 × 74
## ms_sub_class ms_zoning lot_frontage lot_area street alley lot_shape
## <fct> <fct> <dbl> <int> <fct> <fct> <fct>
## 1 One_Story_1946_and_Ne… Resident… 141 31770 Pave No_A… Slightly…
## 2 One_Story_1946_and_Ne… Resident… 80 11622 Pave No_A… Regular
## 3 One_Story_1946_and_Ne… Resident… 81 14267 Pave No_A… Slightly…
## 4 One_Story_1946_and_Ne… Resident… 93 11160 Pave No_A… Regular
## 5 Two_Story_1946_and_Ne… Resident… 74 13830 Pave No_A… Slightly…
## 6 Two_Story_1946_and_Ne… Resident… 78 9978 Pave No_A… Slightly…
## 7 One_Story_PUD_1946_an… Resident… 41 4920 Pave No_A… Regular
## 8 One_Story_PUD_1946_an… Resident… 43 5005 Pave No_A… Slightly…
## 9 One_Story_PUD_1946_an… Resident… 39 5389 Pave No_A… Slightly…
## 10 Two_Story_1946_and_Ne… Resident… 60 7500 Pave No_A… Regular
## # ℹ 2,920 more rows
## # ℹ 67 more variables: land_contour <fct>, utilities <fct>, lot_config <fct>,
## # land_slope <fct>, neighborhood <fct>, condition_1 <fct>, condition_2 <fct>,
## # bldg_type <fct>, house_style <fct>, overall_cond <fct>, year_built <int>,
## # year_remod_add <int>, roof_style <fct>, roof_matl <fct>,
## # exterior_1st <fct>, exterior_2nd <fct>, mas_vnr_type <fct>,
## # mas_vnr_area <dbl>, exter_cond <fct>, foundation <fct>, bsmt_cond <fct>, …
data.table
vs dplyr
: A Side-by-Side Comparison
dplyr
and data.table
.
In today’s blogpost I show you how to do common data cleaning operations via both {data.table}
and {dplyr}
. These are two fantastic frameworks inside the R ecosystem. As always, there’s also a video version available:
Get data
First, let us take a look at our data.
We will need to convert it to a data.table
.
<- as.data.table(ames)
df_ames
df_ames## ms_sub_class ms_zoning lot_frontage
## <fctr> <fctr> <num>
## 1: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 141
## 2: One_Story_1946_and_Newer_All_Styles Residential_High_Density 80
## 3: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 81
## 4: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 93
## 5: Two_Story_1946_and_Newer Residential_Low_Density 74
## ---
## 2926: Split_or_Multilevel Residential_Low_Density 37
## 2927: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 0
## 2928: Split_Foyer Residential_Low_Density 62
## 2929: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 77
## 2930: Two_Story_1946_and_Newer Residential_Low_Density 74
## lot_area street alley lot_shape land_contour utilities
## <int> <fctr> <fctr> <fctr> <fctr> <fctr>
## 1: 31770 Pave No_Alley_Access Slightly_Irregular Lvl AllPub
## 2: 11622 Pave No_Alley_Access Regular Lvl AllPub
## 3: 14267 Pave No_Alley_Access Slightly_Irregular Lvl AllPub
## 4: 11160 Pave No_Alley_Access Regular Lvl AllPub
## 5: 13830 Pave No_Alley_Access Slightly_Irregular Lvl AllPub
## ---
## 2926: 7937 Pave No_Alley_Access Slightly_Irregular Lvl AllPub
## 2927: 8885 Pave No_Alley_Access Slightly_Irregular Low AllPub
## 2928: 10441 Pave No_Alley_Access Regular Lvl AllPub
## 2929: 10010 Pave No_Alley_Access Regular Lvl AllPub
## 2930: 9627 Pave No_Alley_Access Regular Lvl AllPub
## lot_config land_slope neighborhood condition_1 condition_2 bldg_type
## <fctr> <fctr> <fctr> <fctr> <fctr> <fctr>
## 1: Corner Gtl North_Ames Norm Norm OneFam
## 2: Inside Gtl North_Ames Feedr Norm OneFam
## 3: Corner Gtl North_Ames Norm Norm OneFam
## 4: Corner Gtl North_Ames Norm Norm OneFam
## 5: Inside Gtl Gilbert Norm Norm OneFam
## ---
## 2926: CulDSac Gtl Mitchell Norm Norm OneFam
## 2927: Inside Mod Mitchell Norm Norm OneFam
## 2928: Inside Gtl Mitchell Norm Norm OneFam
## 2929: Inside Mod Mitchell Norm Norm OneFam
## 2930: Inside Mod Mitchell Norm Norm OneFam
## house_style overall_cond year_built year_remod_add roof_style roof_matl
## <fctr> <fctr> <int> <int> <fctr> <fctr>
## 1: One_Story Average 1960 1960 Hip CompShg
## 2: One_Story Above_Average 1961 1961 Gable CompShg
## 3: One_Story Above_Average 1958 1958 Hip CompShg
## 4: One_Story Average 1968 1968 Hip CompShg
## 5: Two_Story Average 1997 1998 Gable CompShg
## ---
## 2926: SLvl Above_Average 1984 1984 Gable CompShg
## 2927: One_Story Average 1983 1983 Gable CompShg
## 2928: SFoyer Average 1992 1992 Gable CompShg
## 2929: One_Story Average 1974 1975 Gable CompShg
## 2930: Two_Story Average 1993 1994 Gable CompShg
## exterior_1st exterior_2nd mas_vnr_type mas_vnr_area exter_cond foundation
## <fctr> <fctr> <fctr> <num> <fctr> <fctr>
## 1: BrkFace Plywood Stone 112 Typical CBlock
## 2: VinylSd VinylSd None 0 Typical CBlock
## 3: Wd Sdng Wd Sdng BrkFace 108 Typical CBlock
## 4: BrkFace BrkFace None 0 Typical CBlock
## 5: VinylSd VinylSd None 0 Typical PConc
## ---
## 2926: HdBoard HdBoard None 0 Typical CBlock
## 2927: HdBoard HdBoard None 0 Typical CBlock
## 2928: HdBoard Wd Shng None 0 Typical PConc
## 2929: HdBoard HdBoard None 0 Typical CBlock
## 2930: HdBoard HdBoard BrkFace 94 Typical PConc
## bsmt_cond bsmt_exposure bsmt_fin_type_1 bsmt_fin_sf_1 bsmt_fin_type_2
## <fctr> <fctr> <fctr> <num> <fctr>
## 1: Good Gd BLQ 2 Unf
## 2: Typical No Rec 6 LwQ
## 3: Typical No ALQ 1 Unf
## 4: Typical No ALQ 1 Unf
## 5: Typical No GLQ 3 Unf
## ---
## 2926: Typical Av GLQ 3 Unf
## 2927: Typical Av BLQ 2 ALQ
## 2928: Typical Av GLQ 3 Unf
## 2929: Typical Av ALQ 1 LwQ
## 2930: Typical Av LwQ 4 Unf
## bsmt_fin_sf_2 bsmt_unf_sf total_bsmt_sf heating heating_qc central_air
## <num> <num> <num> <fctr> <fctr> <fctr>
## 1: 0 441 1080 GasA Fair Y
## 2: 144 270 882 GasA Typical Y
## 3: 0 406 1329 GasA Typical Y
## 4: 0 1045 2110 GasA Excellent Y
## 5: 0 137 928 GasA Good Y
## ---
## 2926: 0 184 1003 GasA Typical Y
## 2927: 324 239 864 GasA Typical Y
## 2928: 0 575 912 GasA Typical Y
## 2929: 123 195 1389 GasA Good Y
## 2930: 0 238 996 GasA Excellent Y
## electrical first_flr_sf second_flr_sf gr_liv_area bsmt_full_bath
## <fctr> <int> <int> <int> <num>
## 1: SBrkr 1656 0 1656 1
## 2: SBrkr 896 0 896 0
## 3: SBrkr 1329 0 1329 0
## 4: SBrkr 2110 0 2110 1
## 5: SBrkr 928 701 1629 0
## ---
## 2926: SBrkr 1003 0 1003 1
## 2927: SBrkr 902 0 902 1
## 2928: SBrkr 970 0 970 0
## 2929: SBrkr 1389 0 1389 1
## 2930: SBrkr 996 1004 2000 0
## bsmt_half_bath full_bath half_bath bedroom_abv_gr kitchen_abv_gr
## <num> <int> <int> <int> <int>
## 1: 0 1 0 3 1
## 2: 0 1 0 2 1
## 3: 0 1 1 3 1
## 4: 0 2 1 3 1
## 5: 0 2 1 3 1
## ---
## 2926: 0 1 0 3 1
## 2927: 0 1 0 2 1
## 2928: 1 1 0 3 1
## 2929: 0 1 0 2 1
## 2930: 0 2 1 3 1
## tot_rms_abv_grd functional fireplaces garage_type garage_finish
## <int> <fctr> <int> <fctr> <fctr>
## 1: 7 Typ 2 Attchd Fin
## 2: 5 Typ 0 Attchd Unf
## 3: 6 Typ 0 Attchd Unf
## 4: 8 Typ 2 Attchd Fin
## 5: 6 Typ 1 Attchd Fin
## ---
## 2926: 6 Typ 0 Detchd Unf
## 2927: 5 Typ 0 Attchd Unf
## 2928: 6 Typ 0 No_Garage No_Garage
## 2929: 6 Typ 1 Attchd RFn
## 2930: 9 Typ 1 Attchd Fin
## garage_cars garage_area garage_cond paved_drive wood_deck_sf
## <num> <num> <fctr> <fctr> <int>
## 1: 2 528 Typical Partial_Pavement 210
## 2: 1 730 Typical Paved 140
## 3: 1 312 Typical Paved 393
## 4: 2 522 Typical Paved 0
## 5: 2 482 Typical Paved 212
## ---
## 2926: 2 588 Typical Paved 120
## 2927: 2 484 Typical Paved 164
## 2928: 0 0 No_Garage Paved 80
## 2929: 2 418 Typical Paved 240
## 2930: 3 650 Typical Paved 190
## open_porch_sf enclosed_porch three_season_porch screen_porch pool_area
## <int> <int> <int> <int> <int>
## 1: 62 0 0 0 0
## 2: 0 0 0 120 0
## 3: 36 0 0 0 0
## 4: 0 0 0 0 0
## 5: 34 0 0 0 0
## ---
## 2926: 0 0 0 0 0
## 2927: 0 0 0 0 0
## 2928: 32 0 0 0 0
## 2929: 38 0 0 0 0
## 2930: 48 0 0 0 0
## pool_qc fence misc_feature misc_val mo_sold year_sold sale_type
## <fctr> <fctr> <fctr> <int> <int> <int> <fctr>
## 1: No_Pool No_Fence None 0 5 2010 WD
## 2: No_Pool Minimum_Privacy None 0 6 2010 WD
## 3: No_Pool No_Fence Gar2 12500 6 2010 WD
## 4: No_Pool No_Fence None 0 4 2010 WD
## 5: No_Pool Minimum_Privacy None 0 3 2010 WD
## ---
## 2926: No_Pool Good_Privacy None 0 3 2006 WD
## 2927: No_Pool Minimum_Privacy None 0 6 2006 WD
## 2928: No_Pool Minimum_Privacy Shed 700 7 2006 WD
## 2929: No_Pool No_Fence None 0 4 2006 WD
## 2930: No_Pool No_Fence None 0 11 2006 WD
## sale_condition sale_price longitude latitude
## <fctr> <int> <num> <num>
## 1: Normal 215000 -93.61975 42.05403
## 2: Normal 105000 -93.61976 42.05301
## 3: Normal 172000 -93.61939 42.05266
## 4: Normal 244000 -93.61732 42.05125
## 5: Normal 189900 -93.63893 42.06090
## ---
## 2926: Normal 142500 -93.60478 41.98896
## 2927: Normal 131000 -93.60268 41.98831
## 2928: Normal 132000 -93.60685 41.98651
## 2929: Normal 170000 -93.60019 41.99092
## 2930: Normal 188000 -93.60000 41.98927
Also, let us set options for nicer data.table
outputs. I don’t want lot’s of columns and rows to flood my console.
# Compact printing similar to tibble
options(
datatable.print.topn = 5, # Show only top 10 rows
datatable.print.nrows = 10, # Limit number of printed rows
datatable.print.class = TRUE, # Show column classes
datatable.print.trunc.cols = TRUE # Truncate wide columns
)
The general difference
{dplyr}
uses functions/verbs that are chained together. This has the advantage that it’s easily readable to anybody because the function names are usually understandable even to non-programmers.
On the other hand {data.table}
uses a super concise code style with lots of abbreviations. This has the advantage that the code is short but if you don’t know the system then you might feel lost.
So let me explain the system behind {data.table}
. The most basic thing to understand is that a data.table df
can be modified via a bracket using up to three different components. This might look something like df[i, j, by]
.
The first component i
refers to rows that are “modified”. The second component j
refers to columns that might be modified
. And the third component by
refers to any grouping you might want to use.
But enough theory. Let’s see this in action.
Sort specific columns
|> arrange(lot_frontage)
ames ## # A tibble: 2,930 × 74
## ms_sub_class ms_zoning lot_frontage lot_area street alley lot_shape
## <fct> <fct> <dbl> <int> <fct> <fct> <fct>
## 1 One_Story_1946_and_Ne… Resident… 0 7980 Pave No_A… Slightly…
## 2 One_Story_PUD_1946_an… Resident… 0 6820 Pave No_A… Slightly…
## 3 Two_Story_1946_and_Ne… Floating… 0 7500 Pave No_A… Regular
## 4 One_Story_1946_and_Ne… Resident… 0 11241 Pave No_A… Slightly…
## 5 One_Story_1946_and_Ne… Resident… 0 12537 Pave No_A… Slightly…
## 6 Two_Story_1946_and_Ne… Resident… 0 7851 Pave No_A… Regular
## 7 Split_or_Multilevel Resident… 0 7750 Pave No_A… Slightly…
## 8 Two_Story_1946_and_Ne… Resident… 0 9505 Pave No_A… Slightly…
## 9 Two_Story_1946_and_Ne… Resident… 0 8880 Pave No_A… Slightly…
## 10 Two_Story_1946_and_Ne… Resident… 0 9453 Pave No_A… Slightly…
## # ℹ 2,920 more rows
## # ℹ 67 more variables: land_contour <fct>, utilities <fct>, lot_config <fct>,
## # land_slope <fct>, neighborhood <fct>, condition_1 <fct>, condition_2 <fct>,
## # bldg_type <fct>, house_style <fct>, overall_cond <fct>, year_built <int>,
## # year_remod_add <int>, roof_style <fct>, roof_matl <fct>,
## # exterior_1st <fct>, exterior_2nd <fct>, mas_vnr_type <fct>,
## # mas_vnr_area <dbl>, exter_cond <fct>, foundation <fct>, bsmt_cond <fct>, …
order(lot_frontage)]
df_ames[## ms_sub_class ms_zoning
## <fctr> <fctr>
## 1: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## 2: One_Story_PUD_1946_and_Newer Residential_Low_Density
## 3: Two_Story_1946_and_Newer Floating_Village_Residential
## 4: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## 5: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## ---
## 2926: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## 2927: Two_Family_conversion_All_Styles_and_Ages Residential_Low_Density
## 2928: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## 2929: One_Story_1946_and_Newer_All_Styles Residential_Low_Density
## 2930: Two_Story_1946_and_Newer Residential_Low_Density
## 72 variable(s) not shown: [lot_frontage <num>, lot_area <int>, street <fctr>, alley <fctr>, lot_shape <fctr>, land_contour <fctr>, utilities <fctr>, lot_config <fctr>, land_slope <fctr>, neighborhood <fctr>, ...]
Filter for specific rows
|> filter(sale_price > 300000)
ames ## # A tibble: 230 × 74
## ms_sub_class ms_zoning lot_frontage lot_area street alley lot_shape
## <fct> <fct> <dbl> <int> <fct> <fct> <fct>
## 1 Two_Story_1946_and_Ne… Resident… 47 53504 Pave No_A… Moderate…
## 2 One_Story_1946_and_Ne… Resident… 88 11394 Pave No_A… Regular
## 3 Two_Story_1946_and_Ne… Resident… 102 12858 Pave No_A… Slightly…
## 4 One_Story_1946_and_Ne… Resident… 98 11478 Pave No_A… Regular
## 5 One_Story_1946_and_Ne… Resident… 83 10159 Pave No_A… Slightly…
## 6 One_Story_1946_and_Ne… Resident… 100 12919 Pave No_A… Slightly…
## 7 One_Story_1946_and_Ne… Resident… 110 14300 Pave No_A… Regular
## 8 Two_Story_1946_and_Ne… Resident… 105 13650 Pave No_A… Regular
## 9 One_Story_PUD_1946_an… Resident… 61 7658 Pave No_A… Regular
## 10 Two_Story_1946_and_Ne… Resident… 108 14774 Pave No_A… Slightly…
## # ℹ 220 more rows
## # ℹ 67 more variables: land_contour <fct>, utilities <fct>, lot_config <fct>,
## # land_slope <fct>, neighborhood <fct>, condition_1 <fct>, condition_2 <fct>,
## # bldg_type <fct>, house_style <fct>, overall_cond <fct>, year_built <int>,
## # year_remod_add <int>, roof_style <fct>, roof_matl <fct>,
## # exterior_1st <fct>, exterior_2nd <fct>, mas_vnr_type <fct>,
## # mas_vnr_area <dbl>, exter_cond <fct>, foundation <fct>, bsmt_cond <fct>, …
> 300000]
df_ames[sale_price ## ms_sub_class ms_zoning lot_frontage
## <fctr> <fctr> <num>
## 1: Two_Story_1946_and_Newer Residential_Low_Density 47
## 2: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 88
## 3: Two_Story_1946_and_Newer Residential_Low_Density 102
## 4: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 98
## 5: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 83
## ---
## 226: Two_Story_1946_and_Newer Residential_Low_Density 42
## 227: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 85
## 228: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 95
## 229: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 88
## 230: One_Story_1946_and_Newer_All_Styles Residential_Low_Density 88
## 71 variable(s) not shown: [lot_area <int>, street <fctr>, alley <fctr>, lot_shape <fctr>, land_contour <fctr>, utilities <fctr>, lot_config <fctr>, land_slope <fctr>, neighborhood <fctr>, condition_1 <fctr>, ...]
Select specific columns
|>
ames filter(sale_price > 300000) |>
select(neighborhood, sale_price, lot_area)
## # A tibble: 230 × 3
## neighborhood sale_price lot_area
## <fct> <int> <int>
## 1 Stone_Brook 538000 53504
## 2 Stone_Brook 394432 11394
## 3 Northridge_Heights 376162 12858
## 4 Northridge_Heights 306000 11478
## 5 Northridge_Heights 395192 10159
## 6 Northridge_Heights 611657 12919
## 7 Northridge_Heights 500000 14300
## 8 Northridge_Heights 320000 13650
## 9 Northridge_Heights 319900 7658
## 10 Northridge 333168 14774
## # ℹ 220 more rows
df_ames[> 300000,
sale_price list(neighborhood, sale_price, lot_area)
]## neighborhood sale_price lot_area
## <fctr> <int> <int>
## 1: Stone_Brook 538000 53504
## 2: Stone_Brook 394432 11394
## 3: Northridge_Heights 376162 12858
## 4: Northridge_Heights 306000 11478
## 5: Northridge_Heights 395192 10159
## ---
## 226: Timberland 335000 26178
## 227: Timberland 312500 14331
## 228: Timberland 320000 13618
## 229: Timberland 369900 11443
## 230: Timberland 359900 11577
If you don’t want to type out the list()
command, you can also use data.table
’s shorthand notation .()
.
df_ames[> 300000,
sale_price
.(neighborhood, sale_price, lot_area)
]## neighborhood sale_price lot_area
## <fctr> <int> <int>
## 1: Stone_Brook 538000 53504
## 2: Stone_Brook 394432 11394
## 3: Northridge_Heights 376162 12858
## 4: Northridge_Heights 306000 11478
## 5: Northridge_Heights 395192 10159
## ---
## 226: Timberland 335000 26178
## 227: Timberland 312500 14331
## 228: Timberland 320000 13618
## 229: Timberland 369900 11443
## 230: Timberland 359900 11577
Select many columns by name
|> select(contains('area'))
ames ## # A tibble: 2,930 × 5
## lot_area mas_vnr_area gr_liv_area garage_area pool_area
## <int> <dbl> <int> <dbl> <int>
## 1 31770 112 1656 528 0
## 2 11622 0 896 730 0
## 3 14267 108 1329 312 0
## 4 11160 0 2110 522 0
## 5 13830 0 1629 482 0
## 6 9978 20 1604 470 0
## 7 4920 0 1338 582 0
## 8 5005 0 1280 506 0
## 9 5389 0 1616 608 0
## 10 7500 0 1804 442 0
## # ℹ 2,920 more rows
= patterns('area')]
df_ames[, .SD, .SDcols ## lot_area mas_vnr_area gr_liv_area garage_area pool_area
## <int> <num> <int> <num> <int>
## 1: 31770 112 1656 528 0
## 2: 11622 0 896 730 0
## 3: 14267 108 1329 312 0
## 4: 11160 0 2110 522 0
## 5: 13830 0 1629 482 0
## ---
## 2926: 7937 0 1003 588 0
## 2927: 8885 0 902 484 0
## 2928: 10441 0 970 0 0
## 2929: 10010 0 1389 418 0
## 2930: 9627 94 2000 650 0
Select many columns by type
|>
ames select(where(is.numeric))
## # A tibble: 2,930 × 34
## lot_frontage lot_area year_built year_remod_add mas_vnr_area bsmt_fin_sf_1
## <dbl> <int> <int> <int> <dbl> <dbl>
## 1 141 31770 1960 1960 112 2
## 2 80 11622 1961 1961 0 6
## 3 81 14267 1958 1958 108 1
## 4 93 11160 1968 1968 0 1
## 5 74 13830 1997 1998 0 3
## 6 78 9978 1998 1998 20 3
## 7 41 4920 2001 2001 0 3
## 8 43 5005 1992 1992 0 1
## 9 39 5389 1995 1996 0 3
## 10 60 7500 1999 1999 0 7
## # ℹ 2,920 more rows
## # ℹ 28 more variables: bsmt_fin_sf_2 <dbl>, bsmt_unf_sf <dbl>,
## # total_bsmt_sf <dbl>, first_flr_sf <int>, second_flr_sf <int>,
## # gr_liv_area <int>, bsmt_full_bath <dbl>, bsmt_half_bath <dbl>,
## # full_bath <int>, half_bath <int>, bedroom_abv_gr <int>,
## # kitchen_abv_gr <int>, tot_rms_abv_grd <int>, fireplaces <int>,
## # garage_cars <dbl>, garage_area <dbl>, wood_deck_sf <int>, …
= is.numeric]
df_ames[, .SD, .SDcols ## lot_frontage lot_area year_built year_remod_add mas_vnr_area
## <num> <int> <int> <int> <num>
## 1: 141 31770 1960 1960 112
## 2: 80 11622 1961 1961 0
## 3: 81 14267 1958 1958 108
## 4: 93 11160 1968 1968 0
## 5: 74 13830 1997 1998 0
## ---
## 2926: 37 7937 1984 1984 0
## 2927: 0 8885 1983 1983 0
## 2928: 62 10441 1992 1992 0
## 2929: 77 10010 1974 1975 0
## 2930: 74 9627 1993 1994 94
## 29 variable(s) not shown: [bsmt_fin_sf_1 <num>, bsmt_fin_sf_2 <num>, bsmt_unf_sf <num>, total_bsmt_sf <num>, first_flr_sf <int>, second_flr_sf <int>, gr_liv_area <int>, bsmt_full_bath <num>, bsmt_half_bath <num>, full_bath <int>, ...]
Compute a new column
|>
ames filter(sale_price > 300000) |>
select(neighborhood, sale_price, lot_area) |>
mutate(
price_by_lot_area = sale_price / lot_area
)## # A tibble: 230 × 4
## neighborhood sale_price lot_area price_by_lot_area
## <fct> <int> <int> <dbl>
## 1 Stone_Brook 538000 53504 10.1
## 2 Stone_Brook 394432 11394 34.6
## 3 Northridge_Heights 376162 12858 29.3
## 4 Northridge_Heights 306000 11478 26.7
## 5 Northridge_Heights 395192 10159 38.9
## 6 Northridge_Heights 611657 12919 47.3
## 7 Northridge_Heights 500000 14300 35.0
## 8 Northridge_Heights 320000 13650 23.4
## 9 Northridge_Heights 319900 7658 41.8
## 10 Northridge 333168 14774 22.6
## # ℹ 220 more rows
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area
)
]## neighborhood sale_price lot_area price_by_lot_area
## <fctr> <int> <int> <num>
## 1: Stone_Brook 538000 53504 10.05532
## 2: Stone_Brook 394432 11394 34.61752
## 3: Northridge_Heights 376162 12858 29.25509
## 4: Northridge_Heights 306000 11478 26.65970
## 5: Northridge_Heights 395192 10159 38.90068
## ---
## 226: Timberland 335000 26178 12.79701
## 227: Timberland 312500 14331 21.80588
## 228: Timberland 320000 13618 23.49831
## 229: Timberland 369900 11443 32.32544
## 230: Timberland 359900 11577 31.08750
Specialty of data.table
: modify in place
In the last step we have seen that mutate()
and using a list()
inside the []
of a data.table
work almost similarly. But there’s a specialty that you might want to keep in mind.
You see, in the previous example we created a new data.table
. And that’s why we have seen an output after df_ames[...]
. But checkout what happens if I don’t use list()
and instead use the :=
operator inside the brackets.
df_ames[> 300000,
sale_price := sale_price / lot_area
price_by_lot_area ]
See? No output. But if we look at df_ames
, we’ll notice that the price_by_lot_area
was actually created. And that happened even though we never overwrote df_ames
like df_ames <-
.
# Notice no filter on rows here as first argument was left blank.
|> head(30)
df_ames[, price_by_lot_area] ## [1] NA NA NA NA NA NA NA NA
## [9] NA NA NA NA NA NA NA 10.05532
## [17] NA 34.61752 NA NA NA NA NA NA
## [25] NA NA NA NA NA NA
Notice now that we have lots of NA
s in that column now. This happens because we instructed R to only compute the price_by_lot_area
column where sale_price > 300000
. Interesting how that works, isn’t it?
If you wanted to calculate all values, you could leave out the filtering condition.
:= sale_price / lot_area]
df_ames[, price_by_lot_area |> head(30)
df_ames[, price_by_lot_area] ## [1] 6.767391 9.034590 12.055793 21.863799 13.731020 19.593105 43.394309
## [8] 38.261738 43.885693 25.200000 17.590000 23.182957 21.471078 16.853381
## [15] 31.085044 10.055323 13.515741 34.617518 7.367541 15.939279 16.168837
## [22] 16.000000 28.800000 13.255048 11.956608 16.804734 15.000000 10.952381
## [29] 31.410038 57.142857
Calculate multiple new columns
|>
ames filter(sale_price > 300000) |>
select(neighborhood, sale_price, lot_area) |>
mutate(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log_sale_price / lot_area
)## # A tibble: 230 × 6
## neighborhood sale_price lot_area price_by_lot_area log_sale_price
## <fct> <int> <int> <dbl> <dbl>
## 1 Stone_Brook 538000 53504 10.1 13.2
## 2 Stone_Brook 394432 11394 34.6 12.9
## 3 Northridge_Heights 376162 12858 29.3 12.8
## 4 Northridge_Heights 306000 11478 26.7 12.6
## 5 Northridge_Heights 395192 10159 38.9 12.9
## 6 Northridge_Heights 611657 12919 47.3 13.3
## 7 Northridge_Heights 500000 14300 35.0 13.1
## 8 Northridge_Heights 320000 13650 23.4 12.7
## 9 Northridge_Heights 319900 7658 41.8 12.7
## 10 Northridge 333168 14774 22.6 12.7
## # ℹ 220 more rows
## # ℹ 1 more variable: log_price_by_lot_area <dbl>
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log(sale_price) / lot_area
)
]## neighborhood sale_price lot_area price_by_lot_area log_sale_price
## <fctr> <int> <int> <num> <num>
## 1: Stone_Brook 538000 53504 10.05532 13.19561
## 2: Stone_Brook 394432 11394 34.61752 12.88520
## 3: Northridge_Heights 376162 12858 29.25509 12.83778
## 4: Northridge_Heights 306000 11478 26.65970 12.63134
## 5: Northridge_Heights 395192 10159 38.90068 12.88713
## ---
## 226: Timberland 335000 26178 12.79701 12.72189
## 227: Timberland 312500 14331 21.80588 12.65236
## 228: Timberland 320000 13618 23.49831 12.67608
## 229: Timberland 369900 11443 32.32544 12.82099
## 230: Timberland 359900 11577 31.08750 12.79358
## 1 variable(s) not shown: [log_price_by_lot_area <num>]
To create multiple columns in place, you can either use the let()
command
df_ames[> 300000,
sale_price let(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log(sale_price) / lot_area
) ]
or the :=
operator.
df_ames[> 300000,
sale_price `:=`(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log(sale_price) / lot_area
) ]
Specialty of data.table
: chaining
Notice that in both previous data.table
examples we have used log(sale_price)
twice. That means after calculating the log_sale_price
column, we didn’t actually use that column later on. The reason for this is that it simply doesn’t work. You cannot calculate a new column and use it in the same step.
# reset df_ames so that `log_sale_price` doesn't exist
# due to previous calculations.
<- as.data.table(ames)
df_ames
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log_sale_price / lot_area # log_sale_price here
)
]## Error in eval(jsub, SDenv, parent.frame()): object 'log_sale_price' not found
One way to fix that is to use chain operations. You can do so by using multiple brackets like df_ames[...][...]
, you can chain multiple operations.
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
):= log_sale_price / lot_area] ][, log_price_by_lot_area
Notice that this technically worked and didn’t throw an error, but you cannot find any of the new columns in df_ames
.
list(neighborhood, log_price_by_lot_area)]
df_ames[, ## Error in eval(jsub, SDenv, parent.frame()): object 'log_price_by_lot_area' not found
list(neighborhood, log_sale_price)]
df_ames[, ## Error in eval(jsub, SDenv, parent.frame()): object 'log_sale_price' not found
Here’s why that happened: First, we created a new data.table by not using the in-place version. Remember how the list()
command creates a new data.table
? That’s why you get an output here:
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
)
]## neighborhood sale_price lot_area price_by_lot_area log_sale_price
## <fctr> <int> <int> <num> <num>
## 1: Stone_Brook 538000 53504 10.05532 13.19561
## 2: Stone_Brook 394432 11394 34.61752 12.88520
## 3: Northridge_Heights 376162 12858 29.25509 12.83778
## 4: Northridge_Heights 306000 11478 26.65970 12.63134
## 5: Northridge_Heights 395192 10159 38.90068 12.88713
## ---
## 226: Timberland 335000 26178 12.79701 12.72189
## 227: Timberland 312500 14331 21.80588 12.65236
## 228: Timberland 320000 13618 23.49831 12.67608
## 229: Timberland 369900 11443 32.32544 12.82099
## 230: Timberland 359900 11577 31.08750 12.79358
But if you don’t save this into a new variable, then this new data table isn’t saved. Now, in the next step, we use the in-place mode of operations on the new data.table.
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
):= log_sale_price / lot_area] ][, log_price_by_lot_area
This technically works but in-place calculations don’t return anything. They just modify the existing object. But the existing object is the new data.table that we still never saved anywhere. Hence, that calculation runs successfully but the results vanish the moment the calculation finishes.
What’s worse is that even if you try to save these results later, it’s no use. See? No results here:
<- df_ames[
df_test > 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
):= log_sale_price / lot_area]
][, log_price_by_lot_area df_test
So there are two ways you could fix this. Either chain your results without using the in-place calculations. This means either using list()
or .()
in later steps of the chain.
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
)log_price_by_lot_area = log_sale_price / lot_area)]
][, .(## log_price_by_lot_area
## <num>
## 1: 0.0002466285
## 2: 0.0011308761
## 3: 0.0009984271
## 4: 0.0011004827
## 5: 0.0012685429
## ---
## 226: 0.0004859762
## 227: 0.0008828665
## 228: 0.0009308324
## 229: 0.0011204219
## 230: 0.0011050861
As we’ve seen before, this will then only return a data.table
with the filtered rows. Or alternatively, you could use only in-place calculation.
df_ames[> 300000,
sale_price let(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
):= log_sale_price / lot_area]
][, log_price_by_lot_area
df_ames[# Set filter to see both NA and non-NA values
> 200000,
sale_price
.(neighborhood, price_by_lot_area, log_sale_price, log_price_by_lot_area)
]## neighborhood price_by_lot_area log_sale_price log_price_by_lot_area
## <fctr> <num> <num> <num>
## 1: North_Ames NA NA NA
## 2: North_Ames NA NA NA
## 3: Stone_Brook NA NA NA
## 4: Stone_Brook NA NA NA
## 5: Stone_Brook NA NA NA
## ---
## 853: Timberland 21.80588 12.65236 0.0008828665
## 854: Timberland 23.49831 12.67608 0.0009308324
## 855: Timberland 32.32544 12.82099 0.0011204219
## 856: Timberland 31.08750 12.79358 0.0011050861
## 857: Mitchell NA NA NA
But personally I prefer to chain things with a pipe operator. With the R native pipe operator |>
and its placeholder variable _
this works pretty well.
df_ames[> 300000,
sale_price list(
neighborhood,
sale_price,
lot_area,price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price)
)|>
] log_price_by_lot_area = log_sale_price / lot_area)]
_[, .(## log_price_by_lot_area
## <num>
## 1: 0.0002466285
## 2: 0.0011308761
## 3: 0.0009984271
## 4: 0.0011004827
## 5: 0.0012685429
## ---
## 226: 0.0004859762
## 227: 0.0008828665
## 228: 0.0009308324
## 229: 0.0011204219
## 230: 0.0011050861
Calculate summary statistics (without grouping)
|>
ames filter(sale_price > 300000) |>
select(neighborhood, sale_price, lot_area) |>
mutate(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log_sale_price / lot_area
|>
) summarize(
mean_sale_price = mean(sale_price),
n_houses = n()
)## # A tibble: 1 × 2
## mean_sale_price n_houses
## <dbl> <int>
## 1 378693. 230
In data.tables
the special variable .N
refers to the amount of rows. In this case .N
refers to the amount of all rows. In a grouped setting, .N
would refer to the group sizes.
df_ames[> 300000,
sale_price list(
mean_sale_price = mean(sale_price),
n_houses = .N
)
]## mean_sale_price n_houses
## <num> <int>
## 1: 378693.4 230
Calculate summary statistics (with grouping)
|>
ames filter(sale_price > 300000) |>
select(neighborhood, sale_price, lot_area) |>
mutate(
price_by_lot_area = sale_price / lot_area,
log_sale_price = log(sale_price),
log_price_by_lot_area = log_sale_price / lot_area
|>
) summarize(
mean_sale_price = mean(sale_price),
n_houses = n(),
.by = neighborhood # use grouping
)## # A tibble: 16 × 3
## neighborhood mean_sale_price n_houses
## <fct> <dbl> <int>
## 1 Stone_Brook 417381. 27
## 2 Northridge_Heights 389422. 91
## 3 Northridge 387282. 37
## 4 Clear_Creek 315000 2
## 5 Edwards 347677. 3
## 6 Sawyer_West 312533. 3
## 7 College_Creek 363104. 10
## 8 Crawford 347410 10
## 9 Timberland 353320 15
## 10 North_Ames 323300 2
## 11 Gilbert 335500 3
## 12 Somerset 345156 19
## 13 Veenker 350188. 4
## 14 Northwest_Ames 306000 1
## 15 Old_Town 400000 2
## 16 Green_Hills 330000 1
In data.tables
the special variable .N
refers to the amount of rows. In this case .N
refers to the amount of all rows. In a grouped setting, .N
would refer to the group sizes.
df_ames[> 300000,
sale_price list(
mean_sale_price = mean(sale_price),
n_houses = .N
),= neighborhood # use grouping
by
]## neighborhood mean_sale_price n_houses
## <fctr> <num> <int>
## 1: Stone_Brook 417381.1 27
## 2: Northridge_Heights 389421.9 91
## 3: Northridge 387281.6 37
## 4: Clear_Creek 315000.0 2
## 5: Edwards 347676.7 3
## ---
## 12: Somerset 345156.0 19
## 13: Veenker 350187.5 4
## 14: Northwest_Ames 306000.0 1
## 15: Old_Town 400000.0 2
## 16: Green_Hills 330000.0 1
Iterate over multiple columns
Let’s combine our knowledge of selecting multiple columns to calculate summary stats for many columns.
In dplyr
this needs the across()
function inside of summarize()
.
|>
ames summarize(
across(
.cols = contains('area'),
.fns = mean
),.by = neighborhood
)## # A tibble: 28 × 6
## neighborhood lot_area mas_vnr_area gr_liv_area garage_area pool_area
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 10040. 92.2 1292. 423. 1.99
## 2 Gilbert 11342. 39.2 1621. 454. 0
## 3 Stone_Brook 11383. 211. 1949. 626. 0
## 4 Northwest_Ames 11662. 171. 1689. 535. 8.91
## 5 Somerset 7611. 138. 1605. 620. 0
## 6 Briardale 1841. 364. 1115. 292 0
## 7 Northpark_Villa 2756. 0 1244. 421. 0
## 8 Northridge_Heights 11217. 331. 1943. 748. 0.867
## 9 Bloomington_Heights 3399. 41.6 1405. 493. 0
## 10 Northridge 12976. 417. 2481. 721. 7.82
## # ℹ 18 more rows
In data.table
, you can just use the .SD
variable again and combine that with lapply()
.
df_ames[, c(
lapply(.SD, mean, na.rm = TRUE),
n_houses = .N
),= neighborhood,
by = patterns('area')
.SDcols
]## neighborhood lot_area mas_vnr_area gr_liv_area garage_area pool_area
## <fctr> <num> <num> <num> <num> <num>
## 1: North_Ames 10039.702 92.24605 1292.054 423.2460 1.986456
## 2: Gilbert 11342.370 39.17576 1620.897 453.8000 0.000000
## 3: Stone_Brook 11383.392 211.15686 1949.216 625.7059 0.000000
## 4: Northwest_Ames 11661.695 170.78626 1688.771 534.9084 8.908397
## 5: Somerset 7610.676 137.56044 1604.830 620.3407 0.000000
## ---
## 24: Timberland 18233.542 179.87500 1714.639 604.6667 0.000000
## 25: Meadow_Village 2109.027 7.27027 1066.703 226.3784 0.000000
## 26: Veenker 15482.208 139.00000 1819.542 574.0833 33.333333
## 27: Green_Hills 9001.000 119.50000 1398.500 299.0000 0.000000
## 28: Landmark 3612.000 0.00000 1320.000 484.0000 0.000000
## 3 variable(s) not shown: [price_by_lot_area <num>, log_price_by_lot_area <num>, N <int>]
Notice that for data.table
we had to use the c()
function to comine results. Had we used a list via .()
, then results wouldn’t look quite the same.
df_ames[,
.(lapply(.SD, mean, na.rm = TRUE),
n_houses = .N
),= neighborhood,
by = patterns('area')
.SDcols
]## neighborhood V1 n_houses
## <fctr> <list> <int>
## 1: North_Ames 10039.7 443
## 2: North_Ames 92.24605 443
## 3: North_Ames 1292.054 443
## 4: North_Ames 423.246 443
## 5: North_Ames 1.986456 443
## ---
## 192: Landmark 1320 1
## 193: Landmark 484 1
## 194: Landmark 0 1
## 195: Landmark NaN 1
## 196: Landmark NaN 1
Or we could also cover, say, numeric columns. In that case we shouldn’t use .cols = contains()
and .SDcols = patterns()
. Instead we’d use .cols = where(is.numeric)
and .SDcols = is.numeric
:
|>
ames summarize(
across(
.cols = where(is.numeric),
.fns = mean
),.by = neighborhood
)## # A tibble: 28 × 35
## neighborhood lot_frontage lot_area year_built year_remod_add mas_vnr_area
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North_Ames 63.7 10040. 1960. 1971. 92.2
## 2 Gilbert 49.9 11342. 1998. 1999. 39.2
## 3 Stone_Brook 56.1 11383. 1999. 2000. 211.
## 4 Northwest_Ames 52.9 11662. 1976. 1980. 171.
## 5 Somerset 57.5 7611. 2005. 2005. 138.
## 6 Briardale 21.5 1841. 1972. 1973 364.
## 7 Northpark_Villa 25.7 2756. 1976. 1976. 0
## 8 Northridge_Heig… 82.7 11217. 2006. 2006. 331.
## 9 Bloomington_Hei… 33.5 3399. 2005. 2005. 41.6
## 10 Northridge 69.7 12976. 1995. 1997. 417.
## # ℹ 18 more rows
## # ℹ 29 more variables: bsmt_fin_sf_1 <dbl>, bsmt_fin_sf_2 <dbl>,
## # bsmt_unf_sf <dbl>, total_bsmt_sf <dbl>, first_flr_sf <dbl>,
## # second_flr_sf <dbl>, gr_liv_area <dbl>, bsmt_full_bath <dbl>,
## # bsmt_half_bath <dbl>, full_bath <dbl>, half_bath <dbl>,
## # bedroom_abv_gr <dbl>, kitchen_abv_gr <dbl>, tot_rms_abv_grd <dbl>,
## # fireplaces <dbl>, garage_cars <dbl>, garage_area <dbl>, …
In data.tables
the special variable .N
refers to the amount of rows. In this case .N
refers to the amount of all rows. In a grouped setting, .N
would refer to the group sizes.
df_ames[, c(
lapply(.SD, mean, na.rm = TRUE),
n_houses = .N
),= neighborhood,
by = is.numeric
.SDcols
]## neighborhood lot_frontage lot_area year_built year_remod_add
## <fctr> <num> <num> <num> <num>
## 1: North_Ames 63.66591 10039.702 1959.876 1970.652
## 2: Gilbert 49.92121 11342.370 1998.303 1998.994
## 3: Stone_Brook 56.07843 11383.392 1999.431 1999.725
## 4: Northwest_Ames 52.89313 11661.695 1975.588 1980.252
## 5: Somerset 57.45604 7610.676 2004.621 2004.956
## ---
## 24: Timberland 64.25000 18233.542 1995.333 1996.667
## 25: Meadow_Village 22.83784 2109.027 1972.243 1976.027
## 26: Veenker 48.00000 15482.208 1981.708 1987.750
## 27: Green_Hills 0.00000 9001.000 1992.000 1992.000
## 28: Landmark 0.00000 3612.000 1993.000 1994.000
## 34 variable(s) not shown: [mas_vnr_area <num>, bsmt_fin_sf_1 <num>, bsmt_fin_sf_2 <num>, bsmt_unf_sf <num>, total_bsmt_sf <num>, first_flr_sf <num>, second_flr_sf <num>, gr_liv_area <num>, bsmt_full_bath <num>, bsmt_half_bath <num>, ...]