```
library(tidyverse)
<- palmerpenguins::penguins |>
penguins_dat filter(!is.na(sex))
|>
penguins_dat ggplot(aes(x = body_mass_g, y = '')) +
geom_point(
size = 4, alpha = 0.75,
shape = 21,
fill = 'dodgerblue4',
color = 'black',
position = position_jitter(
height = 0.25,
seed = 2343
)+
) theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
) )
```

# Why you shouldn’t use boxplots

Box plots are a very common tool in data visualization to show how your data is distributed. But they have a crucial flaw. Let’s find out what that flaw is.

And if you’re interested in the video version of this blog post, you can find it here:

## The flaw of box plots

Imagine that you have a numeric variable, like the weight of penguins.

You might be interested in

- what’s the lowest weight of the penguin,
- what’s the highest weight of the penguin,
- what are the smallest 25% of the data,
- what are the middle 50% of the data, and
- what are the highest 25% of the data

## Code

```
<- function(quantiles) {
highlight_points |>
penguins_dat mutate(
highlight = between(
body_mass_g, 1],
quantiles[2]
quantiles[
)|>
) ggplot(aes(x = body_mass_g, y = '', fill = highlight)) +
geom_point(
size = 4, alpha = 0.75,
shape = 21,
color = 'black',
position = position_jitter(
height = 0.25,
width = 0,
seed = 2343
)+
) theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) scale_fill_manual(values = c('dodgerblue4', 'firebrick4'))
}
```

`highlight_points(quantile(penguins_dat$body_mass_g, c(0, 0)))`

`highlight_points(quantile(penguins_dat$body_mass_g, c(1, 1)))`

`highlight_points(quantile(penguins_dat$body_mass_g, c(0, 0.25)))`

`highlight_points(quantile(penguins_dat$body_mass_g, c(0.25, 0.75)))`

`highlight_points(quantile(penguins_dat$body_mass_g, c(0.75, 1)))`

A box plot can show you this. In fact, that’s exactly what it shows. But there’s a crucial flaw. Let’s check out what it does. What a box plot does is that it computes key quantities like

- the lowest value,
- the highest value,
- the point where 25% of the data are lower than that,
- the point where 50% of the data are lower than that,
- and the point where 75% of the data are lower than that.

```
quantile(penguins_dat$body_mass_g)
## 0% 25% 50% 75% 100%
## 2700 3550 4050 4775 6300
```

Once these values are calculated, the box plot can be assembled by just connecting the dots. First, you draw a line from the first to the second point which would represent the lowest 25% of the data.

```
<- ggplot() +
lower_line_plot geom_line(
aes(
x = quantile(penguins_dat$body_mass_g, c(0, 0.25)),
y = 1
)+
) theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) coord_cartesian(
xlim = range(penguins_dat$body_mass_g),
ylim = 1 + c(-0.5, 0.5)
) lower_line_plot
```

Then, you draw a box from the 25% point to the 75% point.

```
<- lower_line_plot +
lower_line_and_box geom_rect(
aes(
xmin = quantile(penguins_dat$body_mass_g, 0.25),
xmax = quantile(penguins_dat$body_mass_g, 0.75),
ymin = 1.25,
ymax = 0.75
),fill = 'white',
color = 'black'
) lower_line_and_box
```

And then you draw another line from the 75% point to the 100% point.

```
<- lower_line_and_box +
boxplot_wo_median geom_line(
aes(
x = quantile(penguins_dat$body_mass_g, c(0.75, 1)),
y = 1
)
) boxplot_wo_median
```

Finally, you can highlight the median, i.e. the 50% point, with a line inside the box.

```
+
boxplot_wo_median geom_line(
aes(
x = quantile(penguins_dat$body_mass_g, c(0.5, 0.5)),
y = c(1.25, 0.75),
linewidth = 1
) )
```

Or you do all of that in one go:

```
|>
penguins_dat ggplot(aes(x = body_mass_g, y = 1)) +
geom_boxplot() +
theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) coord_cartesian(
xlim = range(penguins_dat$body_mass_g),
ylim = 1 + c(-0.5, 0.5)
)
```

The problem with that is that the underlying data can look wildly different even if two box plots are the same. The thing is: As long as these key points (0%, 25%, …, 100%) are the same, the boxplot will look the same. So that’s how you can end up with a whole bunch of box plots that look exactly the same but the underlying data is completely different.

## Code

```
<- function(vals_tib) {
plot_comparison |>
vals_tib ggplot() +
geom_boxplot(
aes(x = 'boxplot', y = boxplot_vals),
fill = '#CC79A7',
col = 'black',
linewidth = 1
+
) geom_violin(
aes(x = 'violin', y = boxplot_vals),
fill = '#009E73',
col = 'black',
linewidth = 1
+
) coord_cartesian(ylim = c(0, 100)) +
theme_minimal(
base_size = 15,
base_family = 'Source Sans Pro'
+
) labs(
x = element_blank(),
y = element_blank(),
title = 'Different Data, Same Boxplot',
subtitle = 'Boxplots can look exactly the same, even when the distribution is wildly different.'
+
) theme(
plot.title = element_text(
family = 'Merriweather',
face = 'bold',
size = rel(2)
),plot.title.position = 'plot',
panel.grid.minor = element_blank()
)
}<- function(x, sample_size = 25) {
runif_boxplot_vals tibble(
seq = x,
boxplot_vals = c(
0,
runif(sample_size, min = 0, max = 25),
25,
runif(sample_size, min = 25, max = 50),
50,
runif(sample_size, min = 50, max = 75),
75,
runif(sample_size, min = 75, max = 100),
100
)
)
}
set.seed(5345234)
library(gganimate)
<- map2_dfr(
sim_dat c(1, 3, 5, 7),
c(25, 25, 25, 1000),
runif_boxplot_vals|>
) bind_rows(
tibble(
seq = 2,
boxplot_vals = c(
0,
rep(12.5, 100),
25,
rep(75 / 2, 100),
50,
rep(125 / 2, 100),
75,
rep(175 / 2, 100),
100
)
)|>
) bind_rows(
tibble(
seq = 4,
boxplot_vals = c(
0,
pmin(rexp(100, rate = 1 / 12), 25),
25,
25 + pmin(rexp(100, rate = 1 / 12), 25),
50,
50 +pmin(rexp(100, rate = 1 / 12), 25),
75,
75 + pmin(rexp(100, rate = 1 / 12), 25),
100
)
)|>
) bind_rows(
tibble(
seq = 6,
boxplot_vals = c(
0,
25 - pmin(rexp(100, rate = 12), 25),
25,
50 - pmin(rexp(100, rate = 12), 25),
50,
75 - pmin(rexp(100, rate = 1 / 12), 25),
75,
100 - pmin(rexp(100, rate = 1 / 12), 25),
100
)
)
)
<- sim_dat |>
anim ggplot() +
geom_boxplot(
aes(x = 'boxplot', y = boxplot_vals),
fill = '#CC79A7',
col = 'black',
linewidth = 1
+
) geom_violin(
aes(x = 'violin', y = boxplot_vals),
fill = '#009E73',
col = 'black',
linewidth = 1
+
) ::geom_sina(
ggforceaes(x = 'violin', y = boxplot_vals, size = as.factor(seq)),
alpha = 0.5
+
) scale_size_manual(values = c(3, 3, 3, 3, 3, 3, 1)) +
coord_cartesian(ylim = c(0, 100)) +
theme_minimal(
base_size = 15,
base_family = 'Source Sans Pro'
+
) labs(
x = element_blank(),
y = element_blank(),
title = 'Different Data, Same Boxplot',
subtitle = 'Boxplots can look exactly the same (even when the underlying data is wildly different.)'
+
) theme(
plot.title = element_text(
family = 'Merriweather',
face = 'bold',
size = rel(2)
),plot.title.position = 'plot',
panel.grid.minor = element_blank(),
legend.position = 'none'
+
) transition_states(
seq, transition_length = 1,
state_length = 5
+
) enter_fade() +
exit_fade()
animate(
anim, nframes = 200,
fps = 30,
renderer = gifski_renderer(),
width = 900,
height = 600,
res = 100,
units = 'px',
quality = 90
)
```

Think about it. The way to compute these key points is very simple:

- You just sort all your data that you have in ascending order.
- Assuming that you have approximately 100 points, all you have to do to get these points is to just take the first, the 25th, the 50th, the 75th, and the 100th points.

All the other data in between these points does not matter for the box plot. The data points can vary as much as they like as long as they stay within the range of the two key points that they are put in between.

So that’s why it’s generally not a good idea to solely rely on box plots. Instead, you can additionally use something like violin plots that try to show the underlying distribution and not just the key quantities.

```
|>
penguins_dat ggplot(aes(x = body_mass_g, y = 1)) +
geom_violin(fill = 'dodgerblue4') +
geom_boxplot(width = 0.25, color = 'black', linewidth = 1) +
theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) coord_cartesian(
xlim = range(penguins_dat$body_mass_g),
ylim = 1 + c(-0.5, 0.5)
)
```

Or you could even go further and go for a rain cloud plot that combines box plots and violin plots with another histogram that shows the data more explicitly. And the way they are set up, they look like rain clouds, hence the name.

```
|>
penguins_dat ggplot(aes(x = body_mass_g, y = 1)) +
::stat_halfeye(fill = 'dodgerblue4') +
ggdist::stat_dots(
ggdistaes(y = 0.8),
fill = 'dodgerblue4',
color = 'black',
side = 'bottom'
+
) geom_boxplot(width = 0.25, color = 'black', linewidth = 1) +
theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) coord_cartesian(
xlim = range(penguins_dat$body_mass_g)
)
```

Or you can plot the points explicitly instead of using a histogram.

```
|>
penguins_dat ggplot(aes(x = body_mass_g, y = 1)) +
::stat_halfeye(fill = 'dodgerblue4') +
ggdistgeom_point(
aes(y = 0.75),
size = 4, alpha = 0.75,
shape = 21,
fill = 'dodgerblue4',
color = 'black',
position = position_jitter(
height = 0.1,
seed = 2343
)+
) geom_boxplot(width = 0.25, color = 'black', linewidth = 1) +
theme_minimal(
base_size = 18,
base_family = 'Source Sans Pro'
+
) labs(
x = 'Weight (in g)',
y = element_blank(),
title = 'Distribution of Penguin Weights'
+
) theme(
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(
size = rel(1.5),
family = 'Merriweather',
face = 'bold'
),legend.position = 'none'
+
) coord_cartesian(
xlim = range(penguins_dat$body_mass_g)
)
```

## Conclusion

I hope you’ve enjoyed this short blog post. Have a great day and see you next time. And if you found this helpful, here are some other ways I can help you:

- 3 Minute Wednesdays: A weekly newsletter with bite-sized tips and tricks for R users
- Insightful Data Visualizations for “Uncreative” R Users: A course that teaches you how to leverage
`{ggplot2}`

to make charts that communicate effectively without being a design expert.