library(tidyverse)
library(rvest)
<- 'https://de.wikipedia.org/wiki/Land_(Deutschland)#Rahmendaten_der_L%C3%A4nder'
url_wiki
<- read_html(url_wiki) |>
html_tables html_table(dec = ',')
glimpse(html_tables)
## List of 6
## $ : tibble [17 × 14] (S3: tbl_df/tbl/data.frame)
## ..$ Wappen : logi [1:17] NA NA NA NA NA NA ...
## ..$ Land : chr [1:17] "Baden-Württemberg" "Bayern" "Berlin" "Brandenburg" ...
## ..$ Abk. : chr [1:17] "BW" "BY" "BE" "BB" ...
## ..$ Hauptstadt : chr [1:17] "Stuttgart" "München" "—" "Potsdam" ...
## ..$ bevölkerungs-reichste Stadt.mw-parser-output .fussnoten-marke{font-size:0.75rem;font-style:normal;font-variant:normal;font-weight:normal;unicode-bidi:isolate;white-space:nowrap}.mw-parser-output .fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt{padding-left:0.1rem}.mw-parser-output .fussnoten-marke.reference~.fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt~span.fussnoten-inhalt{padding-left:0.15rem}.mw-parser-output .fussnoten-block{margin-bottom:0.1rem}.mw-parser-output div.fussnoten-inhalt{display:inline-block;padding-left:0.8rem;text-indent:-0.8rem}.mw-parser-output div.fussnoten-inhalt p,.mw-parser-output div.fussnoten-inhalt dl,.mw-parser-output div.fussnoten-inhalt ol,.mw-parser-output div.fussnoten-inhalt ul{text-indent:0}.mw-parser-output div.fussnoten-inhalt.fussnoten-floatfix{display:block}.mw-parser-output .fussnoten-box{margin-top:0.5rem;padding-left:0.8rem}.mw-parser-output .fussnoten-box,.mw-parser-output div.fussnoten-inhalt{font-size:94%}.mw-parser-output .fussnoten-box div.fussnoten-inhalt,.mw-parser-output span.fussnoten-inhalt,.mw-parser-output .fussnoten-inhalt .reference-text{font-size:inherit}.mw-parser-output .fussnoten-inhalt .reference-text{display:inline}.mw-parser-output .fussnoten-linie{display:inline-block;position:relative;top:-1em;border-top:solid 1px #808080;width:8rem}.mw-parser-output .fussnoten-linie+p,.mw-parser-output .fussnoten-linie+dl,.mw-parser-output .fussnoten-linie+ol,.mw-parser-output .fussnoten-linie+ul,.mw-parser-output .fussnoten-linie+link+div{margin-top:-1em}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target{background-color:#eaf3ff;box-shadow:0 0 0 0.25em #eaf3ff}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target .fussnoten-marke{font-weight:bold} a: chr [1:17] "Stuttgart" "München" "—" "Potsdam" ...
## ..$ Beitrittzum Bund : chr [1:17] "1949/52[15]" "1949" "1990[16]" "1990" ...
## ..$ Regierungs-chef : chr [1:17] "Winfried Kretschmann (Grüne)" "Markus Söder (CSU)" "Kai Wegner (CDU)" "Dietmar Woidke (SPD)" ...
## ..$ Regierungs-partei(en) : chr [1:17] "Grüne und CDU" "CSU und Freie Wähler" "CDU und SPD" "SPD und BSW" ...
## ..$ Bundesrats-stimmen : chr [1:17] "6" "6" "4" "4" ...
## ..$ Fläche(km²)[13] : chr [1:17] "35.748" "70.542" "891" "29.654" ...
## ..$ Ein-wohner(Mio.)[13] : num [1:17] 11.28 13.369 3.755 2.573 0.685 ...
## ..$ Ein-wohnerje km²[13] : chr [1:17] "316" "190" "4.210" "87" ...
## ..$ Ausländer(%)[14] : num [1:17] 16.1 13.7 19.6 5.2 19 16.8 16.9 4.8 9.9 13.8 ...
## ..$ Sprachen : chr [1:17] "Deutsch" "Deutsch" "Deutsch" "Deutsch, Niedersorbisch, Niederdeutsch" ...
## $ : tibble [18 × 10] (S3: tbl_df/tbl/data.frame)
## ..$ Wappen : logi [1:18] NA NA NA NA NA NA ...
## ..$ Land : chr [1:18] "BW" "BY" "BE" "BB" ...
## ..$ BIP (2018)in Mrd. €[17] : chr [1:18] "511,4" "625,2" "147,1" "73,7" ...
## ..$ Pro Kopf (2018)in €[17] : chr [1:18] "46.279" "47.946" "40.568" "29.411" ...
## ..$ EK/Kin €[18] : chr [1:18] "19.261" "18.775" "14.797" "14.634" ...
## ..$ Schulden (2012)in Mrd. €[19] : chr [1:18] "67,471" "42,794" "61,220" "21,336" ...
## ..$ Pro Kopf (2012)in €[20] : chr [1:18] "6.255" "3.397" "17.482" "8.549" ...
## ..$ Schulden (31. Dezember 2018)in Mrd. €[21]: chr [1:18] "44,009" "14,613" "54,403" "16,122" ...
## ..$ Pro Kopf (31. Dezember 2018)in €[21] : chr [1:18] "3.976" "1.117" "14.926" "6.418" ...
## ..$ AQ[22] : chr [1:18] "3,9" "3,8" "9,8" "8,2" ...
## $ : tibble [4 × 4] (S3: tbl_df/tbl/data.frame)
## ..$ X1: chr [1:4] "Land Baden-WürttembergSeitenverhältnis: 3:5" "Freie Hansestadt Bremen2:3" "Land Niedersachsen2:3" "Freistaat Sachsen3:5"
## ..$ X2: chr [1:4] "Freistaat Bayern3:5" "Freie und Hansestadt Hamburg2:3" "Land Nordrhein-Westfalen3:5" "Land Sachsen-Anhalt3:5"
## ..$ X3: chr [1:4] "Land Berlin3:5" "Land Hessen3:5" "Land Rheinland-Pfalz2:3" "Land Schleswig-Holstein3:5"
## ..$ X4: chr [1:4] "Land Brandenburg3:5" "Land Mecklenburg-Vorpommern3:5" "Saarland3:5" "Freistaat Thüringen1:2"
## $ : tibble [8 × 3] (S3: tbl_df/tbl/data.frame)
## ..$ Fläche in km² : chr [1:8] ">250.000" "50.000–100.000" "20.000–50.000" "10.000–20.000" ...
## ..$ WeimarerRepublik : int [1:8] 1 1 0 5 2 3 3 4
## ..$ BundesrepublikDeutschland: int [1:8] 0 1 7 4 0 1 0 3
## $ : tibble [8 × 3] (S3: tbl_df/tbl/data.frame)
## ..$ Einwohner (1925 bzw. 2018): chr [1:8] ">30.000.000" "10.000.000–20.000.000" "5.000.000–10.000.000" "2.000.000–5.000.000" ...
## ..$ WeimarerRepublik : int [1:8] 1 0 1 3 3 4 5 2
## ..$ BundesrepublikDeutschland : int [1:8] 0 3 2 7 2 2 0 0
## $ : tibble [2 × 3] (S3: tbl_df/tbl/data.frame)
## ..$ X1: chr [1:2] "Aktuelle Länder:" "Ehemalige Länder:"
## ..$ X2: chr [1:2] "Baden-Württemberg Baden-Württemberg | Bayern Bayern | Berlin Berlin | Brandenburg Brandenburg | Bremen Bremen |"| __truncated__ "(Süd-)Baden | Württemberg-Baden | Württemberg-Hohenzollern"
## ..$ X3: logi [1:2] NA NA
Creating fancy interactive tables using Internet data with rvest
and reactable
{rvest}
and then I’ll show you how to create an elaborate table with {reactable}
This blog post will be a long one as we’ll create this fancy interactive table:
Code
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reactable)
library(reactablefmtr)
##
## Attaching package: 'reactablefmtr'
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(htmltools)
library(ggiraph)
<- '#104E8B'
main_color read_rds(
::here('posts/29_reactable_germany/final_table.rds')
here|>
) browsable()
To do so, we’ll proceed in two steps:
- We’ll web-scrape the data for this table from the Internet using
{rvest}
. - We’ll put everything into a nice table using
{reactable}
.
As always, you can find the video version of this blog post on YouTube:
Scraping & cleaning tables and texts from the Internet
In this first part, we’re going to learn how to get the source data for our table. After that’s done, we’ll wrap that data into a nice package
Scraping a table from Wikipedia
Extracting tables from Wikipedia is really easy. All you have to do is to pass a link to read_html()
and html_table()
from the {rvest}
package. The only tricky thing: Make sure that you specify what decimal number indicator the website uses. On English sites this will likely be .
(and that’s the default auf html_table()
) but for German information, we will need ,
.
Notice that this gives you a list of tibbles that were on the website. The conversion to tables isn’t perfect but it’s a great start. Let’s get the first table from that and check out the column names.
colnames(html_tables[[1]])
## [1] "Wappen"
## [2] "Land"
## [3] "Abk."
## [4] "Hauptstadt"
## [5] "bevölkerungs-reichste Stadt.mw-parser-output .fussnoten-marke{font-size:0.75rem;font-style:normal;font-variant:normal;font-weight:normal;unicode-bidi:isolate;white-space:nowrap}.mw-parser-output .fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt{padding-left:0.1rem}.mw-parser-output .fussnoten-marke.reference~.fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt~span.fussnoten-inhalt{padding-left:0.15rem}.mw-parser-output .fussnoten-block{margin-bottom:0.1rem}.mw-parser-output div.fussnoten-inhalt{display:inline-block;padding-left:0.8rem;text-indent:-0.8rem}.mw-parser-output div.fussnoten-inhalt p,.mw-parser-output div.fussnoten-inhalt dl,.mw-parser-output div.fussnoten-inhalt ol,.mw-parser-output div.fussnoten-inhalt ul{text-indent:0}.mw-parser-output div.fussnoten-inhalt.fussnoten-floatfix{display:block}.mw-parser-output .fussnoten-box{margin-top:0.5rem;padding-left:0.8rem}.mw-parser-output .fussnoten-box,.mw-parser-output div.fussnoten-inhalt{font-size:94%}.mw-parser-output .fussnoten-box div.fussnoten-inhalt,.mw-parser-output span.fussnoten-inhalt,.mw-parser-output .fussnoten-inhalt .reference-text{font-size:inherit}.mw-parser-output .fussnoten-inhalt .reference-text{display:inline}.mw-parser-output .fussnoten-linie{display:inline-block;position:relative;top:-1em;border-top:solid 1px #808080;width:8rem}.mw-parser-output .fussnoten-linie+p,.mw-parser-output .fussnoten-linie+dl,.mw-parser-output .fussnoten-linie+ol,.mw-parser-output .fussnoten-linie+ul,.mw-parser-output .fussnoten-linie+link+div{margin-top:-1em}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target{background-color:#eaf3ff;box-shadow:0 0 0 0.25em #eaf3ff}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target .fussnoten-marke{font-weight:bold} a"
## [6] "Beitrittzum Bund"
## [7] "Regierungs-chef"
## [8] "Regierungs-partei(en)"
## [9] "Bundesrats-stimmen"
## [10] "Fläche(km²)[13]"
## [11] "Ein-wohner(Mio.)[13]"
## [12] "Ein-wohnerje km²[13]"
## [13] "Ausländer(%)[14]"
## [14] "Sprachen"
Notice that the first column labels may look right in the console. But when you copy them into a script file, you will notice that there are dumb invisible special characters. So with that in mind, let us simply select the columns that are of interest to us with numbers instead of the column names.
<- html_tables[[1]] |>
selected_state_dat select(
state = 2,
capital = 4,
most_populous_city = 5,
federal_assembly_votes = 9,
area_km2 = 10,
pop_million = 11,
pop_per_km2 = 12
)
selected_state_dat## # A tibble: 17 × 7
## state capital most_populous_city federal_assembly_votes area_km2 pop_million
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Baden… Stuttg… Stuttgart 6 35.748 11.3
## 2 Bayern München München 6 70.542 13.4
## 3 Berlin — — 4 891 3.76
## 4 Brand… Potsdam Potsdam 4 29.654 2.57
## 5 Bremen Bremen… Bremen 3 420 0.685
## 6 Hambu… — — 3 755 1.89
## 7 Hessen Wiesba… Frankfurt am Main 5 21.116 6.39
## 8 Meckl… Schwer… Rostock 3 23.295 1.63
## 9 Niede… Hannov… Hannover 6 47.710 8.14
## 10 Nordr… Düssel… Köln 6 34.112 18.1
## 11 Rhein… Mainz Mainz 4 19.858 4.16
## 12 Saarl… Saarbr… Saarbrücken 3 2.571 0.993
## 13 Sachs… Dresden Leipzig 4 18.450 4.09
## 14 Sachs… Magdeb… Halle (Saale) 4 20.459 2.19
## 15 Schle… Kiel Kiel 4 15.804 2.95
## 16 Thüri… Erfurt Erfurt 4 16.202 2.13
## 17 Bunde… Berlin Berlin b – 357.588 84.4
## # ℹ 1 more variable: pop_per_km2 <chr>
Parsing texts to numbers
Notice how the columns area_km2
, federal_assembly_votes
and pop_per_km2
are encoded as characters instead of as numbers. The reason for that is that in those columns there are always characters that R doesn’t know how to handle. For one, German numbers that are written as 4.001
mean 4,001
in English terminology. Also, in the last row of the federal_assembly_votes
column there’s still a b
character which was a footnote from Wikipedia. Let’s clean all of this up.
<- locale(
german_locale decimal_mark = ',',
grouping_mark = '.'
)|>
selected_state_dat mutate(
federal_assembly_votes = parse_number(
federal_assembly_votes,na = selected_state_dat$federal_assembly_votes[[17]],
locale = german_locale
),federal_assembly_votes = if_else(
is.na(federal_assembly_votes),
sum(federal_assembly_votes, na.rm = TRUE),
federal_assembly_votes
),across(
c(area_km2, pop_per_km2),
parse_number(
\(x)
x,locale = german_locale
)
)
)## # A tibble: 17 × 7
## state capital most_populous_city federal_assembly_votes area_km2 pop_million
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Baden… Stuttg… Stuttgart 6 35748 11.3
## 2 Bayern München München 6 70542 13.4
## 3 Berlin — — 4 891 3.76
## 4 Brand… Potsdam Potsdam 4 29654 2.57
## 5 Bremen Bremen… Bremen 3 420 0.685
## 6 Hambu… — — 3 755 1.89
## 7 Hessen Wiesba… Frankfurt am Main 5 21116 6.39
## 8 Meckl… Schwer… Rostock 3 23295 1.63
## 9 Niede… Hannov… Hannover 6 47710 8.14
## 10 Nordr… Düssel… Köln 6 34112 18.1
## 11 Rhein… Mainz Mainz 4 19858 4.16
## 12 Saarl… Saarbr… Saarbrücken 3 2571 0.993
## 13 Sachs… Dresden Leipzig 4 18450 4.09
## 14 Sachs… Magdeb… Halle (Saale) 4 20459 2.19
## 15 Schle… Kiel Kiel 4 15804 2.95
## 16 Thüri… Erfurt Erfurt 4 16202 2.13
## 17 Bunde… Berlin Berlin 69 357588 84.4
## # ℹ 1 more variable: pop_per_km2 <dbl>
Finally, we can also clean up the text in the capital
column where we transform "Bremen (de facto)"
to "Bremen"
. And we can transform "Bundesrepublik Deutschland"
to "Deutschland"
in the state
column. This will let us join nicely with our second data set that we’re going to scrape from the interwebs.
<- selected_state_dat |>
cleaned_state_dat mutate(
federal_assembly_votes = parse_number(
federal_assembly_votes,na = selected_state_dat$federal_assembly_votes[[17]],
locale = german_locale
),federal_assembly_votes = if_else(
is.na(federal_assembly_votes),
sum(federal_assembly_votes, na.rm = TRUE),
federal_assembly_votes
),across(
c(area_km2, pop_per_km2),
parse_number(
\(x)
x,locale = german_locale
)
),state = if_else(
== "Bundesrepublik Deutschland",
state "Deutschland",
state
),capital = if_else(
str_detect(capital, 'Bremen'),
'Bremen',
capital
)
)
|>
cleaned_state_dat glimpse()
## Rows: 17
## Columns: 7
## $ state <chr> "Baden-Württemberg", "Bayern", "Berlin", "Brand…
## $ capital <chr> "Stuttgart", "München", "—", "Potsdam", "Bremen…
## $ most_populous_city <chr> "Stuttgart", "München", "—", "Potsdam", "Bremen…
## $ federal_assembly_votes <dbl> 6, 6, 4, 4, 3, 3, 5, 3, 6, 6, 4, 3, 4, 4, 4, 4,…
## $ area_km2 <dbl> 35748, 70542, 891, 29654, 420, 755, 21116, 2329…
## $ pop_million <dbl> 11.280, 13.369, 3.755, 2.573, 0.685, 1.892, 6.3…
## $ pop_per_km2 <dbl> 316, 190, 4210, 87, 1633, 2506, 303, 70, 171, 5…
Scraping data from Statistikportal
Alright, now let’s get GDP data from the website Statistikportal.de
<- 'https://www.statistikportal.de/de/ugrdl/ergebnisse/wirtschaft-und-bevoelkerung/bipbws'
url_gdp
read_html(url_gdp) |>
html_table(dec = ',') |>
glimpse()
## List of 3
## $ : tibble [19 × 2] (S3: tbl_df/tbl/data.frame)
## ..$ Bruttoinlandsprodukt in jeweiligen Preisen 2022 nach Bundesländern: chr [1:19] "Land" "Land" "Baden-\n\t\t\tWürttemberg" "Bayern" ...
## ..$ Bruttoinlandsprodukt in jeweiligen Preisen 2022 nach Bundesländern: chr [1:19] "2022" "Mill. Euro" "538 948" "666 388" ...
## $ : tibble [19 × 11] (S3: tbl_df/tbl/data.frame)
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "Land" "Land" "Baden-Württemberg" "Bayern" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "1991" "Index (2015 = 100)" "72,3" "65,1" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2000" "Index (2015 = 100)" "80,7" "77,4" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2010" "Index (2015 = 100)" "89,4" "88,4" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2015" "Index (2015 = 100)" "100" "100" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2017" "Index (2015 = 100)" "104,7" "106,3" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2018" "Index (2015 = 100)" "107,0" "106,7" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2019" "Index (2015 = 100)" "106,6" "108,6" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2020" "Index (2015 = 100)" "101,5" "104,6" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2021" "Index (2015 = 100)" "104,8" "107,5" ...
## ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2022" "Index (2015 = 100)" "106,2" "109,8" ...
## $ : tibble [21 × 7] (S3: tbl_df/tbl/data.frame)
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Land" "Land" "Land" "Land" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Mill. EUR" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Land- und\n\t\t\tForstwirtschaft,\n\t\t\tFischerei\n\t\t\t(A)" "Land- und\n\t\t\tForstwirtschaft,\n\t\t\tFischerei\n\t\t\t(A)" "Mill. EUR" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Produzierendes\n\t\t\tGewerbe\n\t\t\t(B-F)" "Produzierendes\n\t\t\tGewerbe\n\t\t\t(B-F)" "Mill. EUR" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "darunter" "Verarbeitendes\n\t\t\tGewerbe\n\t\t\t(C)" "Mill. EUR" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "darunter" "Baugewerbe\n\t\t\t(F)" "Mill. EUR" ...
## ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Dienstleistungs-\n\t\t\tbereiche\n\t\t\t(G-T)" "Dienstleistungs-\n\t\t\tbereiche\n\t\t\t(G-T)" "Mill. EUR" ...
In this case, we see that column names are always the same because that table has a merged cell on top. So let’s avoid that by telling html_table()
to avoid headers. And then let’s grab the second table from the list of extracted tables.
<- read_html(url_gdp) |>
html_tables html_table(dec = ',', header = FALSE)
2]]
html_tables[[## # A tibble: 20 × 11
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Bruttoinlandspro… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut…
## 2 Land 1991 2000 2010 2015 2017 2018 2019 2020 2021 2022
## 3 Land Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde…
## 4 Baden-Württemberg 72,3 80,7 89,4 100 104,7 107,0 106,6 101,5 104,8 106,2
## 5 Bayern 65,1 77,4 88,4 100 106,3 106,7 108,6 104,6 107,5 109,8
## 6 Berlin 77,0 83,7 90,3 100 109,6 113,5 116,8 114,2 117,8 123,6
## 7 Brandenburg 50,2 84,6 93,1 100 104,7 105,2 107,0 104,6 107,2 110,7
## 8 Bremen 86,5 89,1 94,5 100 103,3 103,0 101,5 96,4 102,3 107,5
## 9 Hamburg 76,5 85,8 94,6 100 104,0 103,8 107,1 102,0 105,8 110,5
## 10 Hessen 80,5 91,8 94,8 100 105,0 105,5 107,1 102,0 104,5 106,2
## 11 Mecklenburg-Vorp… 59,2 89,7 94,8 100 105,8 103,8 108,3 104,7 107,2 107,4
## 12 Niedersachsen 78,1 85,1 93,6 100 106,9 108,3 110,6 106,2 107,0 108,2
## 13 Nordrhein-Westfa… 81,6 88,6 94,2 100 103,7 105,1 105,1 101,8 103,6 104,7
## 14 Rheinland-Pfalz 79,5 85,1 91,7 100 102,5 102,6 103,1 99,5 108,2 107,9
## 15 Saarland 84,2 90,8 96,2 100 101,4 100,8 98,8 94,0 95,2 96,8
## 16 Sachsen 51,2 80,5 90,8 100 104,1 104,9 106,4 102,7 104,6 107,4
## 17 Sachsen-Anhalt 59,5 91,2 97,2 100 102,6 102,1 103,7 101,3 103,6 106,3
## 18 Schleswig-Holste… 81,3 88,7 93,1 100 105,2 105,7 108,2 106,4 107,7 109,1
## 19 Thüringen 47,9 83,1 90,5 100 103,3 103,0 102,9 99,8 101,8 103,3
## 20 Deutschland 73,3 84,5 92,0 100 105,0 106,0 107,1 103,2 105,9 107,8
Alright, let’s get rid of the 1991, 2000 and 2010 column. I only want to start at 2015.
2]] |>
html_tables[[select(-(2:4))
## # A tibble: 20 × 8
## X1 X5 X6 X7 X8 X9 X10 X11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Bruttoinlandsprodukt (preisbereini… Brut… Brut… Brut… Brut… Brut… Brut… Brut…
## 2 Land 2015 2017 2018 2019 2020 2021 2022
## 3 Land Inde… Inde… Inde… Inde… Inde… Inde… Inde…
## 4 Baden-Württemberg 100 104,7 107,0 106,6 101,5 104,8 106,2
## 5 Bayern 100 106,3 106,7 108,6 104,6 107,5 109,8
## 6 Berlin 100 109,6 113,5 116,8 114,2 117,8 123,6
## 7 Brandenburg 100 104,7 105,2 107,0 104,6 107,2 110,7
## 8 Bremen 100 103,3 103,0 101,5 96,4 102,3 107,5
## 9 Hamburg 100 104,0 103,8 107,1 102,0 105,8 110,5
## 10 Hessen 100 105,0 105,5 107,1 102,0 104,5 106,2
## 11 Mecklenburg-Vorpommern 100 105,8 103,8 108,3 104,7 107,2 107,4
## 12 Niedersachsen 100 106,9 108,3 110,6 106,2 107,0 108,2
## 13 Nordrhein-Westfalen 100 103,7 105,1 105,1 101,8 103,6 104,7
## 14 Rheinland-Pfalz 100 102,5 102,6 103,1 99,5 108,2 107,9
## 15 Saarland 100 101,4 100,8 98,8 94,0 95,2 96,8
## 16 Sachsen 100 104,1 104,9 106,4 102,7 104,6 107,4
## 17 Sachsen-Anhalt 100 102,6 102,1 103,7 101,3 103,6 106,3
## 18 Schleswig-Holstein 100 105,2 105,7 108,2 106,4 107,7 109,1
## 19 Thüringen 100 103,3 103,0 102,9 99,8 101,8 103,3
## 20 Deutschland 100 105,0 106,0 107,1 103,2 105,9 107,8
Now, notice that the column names that we actually want are in the second row. The {janitor}
package has us covered there.
2]] |>
html_tables[[select(-(2:4)) |>
::row_to_names(row_number = 2)
janitor## # A tibble: 18 × 8
## Land `2015` `2017` `2018` `2019` `2020` `2021` `2022`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Land Index (2015… Index… Index… Index… Index… Index… Index…
## 2 Baden-Württemberg 100 104,7 107,0 106,6 101,5 104,8 106,2
## 3 Bayern 100 106,3 106,7 108,6 104,6 107,5 109,8
## 4 Berlin 100 109,6 113,5 116,8 114,2 117,8 123,6
## 5 Brandenburg 100 104,7 105,2 107,0 104,6 107,2 110,7
## 6 Bremen 100 103,3 103,0 101,5 96,4 102,3 107,5
## 7 Hamburg 100 104,0 103,8 107,1 102,0 105,8 110,5
## 8 Hessen 100 105,0 105,5 107,1 102,0 104,5 106,2
## 9 Mecklenburg-Vorpommern 100 105,8 103,8 108,3 104,7 107,2 107,4
## 10 Niedersachsen 100 106,9 108,3 110,6 106,2 107,0 108,2
## 11 Nordrhein-Westfalen 100 103,7 105,1 105,1 101,8 103,6 104,7
## 12 Rheinland-Pfalz 100 102,5 102,6 103,1 99,5 108,2 107,9
## 13 Saarland 100 101,4 100,8 98,8 94,0 95,2 96,8
## 14 Sachsen 100 104,1 104,9 106,4 102,7 104,6 107,4
## 15 Sachsen-Anhalt 100 102,6 102,1 103,7 101,3 103,6 106,3
## 16 Schleswig-Holstein 100 105,2 105,7 108,2 106,4 107,7 109,1
## 17 Thüringen 100 103,3 103,0 102,9 99,8 101,8 103,3
## 18 Deutschland 100 105,0 106,0 107,1 103,2 105,9 107,8
This leaves us an extra first row. So let’s remove that.
2]] |>
html_tables[[select(-(2:4)) |>
::row_to_names(row_number = 2) |>
janitorslice(-1)
## # A tibble: 17 × 8
## Land `2015` `2017` `2018` `2019` `2020` `2021` `2022`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Baden-Württemberg 100 104,7 107,0 106,6 101,5 104,8 106,2
## 2 Bayern 100 106,3 106,7 108,6 104,6 107,5 109,8
## 3 Berlin 100 109,6 113,5 116,8 114,2 117,8 123,6
## 4 Brandenburg 100 104,7 105,2 107,0 104,6 107,2 110,7
## 5 Bremen 100 103,3 103,0 101,5 96,4 102,3 107,5
## 6 Hamburg 100 104,0 103,8 107,1 102,0 105,8 110,5
## 7 Hessen 100 105,0 105,5 107,1 102,0 104,5 106,2
## 8 Mecklenburg-Vorpommern 100 105,8 103,8 108,3 104,7 107,2 107,4
## 9 Niedersachsen 100 106,9 108,3 110,6 106,2 107,0 108,2
## 10 Nordrhein-Westfalen 100 103,7 105,1 105,1 101,8 103,6 104,7
## 11 Rheinland-Pfalz 100 102,5 102,6 103,1 99,5 108,2 107,9
## 12 Saarland 100 101,4 100,8 98,8 94,0 95,2 96,8
## 13 Sachsen 100 104,1 104,9 106,4 102,7 104,6 107,4
## 14 Sachsen-Anhalt 100 102,6 102,1 103,7 101,3 103,6 106,3
## 15 Schleswig-Holstein 100 105,2 105,7 108,2 106,4 107,7 109,1
## 16 Thüringen 100 103,3 103,0 102,9 99,8 101,8 103,3
## 17 Deutschland 100 105,0 106,0 107,1 103,2 105,9 107,8
Did you notice that the 2016 data is missing? Maybe you did. Maybe not. But let’s make this really obvious.
2]] |>
html_tables[[select(-(2:4)) |>
::row_to_names(row_number = 2) |>
janitorslice(-1) |>
mutate(
`2016` = NA_character_,
.before = 3
) ## # A tibble: 17 × 9
## Land `2015` `2016` `2017` `2018` `2019` `2020` `2021` `2022`
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Baden-Württemberg 100 <NA> 104,7 107,0 106,6 101,5 104,8 106,2
## 2 Bayern 100 <NA> 106,3 106,7 108,6 104,6 107,5 109,8
## 3 Berlin 100 <NA> 109,6 113,5 116,8 114,2 117,8 123,6
## 4 Brandenburg 100 <NA> 104,7 105,2 107,0 104,6 107,2 110,7
## 5 Bremen 100 <NA> 103,3 103,0 101,5 96,4 102,3 107,5
## 6 Hamburg 100 <NA> 104,0 103,8 107,1 102,0 105,8 110,5
## 7 Hessen 100 <NA> 105,0 105,5 107,1 102,0 104,5 106,2
## 8 Mecklenburg-Vorpomme… 100 <NA> 105,8 103,8 108,3 104,7 107,2 107,4
## 9 Niedersachsen 100 <NA> 106,9 108,3 110,6 106,2 107,0 108,2
## 10 Nordrhein-Westfalen 100 <NA> 103,7 105,1 105,1 101,8 103,6 104,7
## 11 Rheinland-Pfalz 100 <NA> 102,5 102,6 103,1 99,5 108,2 107,9
## 12 Saarland 100 <NA> 101,4 100,8 98,8 94,0 95,2 96,8
## 13 Sachsen 100 <NA> 104,1 104,9 106,4 102,7 104,6 107,4
## 14 Sachsen-Anhalt 100 <NA> 102,6 102,1 103,7 101,3 103,6 106,3
## 15 Schleswig-Holstein 100 <NA> 105,2 105,7 108,2 106,4 107,7 109,1
## 16 Thüringen 100 <NA> 103,3 103,0 102,9 99,8 101,8 103,3
## 17 Deutschland 100 <NA> 105,0 106,0 107,1 103,2 105,9 107,8
Cool, everything is clear to us now. But not necessarily to R. Notice how all columns are encoded as characters. R doesn’t know that most columns should be numbers. That’s not great for calculations. So let’s fix that.
2]] |>
html_tables[[select(-(2:4)) |>
::row_to_names(row_number = 2) |>
janitorslice(-1) |>
mutate(
`2016` = NA_character_,
across(
-Land,
parse_number(x, locale = german_locale)
\(x)
),.before = 3
)## # A tibble: 17 × 9
## Land `2015` `2016` `2017` `2018` `2019` `2020` `2021` `2022`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Baden-Württemberg 100 NA 105. 107 107. 102. 105. 106.
## 2 Bayern 100 NA 106. 107. 109. 105. 108. 110.
## 3 Berlin 100 NA 110. 114. 117. 114. 118. 124.
## 4 Brandenburg 100 NA 105. 105. 107 105. 107. 111.
## 5 Bremen 100 NA 103. 103 102. 96.4 102. 108.
## 6 Hamburg 100 NA 104 104. 107. 102 106. 110.
## 7 Hessen 100 NA 105 106. 107. 102 104. 106.
## 8 Mecklenburg-Vorpomme… 100 NA 106. 104. 108. 105. 107. 107.
## 9 Niedersachsen 100 NA 107. 108. 111. 106. 107 108.
## 10 Nordrhein-Westfalen 100 NA 104. 105. 105. 102. 104. 105.
## 11 Rheinland-Pfalz 100 NA 102. 103. 103. 99.5 108. 108.
## 12 Saarland 100 NA 101. 101. 98.8 94 95.2 96.8
## 13 Sachsen 100 NA 104. 105. 106. 103. 105. 107.
## 14 Sachsen-Anhalt 100 NA 103. 102. 104. 101. 104. 106.
## 15 Schleswig-Holstein 100 NA 105. 106. 108. 106. 108. 109.
## 16 Thüringen 100 NA 103. 103 103. 99.8 102. 103.
## 17 Deutschland 100 NA 105 106 107. 103. 106. 108.
Sweet! Now we can rename the Land
column to state
so that it matches with the column name from our Wikipedia data set. And once that is done, let us rearrange our data so that we can easily summarise our data by state.
2]] |>
html_tables[[select(-(2:4)) |>
::row_to_names(row_number = 2) |>
janitorslice(-1) |>
mutate(
`2016` = NA_character_,
across(
-Land,
parse_number(x, locale = german_locale)
\(x)
),.before = 3
|>
) rename(state = Land) |>
pivot_longer(
cols = -state,
names_to = 'year',
values_to = 'gdp'
)## # A tibble: 136 × 3
## state year gdp
## <chr> <chr> <dbl>
## 1 Baden-Württemberg 2015 100
## 2 Baden-Württemberg 2016 NA
## 3 Baden-Württemberg 2017 105.
## 4 Baden-Württemberg 2018 107
## 5 Baden-Württemberg 2019 107.
## 6 Baden-Württemberg 2020 102.
## 7 Baden-Württemberg 2021 105.
## 8 Baden-Württemberg 2022 106.
## 9 Bayern 2015 100
## 10 Bayern 2016 NA
## # ℹ 126 more rows
That’s a fantastic format that summarise()
can handle. So let’s use it to collect all of the gdp
data in vectors.
<- html_tables[[2]] |>
gdp_series select(-(2:4)) |>
::row_to_names(row_number = 2) |>
janitorslice(-1) |>
mutate(
`2016` = NA_character_,
across(
-Land,
parse_number(x, locale = german_locale)
\(x)
),.before = 3
|>
) rename(state = Land) |>
pivot_longer(
cols = -state,
names_to = 'year',
values_to = 'gdp'
|>
) summarise(
gdp = list(gdp),
.by = state
)
gdp_series## # A tibble: 17 × 2
## state gdp
## <chr> <list>
## 1 Baden-Württemberg <dbl [8]>
## 2 Bayern <dbl [8]>
## 3 Berlin <dbl [8]>
## 4 Brandenburg <dbl [8]>
## 5 Bremen <dbl [8]>
## 6 Hamburg <dbl [8]>
## 7 Hessen <dbl [8]>
## 8 Mecklenburg-Vorpommern <dbl [8]>
## 9 Niedersachsen <dbl [8]>
## 10 Nordrhein-Westfalen <dbl [8]>
## 11 Rheinland-Pfalz <dbl [8]>
## 12 Saarland <dbl [8]>
## 13 Sachsen <dbl [8]>
## 14 Sachsen-Anhalt <dbl [8]>
## 15 Schleswig-Holstein <dbl [8]>
## 16 Thüringen <dbl [8]>
## 17 Deutschland <dbl [8]>
Combine the data sets
We’re almost done with assembling the data for our table. What we still need are the URLs to the images of the coat of arms of the states. No fancy trick there. Just going on to Wikipedia and copying all the URLs into a vector.
<- c(
img_urls 'https://upload.wikimedia.org/wikipedia/commons/0/0f/Lesser_coat_of_arms_of_Baden-W%C3%BCrttemberg.svg',
'https://upload.wikimedia.org/wikipedia/commons/d/d2/Bayern_Wappen.svg',
'https://upload.wikimedia.org/wikipedia/commons/8/8c/DEU_Berlin_COA.svg',
'https://upload.wikimedia.org/wikipedia/commons/a/a2/DEU_Brandenburg_COA.svg',
'https://upload.wikimedia.org/wikipedia/commons/6/64/Bremen_Wappen%28Mittel%29.svg',
'https://upload.wikimedia.org/wikipedia/commons/5/5d/DEU_Hamburg_COA.svg',
'https://upload.wikimedia.org/wikipedia/commons/c/cd/Coat_of_arms_of_Hesse.svg',
'https://upload.wikimedia.org/wikipedia/commons/7/74/Coat_of_arms_of_Mecklenburg-Western_Pomerania_%28great%29.svg',
'https://upload.wikimedia.org/wikipedia/commons/0/0b/Coat_of_arms_of_Lower_Saxony.svg',
'https://upload.wikimedia.org/wikipedia/commons/1/1b/Coat_of_arms_of_North_Rhine-Westphalia.svg',
'https://upload.wikimedia.org/wikipedia/commons/8/89/Coat_of_arms_of_Rhineland-Palatinate.svg',
'https://upload.wikimedia.org/wikipedia/commons/8/8e/Wappen_des_Saarlands.svg',
'https://upload.wikimedia.org/wikipedia/commons/5/5f/Coat_of_arms_of_Saxony.svg',
'https://upload.wikimedia.org/wikipedia/commons/5/53/Wappen_Sachsen-Anhalt.svg',
'https://upload.wikimedia.org/wikipedia/commons/0/02/DEU_Schleswig-Holstein_COA.svg',
'https://upload.wikimedia.org/wikipedia/commons/0/08/Coat_of_arms_of_Thuringia.svg',
'https://upload.wikimedia.org/wikipedia/commons/d/da/Coat_of_arms_of_Germany.svg'
)
|>
cleaned_state_dat mutate(
img = img_urls,
.before = 1
)## # A tibble: 17 × 8
## img state capital most_populous_city federal_assembly_votes area_km2
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 https://upl… Bade… Stuttg… Stuttgart 6 35748
## 2 https://upl… Baye… München München 6 70542
## 3 https://upl… Berl… — — 4 891
## 4 https://upl… Bran… Potsdam Potsdam 4 29654
## 5 https://upl… Brem… Bremen Bremen 3 420
## 6 https://upl… Hamb… — — 3 755
## 7 https://upl… Hess… Wiesba… Frankfurt am Main 5 21116
## 8 https://upl… Meck… Schwer… Rostock 3 23295
## 9 https://upl… Nied… Hannov… Hannover 6 47710
## 10 https://upl… Nord… Düssel… Köln 6 34112
## 11 https://upl… Rhei… Mainz Mainz 4 19858
## 12 https://upl… Saar… Saarbr… Saarbrücken 3 2571
## 13 https://upl… Sach… Dresden Leipzig 4 18450
## 14 https://upl… Sach… Magdeb… Halle (Saale) 4 20459
## 15 https://upl… Schl… Kiel Kiel 4 15804
## 16 https://upl… Thür… Erfurt Erfurt 4 16202
## 17 https://upl… Deut… Berlin Berlin 69 357588
## # ℹ 2 more variables: pop_million <dbl>, pop_per_km2 <dbl>
And now that cleaned_state_dat
is complete, we can join it with gdp_series
.
<- cleaned_state_dat |>
german_stats mutate(
img = img_urls,
.before = 1
|>
) left_join(gdp_series, by = 'state')
german_stats## # A tibble: 17 × 9
## img state capital most_populous_city federal_assembly_votes area_km2
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 https://upl… Bade… Stuttg… Stuttgart 6 35748
## 2 https://upl… Baye… München München 6 70542
## 3 https://upl… Berl… — — 4 891
## 4 https://upl… Bran… Potsdam Potsdam 4 29654
## 5 https://upl… Brem… Bremen Bremen 3 420
## 6 https://upl… Hamb… — — 3 755
## 7 https://upl… Hess… Wiesba… Frankfurt am Main 5 21116
## 8 https://upl… Meck… Schwer… Rostock 3 23295
## 9 https://upl… Nied… Hannov… Hannover 6 47710
## 10 https://upl… Nord… Düssel… Köln 6 34112
## 11 https://upl… Rhei… Mainz Mainz 4 19858
## 12 https://upl… Saar… Saarbr… Saarbrücken 3 2571
## 13 https://upl… Sach… Dresden Leipzig 4 18450
## 14 https://upl… Sach… Magdeb… Halle (Saale) 4 20459
## 15 https://upl… Schl… Kiel Kiel 4 15804
## 16 https://upl… Thür… Erfurt Erfurt 4 16202
## 17 https://upl… Deut… Berlin Berlin 69 357588
## # ℹ 3 more variables: pop_million <dbl>, pop_per_km2 <dbl>, gdp <list>
Get state descriptions from Wikipedia
Now the only thing that’s missing is the state descriptions. For that, we can scrape the first paragraph from Wikipedia. Let’s do that with one example first. Let’s use the URL to the English Wiki of Bavaria.
<- 'https://en.wikipedia.org/wiki/Baden-W%C3%BCrttemberg' url
Then, we can get all the paragraphs from this page by looking for <p>
-tags. That’s HTML notation to denote a paragraph. With the {rvest}
package, all we have to do is to read the URL just like before and then pass that to the html_elements()
function.
<- read_html(url) |>
paragraphs html_elements('p')
paragraphs## {xml_nodeset (69)}
## [1] <p class="mw-empty-elt">\n</p>
## [2] <p><b>Baden-Württemberg</b> (<span class="rt-commentedText nowrap"><span ...
## [3] <p>Modern Baden-Württemberg includes the historical territories of <a hr ...
## [4] <p>Baden-Württemberg is especially known for its strong economy with var ...
## [5] <p>The <a href="/wiki/Sobriquet" title="Sobriquet">sobriquet</a> <span t ...
## [6] <p>Baden-Württemberg is formed from the historical territories of <a hre ...
## [7] <p>In 100 AD, the <a href="/wiki/Roman_Empire" title="Roman Empire">Roma ...
## [8] <p>The Holy Roman Empire was later established. The majority of people i ...
## [9] <p>In the late 18th and early 19th century, <a href="/wiki/K%C3%BCnzelsa ...
## [10] <p>In the late 19th and early 20th centuries, numerous people emigrated ...
## [11] <p>At the beginning of the 20th century, the territory of modern-day Bad ...
## [12] <p>Following <a href="/wiki/Adolf_Hitler" title="Adolf Hitler">Adolf Hit ...
## [13] <p>After World War II, the <a href="/wiki/Allies_of_World_War_II" title= ...
## [14] <p>In 1949, each state became a founding member of the <a href="/wiki/We ...
## [15] <p>There were still opponents to the merger of Baden and Württemberg, ho ...
## [16] <p>Baden-Württemberg shares borders with the German states of <a href="/ ...
## [17] <p>Most of the major cities of Baden-Württemberg straddle the banks of t ...
## [18] <p>The <a href="/wiki/Rhine" title="Rhine">Rhine</a> (German: <i lang="d ...
## [19] <p>The <a href="/wiki/Danube" title="Danube">Danube</a> is conventionall ...
## [20] <p>The forests in this region are home to common pests such as <i><a hre ...
## ...
This will give us a bunch of HTML text. If we want to turn this into clean text, we just apply the html_text()
function. Then, let us look at the first three entries of the resulting vector.
html_text(paragraphs)[1:3]
## [1] "\n"
## [2] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6].mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
## [3] "Modern Baden-Württemberg includes the historical territories of Baden, Prussian Hohenzollern, and Württemberg. Baden-Württemberg became a state of West Germany in April 1952 through the merger of South Baden, Württemberg-Baden, and Württemberg-Hohenzollern. These states had been created by the Allies as they separated traditional states into occupation zones after World War II\n"
It appears as if the text we’re looking for is in the second entry. But is has a whole lot of gibberish in it too. Things like:
.mw-parser-output .IPA-label-small{font-size:85%}
.mw-parser-output .references
.IPA-label-small,.mw-parser-output .infobox
.IPA-label-small,.mw-parser-output .navbox
.IPA-label-small{font-size:100%}
So let’s get rid of that. First, let’s look for the text .mw-parser-output
and remove that.
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6] .IPA-label-small{font-size:85%} .references .IPA-label-small, .infobox .IPA-label-small, .navbox .IPA-label-small{font-size:100%}German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
Here, we have used \\.
to look for a literal .
character. Otherwise, .
would mean any character in regular expressions. But this doesn’t remove everything. We want to remove everything from .mw-parser-output
to what is in the curly brackets {}
(like {font-size:85%}
). So let’s check if we can remove JUST the brackets.
html_text(paragraphs)[2] |>
str_remove_all('\\{.+\\}')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6].mw-parser-output .IPA-label-smallGerman: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
Here, we have used three parts in our regular expression.
- Literal curly brackets
\\{
and\\}
- Any character
.
between the brackets but - not just a single character but arbitrarily many (
+
)
So with that, we could break down what we want to remove into three chunks:
- Start at
\\.mw-parser-output
- End at
\\{.+\\}
- Delete those and everything between (
.+
)
So let’s throw this all together:
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6]German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
Cool. Similarly, we can get rid of these footnotes brackets that Wikipedia has (e.g. [6]
). They are described by opening and closing brackets with numbers between them. Numbers are denoted by \\d
.
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
str_remove_all("\\[\\d+\\]")
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
Ahh there’s also an [update]
tag in there. Let’s throw that out too. We can just tell the regular expression to look for either numbers \\d
or for the word update
by grouping them together with ()
and putting a |
between them.
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
str_remove_all("\\[(\\d+|update)\\]")
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
And for good measure, let’s get rid of any single-character brackets like [a]
as well (in case they pop up for other states.)
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
str_remove_all("\\[(\\d+|update|.)\\]")
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
Alright, that was the hard part. Let’s finish off with the simple stuff. Let’s remove things like \\n
and ⓘ
. And let’s also replace km2
with km²
.
html_text(paragraphs)[2] |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
str_remove_all("\\[(\\d+|update|.)\\]") |>
str_remove_all("\\n") |>
str_remove_all('ⓘ') |>
str_replace_all('km2', 'km²')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km² (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm."
Nice. Looks like we have a text cleaning recipe. Let’s wrap that into a function so that we can apply that to all state URLs.
<- function(url) {
get_state_description <- read_html(url) |>
paragraphs html_elements('p')
<- 2
paragraph_nmbr ## Earlier versions of this post had
## paragraph_nmbr <- 2 if (str_detect(url, 'Hesse')) 1 else 2
## That was due to the the relevant info being in the first paragraph for Hesse.
## But that changed now so the code got easier.
|>
paragraphs[paragraph_nmbr] html_text() |>
str_remove_all("\\[(\\d+|update|.)\\]") |>
str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
# str_remove_all("\\.mw-parser-output.*?\\{.*?\\}") |>
str_remove_all("\\n") |>
str_remove_all('ⓘ') |>
str_replace_all('km2', 'km²')
}
There’s a function. Time to iterate with it. Let’s collect all of the URLs for different states in a vector and then use it in conjunction with map_chr()
to iterate over the vector.
<- c(
state_urls 'https://en.wikipedia.org/wiki/Baden-W%C3%BCrttemberg',
'https://en.wikipedia.org/wiki/Bavaria',
'https://en.wikipedia.org/wiki/Berlin',
'https://en.wikipedia.org/wiki/Brandenburg',
'https://en.wikipedia.org/wiki/Bremen',
'https://en.wikipedia.org/wiki/Hamburg',
'https://en.wikipedia.org/wiki/Hesse',
'https://en.wikipedia.org/wiki/Mecklenburg-Vorpommern',
'https://en.wikipedia.org/wiki/Lower_Saxony',
'https://en.wikipedia.org/wiki/North_Rhine-Westphalia',
'https://en.wikipedia.org/wiki/Rhineland-Palatinate',
'https://en.wikipedia.org/wiki/Saarland',
'https://en.wikipedia.org/wiki/Saxony',
'https://en.wikipedia.org/wiki/Saxony-Anhalt',
'https://en.wikipedia.org/wiki/Schleswig-Holstein',
'https://en.wikipedia.org/wiki/Thuringia'
)
<- state_urls |>
state_descriptions map_chr(get_state_description)
|>
state_descriptions glimpse()
## chr [1:16] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ), commonly shorte"| __truncated__ ...
Excellent! We have all of our data for our interactive table now (Hoooray!) Time to create the table then. But before we create the table, let me throw in a short announcement:
Data Cleaning Master Class
If you enjoyed this blog post so far, then I am sure you’re going to really like my Data Cleaning Master Class. It teaches you everything you need to know to clean up messy data sets in no time. It teaches way more techniques than the ones I have shown you already and it covers
- a variety of file formats (including pesky JSON & Excel files),
- text cleaning strategies (including the almighty regular expressions, i.e. RegEx) and
- handling time and date data (which is always tedious when you don’t know the right functions.)
So if you want to transform messy data into insights quickly, then check out the course page today.
Turning the data into a table
Alright, let’s start building the table. If you want to recreate the code from the YT video and don’t want to webscrape the data yourself, you can download the data from the video here. For that, we first create a data set that doesn’t have the last row with the totals in it.
<- german_stats |>
german_stats_wo_total slice(-17)
german_stats_wo_total## # A tibble: 16 × 9
## img state capital most_populous_city federal_assembly_votes area_km2
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 https://upl… Bade… Stuttg… Stuttgart 6 35748
## 2 https://upl… Baye… München München 6 70542
## 3 https://upl… Berl… — — 4 891
## 4 https://upl… Bran… Potsdam Potsdam 4 29654
## 5 https://upl… Brem… Bremen Bremen 3 420
## 6 https://upl… Hamb… — — 3 755
## 7 https://upl… Hess… Wiesba… Frankfurt am Main 5 21116
## 8 https://upl… Meck… Schwer… Rostock 3 23295
## 9 https://upl… Nied… Hannov… Hannover 6 47710
## 10 https://upl… Nord… Düssel… Köln 6 34112
## 11 https://upl… Rhei… Mainz Mainz 4 19858
## 12 https://upl… Saar… Saarbr… Saarbrücken 3 2571
## 13 https://upl… Sach… Dresden Leipzig 4 18450
## 14 https://upl… Sach… Magdeb… Halle (Saale) 4 20459
## 15 https://upl… Schl… Kiel Kiel 4 15804
## 16 https://upl… Thür… Erfurt Erfurt 4 16202
## # ℹ 3 more variables: pop_million <dbl>, pop_per_km2 <dbl>, gdp <list>
Basic Table
Then, we can pass that data to the reactable()
function.
library(reactable)
|>
german_stats_wo_total reactable()
This doesn’t look nice at all. We have lots of tweaking to do. First, let’s get rid of the pagination. I want to have all rows visible in the table.
|>
german_stats_wo_total reactable(
pagination = FALSE
)
Default column settings
Then, we can set the default settings for every column. By default we cant to have
- the content left-aligned (horizontally) and center-aligned (vertically),
- Column names in title case that use white space instead of
_
- Bold text as well as a thick border for the column names
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
)
) )
Then, we can add a footer to all columns by grabbing the last row of the german_stats
data set. Remember? This data set has the totals in the last row. And while we’re at it, we can also set a bit of stylings for the footers.
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
) )
These settings aren’t perfect and we have to modify them for individual columns. But it’s a nice baseline. Now we only have to go through every column one by one to apply custom changes.
Setting the image column
Let’s start with the img
column. There, we want to take the urls and wrap them into <img>
tags. Since the reactable
output is HTML, this will cause our images to appear from their online source. To create the tags we will need the {htmltools}
package.
Further, we’ll adjust the width of that column and get rid of the column name altogether. And just like in the default settings, we have to treat the footer of the table separately.
library(htmltools)
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
)
) )
Fix widths of name columns
For the columns state
, capital
and most_populous_city
there’s not much to do. In these cases, we might only want do set the column width.
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160)
) )
Turn numbers into bubbles
Next, let us change the federal_assembly_votes
column. Easy changes will once again involve the header
name, the align
ment and the width
of the column. But that’s not the important part here.
What is important is the fact that we turn the numbers into bubbles. For that all we have to do is to load the {reactablefmtr}
package and use its bubble_grid
function inside the cell
argument.
library(reactablefmtr)
<- '#104E8B' # define a main color
main_color
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160),
federal_assembly_votes = colDef(
header = 'Bundesrat Votes',
align = 'center',
width = 140,
cell = bubble_grid(
|>
german_stats_wo_total select(federal_assembly_votes),
min_value = 1,
max_value = 10,
colors = c(
::lighten(main_color, 0.9),
colorspace
main_color
)
)
)
) )
Turn numbers into bar charts
Next, we’re going to do a very similar thing with the columns area_km2
, pop_million
, pop_per_km2
. This time, though, we’re going to use the data_bars()
function from the {reactablefmtr}
package.
This will involve having to call data_bars()
more or less three times with mostly the same arguments. Thus, I wrap the logic into a function to avoid a bit of code duplication.
<- function(column) {
bar_chart <- if (column == 'area_km2') {
formatting_function ::label_number(
scalessuffix = ' km²',
big.mark = ','
)else {
} ::label_comma(
scalesaccuracy = if (column == 'pop_per_km2') 1 else 0.01
)
}
<- if (column %in% c('area_km2', 'pop_million')) {
bar_max_value 17, column]]
german_stats[[else {
} NULL
}data_bars(
data = german_stats_wo_total |> select(any_of(column)),
text_position = 'above',
bar_height = 20,
number_fmt = formatting_function,
fill_color = main_color,
max_value = bar_max_value
) }
With that function we can modify the cell
argument of the corresponding columns. And while we’re at it, we can also fix the width, alignment and footer of these columns.
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160),
federal_assembly_votes = colDef(
header = 'Bundesrat Votes',
align = 'center',
width = 140,
cell = bubble_grid(
|>
german_stats_wo_total select(federal_assembly_votes),
min_value = 1,
max_value = 10,
colors = c(
::lighten(main_color, 0.9),
colorspace
main_color
)
)
),
area_km2 = colDef(
header = 'Area',
align = 'left',
cell = bar_chart('area_km2'),
width = 110,
footer = function(value) {
$area_km2[17] |>
german_stats::label_number(
scalessuffix = ' km²',
big.mark = ','
)()
}
),pop_million = colDef(
header = 'in millions',
cell = bar_chart('pop_million'),
footer = german_stats$pop_million[[17]] |>
::comma(accuracy = 0.01),
scaleswidth = 110
),pop_per_km2 = colDef(
header = 'per km²',
cell = bar_chart('pop_per_km2'),
width = 110
)
) )
Turn numbers into line charts
Sweet! We’re getting somewhere. We have bars and circles already. Time to add sparklines.
This time, I don’t want to use {reactablefmtr}
for that. I want to use my own custom charts that I create with {ggplot2}
. But there are two problems with that:
{ggplot2}
creates static image files that I would have to export and then reimport that into our tables using<img>
tags again.{ggplot2}
charts are not interactive and I want to have hover tool tips
Thankfully, the solution for both of these problems is one and the same thing. Namely, {ggiraph}
. This package is my favorite way to create interactive charts with R. If you’ve never worked with that package, I have a blog post and YT video about that:
Anyway, let’s create a function that depends on a column’s values, row indices and name. We’re going to stick that function into the cell
argument of the gdp
column. In this function, we can start with a interactive chart without tooltips.
But we have to be careful. Since all charts are drawn separately for each row, we have to ensure that the axes limits are the same for all line charts. That’s where we’ll use scale_y_continuous()
.
library(ggiraph)
<- function(value, row_index, column_name) {
spark_line <- 2015:2022
years
# Create static ggplot with `value` vector
<- ggplot(mapping = aes(x = years, y = value)) +
gg_plt geom_ribbon(
aes(
x = years[-2],
y = NULL,
ymin = 100,
ymax = value[-2]
),alpha = 0.1,
fill = main_color
+
) geom_line(linewidth = 2, color = main_color) +
geom_point(
color = main_color,
size = 25,
na.rm = TRUE
+
) scale_y_continuous(
expand = expansion(mult = 0.1),
limits = range(
$gdp |> unlist(),
german_statsna.rm = TRUE
)+
) theme_void()
girafe(
ggobj = gg_plt,
width_svg = 16,
height = 9,
options = list(
opts_toolbar(saveaspng = FALSE)
)
)
}# Example with first row
spark_line(german_stats$gdp[[1]], NA, NA)
So with that, we can define the column settings of the GDP columns and ensure that it uses this function. And as always, we have to treat the footer separately.
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160),
federal_assembly_votes = colDef(
header = 'Bundesrat Votes',
align = 'center',
width = 140,
cell = bubble_grid(
|>
german_stats_wo_total select(federal_assembly_votes),
min_value = 1,
max_value = 10,
colors = c(
::lighten(main_color, 0.9),
colorspace
main_color
)
)
),
area_km2 = colDef(
header = 'Area',
align = 'left',
cell = bar_chart('area_km2'),
width = 110,
footer = function(value) {
$area_km2[17] |>
german_stats::label_number(
scalessuffix = ' km²',
big.mark = ','
)()
}
),pop_million = colDef(
header = 'in millions',
cell = bar_chart('pop_million'),
footer = german_stats$pop_million[[17]] |>
::comma(accuracy = 0.01),
scaleswidth = 110
),pop_per_km2 = colDef(
header = 'per km²',
cell = bar_chart('pop_per_km2'),
width = 110
),
gdp = colDef(
header = div(
'GDP ',
span(
style = htmltools::css(
font_size = '0.75rem'
),'(Index 2015 = 100)'
)
),cell = spark_line,
footer = \(x) spark_line(
$gdp[[17]], NA, NA
german_stats
),width = 160
)
) )
Add tooltips to spark lines
Now, to make the chart more interesting, we can compute the year-on-year change of the GPD (if possible) and show that in tooltips.
<- function(value, row_index, column_name) {
spark_line <- 2015:2022
years <- value / lag(value) - 1
rel_diff <- if_else(
texts is.na(rel_diff),
'',
::percent(rel_diff, accuracy = 0.01)
scales
)<- if_else(
text_color < 0,
rel_diff '#BF1363',
'#06A77D'
)
<- map(
compare_texts seq_along(texts),
\(x) {if (!is.na(value[[x]])) {
div(
style = css(
width = '250px',
),div(
style = htmltools::css(
font_weight = 600
),::glue(
glue'GPD in {years[x]}:'
)
),div(
value[x],if (texts[[x]] != '') {
span(
style = htmltools::css(
font_size = '1rem'
),'(',
span(
style = htmltools::css(
color = text_color[x],
font_weight = 600
),
texts[x]
),span(
', year-on-year)'
)
)
}
)|> as.character()
) else {
} ''
}
}
)
<- ggplot(mapping = aes(x = years, y = value)) +
gg_plt geom_ribbon(
aes(
x = years[-2],
y = NULL, ymin = 100, ymax = value[-2]),
alpha = 0.1,
fill = main_color
+
) geom_line(linewidth = 2, color = main_color) +
geom_point_interactive(
aes(
tooltip = compare_texts
),color = main_color,
size = 25,
na.rm = TRUE
+
) scale_y_continuous(
expand = expansion(mult = 0.1),
limits = range(
$gdp |> unlist(),
german_statsna.rm = TRUE
)+
) theme_void()
girafe(
ggobj = gg_plt,
width_svg = 16,
height = 9,
options = list(
opts_toolbar(saveaspng = FALSE),
opts_tooltip(
opacity = 1,
css = girafe_css(
css = htmltools::css(
color = 'black',
background = 'white',
padding = '15px',
font_size = '1.25rem',
font_family = '"Source Sans Pro", Courier',
border = glue::glue('2px solid {main_color}'),
border_radius = '5px'
)
)
)
)
)
}spark_line(german_stats$gdp[[1]], NA, NA)
And now we just have to re-execute the code from before.
Add column groups and nicer font
Phew, that was hard, wasn’t it? Let’s throw in something easy in between. You can think of it as a little break if you want.
Anyway, what we do now is to add a group label for the population columns. And we can add a nicer font.
|>
german_stats_wo_total reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 65,
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
}
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160),
federal_assembly_votes = colDef(
header = 'Bundesrat Votes',
align = 'center',
width = 140,
cell = bubble_grid(
|>
german_stats_wo_total select(federal_assembly_votes),
min_value = 1,
max_value = 10,
colors = c(
::lighten(main_color, 0.9),
colorspace
main_color
)
)
),
area_km2 = colDef(
header = 'Area',
align = 'left',
cell = bar_chart('area_km2'),
width = 110,
footer = function(value) {
$area_km2[17] |>
german_stats::label_number(
scalessuffix = ' km²',
big.mark = ','
)()
}
),pop_million = colDef(
header = 'in millions',
cell = bar_chart('pop_million'),
footer = german_stats$pop_million[[17]] |>
::comma(accuracy = 0.01),
scaleswidth = 110
),pop_per_km2 = colDef(
header = 'per km²',
cell = bar_chart('pop_per_km2'),
width = 110
),
gdp = colDef(
header = div(
'GDP ',
span(
style = htmltools::css(
font_size = '0.75rem'
),'(Index 2015 = 100)'
)
),cell = spark_line,
footer = \(x) spark_line(
$gdp[[17]], NA, NA
german_stats
),width = 160
)
),columnGroups = list(
colGroup(
name = 'Population',
columns = c('pop_million', 'pop_per_km2')
)
),style = htmltools::css(
font_family = '"Source Sans Pro", sans-serif'
) )
Create state maps
Nice! The rows of the table are complete. Now it’s time to create the folds for the table. This one will contain a map of Germany with the selected state highlighted as well as some information about the state.
Let’s first get the data for the mapss. For that we will use the {giscoR}
package. If you’re unfamiliar with that package, I have a tutorial for you:
The data that we’re going to use looks like this:
library(giscoR)
<- gisco_get_nuts(
germany_states year = "2021",
nuts_level = 1,
epsg = 3035,
country = 'Germany'
|>
) # Nicer output
as_tibble() |>
::clean_names()
janitor
germany_states## # A tibble: 16 × 10
## nuts_id levl_code urbn_type cntr_code name_latn nuts_name mount_type
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 DE1 1 0 DE Baden-Württemberg Baden-Wü… 0
## 2 DE6 1 0 DE Hamburg Hamburg 0
## 3 DE7 1 0 DE Hessen Hessen 0
## 4 DE8 1 0 DE Mecklenburg-Vorpo… Mecklenb… 0
## 5 DE9 1 0 DE Niedersachsen Niedersa… 0
## 6 DEA 1 0 DE Nordrhein-Westfal… Nordrhei… 0
## 7 DEB 1 0 DE Rheinland-Pfalz Rheinlan… 0
## 8 DEC 1 0 DE Saarland Saarland 0
## 9 DED 1 0 DE Sachsen Sachsen 0
## 10 DEE 1 0 DE Sachsen-Anhalt Sachsen-… 0
## 11 DEF 1 0 DE Schleswig-Holstein Schleswi… 0
## 12 DEG 1 0 DE Thüringen Thüringen 0
## 13 DE2 1 0 DE Bayern Bayern 0
## 14 DE3 1 0 DE Berlin Berlin 0
## 15 DE4 1 0 DE Brandenburg Brandenb… 0
## 16 DE5 1 0 DE Bremen Bremen 0
## # ℹ 3 more variables: coast_type <dbl>, geo <chr>, geometry <GEOMETRY [m]>
And then we can use that data to create maps of Germany. Since we need one map for every state, let us wrap the similar logic into a function.
<- function(selected_state) {
create_state_map <- germany_states |>
plt ggplot(
aes(
geometry = geometry,
fill = (name_latn == selected_state)
)+
) geom_sf(col = 'white', linewidth = 0.75) +
theme_void() +
theme(
legend.position = 'none'
+
) scale_fill_manual(
values = c(
'TRUE' = main_color,
'FALSE' = 'grey50'
)
)girafe(
ggobj = plt,
options = list(
opts_toolbar(saveaspng = FALSE)
)
)
}create_state_map(german_stats$state[[1]])
Assemble state descriptions
With that function, we can create yet another function that will put the map next to the state information that we have in the state_descriptions
descriptions. You know, information like this:
1:2]
state_descriptions[## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km² (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm."
## [2] "Bavaria, officially the Free State of Bavaria, is a state in the southeast of Germany. With an area of 70,550.19 km² (27,239.58 sq mi), it is the largest German state by land area, comprising roughly a fifth of the total land area of Germany, and with over 13.08 million inhabitants, it is the second most populous German state, behind only North Rhine-Westphalia; however, due to its large land area, its population density is below the German average. Major cities include Munich (its capital and largest city, which is also the third largest city in Germany),Nuremberg, and Augsburg."
Since we’re going to use our new function in the details
argument of img
column’s colDef()
function, we have to make sure that this new function depends on the row_index
and the col_name
.
<- function(row_index, col_name) {
state_details <- german_stats[[row_index, 'state']]
state_name
div(
style = htmltools::css(
display = 'grid',
grid_template_columns = '1fr 2fr',
row_gap = '10px',
padding = '10px 50px 10px 50px'
),div(
create_state_map(state_name)
),div(
style = css(
font_family = 'Source Sans Pro',
font_size = '1.25rem'
),
span(
a(
# `state_urls` contains URLs to a state's Wiki page
href = state_urls[row_index],
style = css(
color = main_color,
text_decoration = 'none',
font_weight = 600
),'From Wikipedia: '
),
state_descriptions[row_index]
)
)
)
}
state_details(1, '') |>
::browsable() htmltools
Include state description
<- german_stats_wo_total |>
tbl_wo_titles reactable(
pagination = FALSE,
defaultColDef = colDef(
vAlign = 'center',
align = 'left',
header = \(x) str_replace_all(x, '_', ' ') |>
str_to_title(),
headerStyle = htmltools::css(
font_weight = 600,
border_bottom = '2px solid black'
),footer = function(values, col_name) {
17, col_name]]
german_stats[[
},footerStyle = htmltools::css(
font_weight = 600,
border_top = '2px solid black'
)
),
columns = list(
img = colDef(
header = '',
width = 100, #<<< INCREASE WIDTH
cell = function(value) {
$img(
tagssrc = value,
width = 50
)
},footer = function(value) {
$img(
tagssrc = german_stats$img[17],
width = 50
)
},details = state_details #<<< ADD FCT HERE
),
state = colDef(width = 125),
capital = colDef(width = 115),
most_populous_city = colDef(width = 160),
federal_assembly_votes = colDef(
header = 'Bundesrat Votes',
align = 'center',
width = 140,
cell = bubble_grid(
|>
german_stats_wo_total select(federal_assembly_votes),
min_value = 1,
max_value = 10,
colors = c(
::lighten(main_color, 0.9),
colorspace
main_color
)
)
),
area_km2 = colDef(
header = 'Area',
align = 'left',
cell = bar_chart('area_km2'),
width = 110,
footer = function(value) {
$area_km2[17] |>
german_stats::label_number(
scalessuffix = ' km²',
big.mark = ','
)()
}
),pop_million = colDef(
header = 'in millions',
cell = bar_chart('pop_million'),
footer = german_stats$pop_million[[17]] |>
::comma(accuracy = 0.01),
scaleswidth = 110
),pop_per_km2 = colDef(
header = 'per km²',
cell = bar_chart('pop_per_km2'),
width = 110
),
gdp = colDef(
header = div(
'GDP ',
span(
style = htmltools::css(
font_size = '0.75rem'
),'(Index 2015 = 100)'
)
),cell = spark_line,
footer = \(x) spark_line(
$gdp[[17]], NA, NA
german_stats
),width = 160
)
),columnGroups = list(
colGroup(
name = 'Population',
columns = c('pop_million', 'pop_per_km2')
)
),style = htmltools::css(
font_family = '"Source Sans Pro", sans-serif'
)
) tbl_wo_titles
Add title & subtitle
Finally, all that’s left to do is to add a title and subtitle to the table. For that we just have to wrap our variable that contains the table into a div
container and place the title and sub-title containers on top of that.
<- div(
final_table div(
style = htmltools::css(
font_family = '"Source Sans Pro", sans-serif',
font_size = '2rem',
font_weight = 'bold'
),'Germany\'s 16 States in Numbers'
),div(
style = htmltools::css(
font_family = '"Source Sans Pro", sans-serif',
font_size = '1.25rem'
),'Based on data from Wikipedia & statistikportal.de'
),
tbl_wo_titles
) |> browsable() final_table
Hoooooray! We’ve made it. This was quite a long blog post. So congrats if you’ve made it all the way to the end. If you’ve made it this far, don’t forget to check out my other content 👇