Creating fancy interactive tables using Internet data with rvest and reactable

In this blog post, I show you how to web-scrape data from the Internet using {rvest} and then I’ll show you how to create an elaborate table with {reactable}
Author

Albert Rapp

Published

September 1, 2024

This blog post will be a long one as we’ll create this fancy interactive table:

INSERT TABLE HERE

To do so, we’ll proceed in two steps:

  1. We’ll web-scrape the data for this table from the Internet using {rvest}.
  2. We’ll put everything into a nice table using {reactable}.

As always, you can find the video version of this blog post on YouTube:

The video for the {reactable} part will be placed here as soon as it’s released.

Scraping & cleaning tables and texts from the Internet

In this first part, we’re going to learn how to get the source data for our table. After that’s done, we’ll wrap that data into a nice package

Scraping a table from Wikipedia

Extracting tables from Wikipedia is really easy. All you have to do is to pass a link to read_html() and html_table() from the {rvest} package. The only tricky thing: Make sure that you specify what decimal number indicator the website uses. On English sites this will likely be . (and that’s the default auf html_table()) but for German information, we will need ,.

library(tidyverse)
library(rvest)

url_wiki <- 'https://de.wikipedia.org/wiki/Land_(Deutschland)#Rahmendaten_der_L%C3%A4nder'

html_tables <- read_html(url_wiki) |> 
  html_table(dec = ',')
glimpse(html_tables)
## List of 6
##  $ : tibble [17 × 14] (S3: tbl_df/tbl/data.frame)
##   ..$ Wappen                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     : logi [1:17] NA NA NA NA NA NA ...
##   ..$ Land                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       : chr [1:17] "Baden-Württemberg" "Bayern" "Berlin" "Brandenburg" ...
##   ..$ Abk.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       : chr [1:17] "BW" "BY" "BE" "BB" ...
##   ..$ Haupt­stadt                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                : chr [1:17] "Stuttgart" "München" "—" "Potsdam" ...
##   ..$ bevöl­kerungs-reichste Stadt.mw-parser-output .fussnoten-marke{font-size:0.75rem;font-style:normal;font-variant:normal;font-weight:normal;unicode-bidi:isolate;white-space:nowrap}.mw-parser-output .fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt{padding-left:0.1rem}.mw-parser-output .fussnoten-marke.reference~.fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt~span.fussnoten-inhalt{padding-left:0.15rem}.mw-parser-output .fussnoten-block{margin-bottom:0.1rem}.mw-parser-output div.fussnoten-inhalt{display:inline-block;padding-left:0.8rem;text-indent:-0.8rem}.mw-parser-output div.fussnoten-inhalt p,.mw-parser-output div.fussnoten-inhalt dl,.mw-parser-output div.fussnoten-inhalt ol,.mw-parser-output div.fussnoten-inhalt ul{text-indent:0}.mw-parser-output div.fussnoten-inhalt.fussnoten-floatfix{display:block}.mw-parser-output .fussnoten-box{margin-top:0.5rem;padding-left:0.8rem}.mw-parser-output .fussnoten-box,.mw-parser-output div.fussnoten-inhalt{font-size:94%}.mw-parser-output .fussnoten-box div.fussnoten-inhalt,.mw-parser-output span.fussnoten-inhalt,.mw-parser-output .fussnoten-inhalt .reference-text{font-size:inherit}.mw-parser-output .fussnoten-inhalt .reference-text{display:inline}.mw-parser-output .fussnoten-linie{display:inline-block;position:relative;top:-1em;border-top:solid 1px #808080;width:8rem}.mw-parser-output .fussnoten-linie+p,.mw-parser-output .fussnoten-linie+dl,.mw-parser-output .fussnoten-linie+ol,.mw-parser-output .fussnoten-linie+ul,.mw-parser-output .fussnoten-linie+link+div{margin-top:-1em}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target{background-color:#eaf3ff;box-shadow:0 0 0 0.25em #eaf3ff}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target .fussnoten-marke{font-weight:bold}⁠a: chr [1:17] "Stuttgart" "München" "—" "Potsdam" ...
##   ..$ Beitrittzum Bund                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           : chr [1:17] "1949/52[15]" "1949" "1990[16]" "1990" ...
##   ..$ Regierungs-chef                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            : chr [1:17] "Winfried Kretschmann (Grüne)" "Markus Söder (CSU)" "Kai Wegner (CDU)" "Dietmar Woidke (SPD)" ...
##   ..$ Regierungs-partei(en)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      : chr [1:17] "Grüne und CDU" "CSU und Freie Wähler" "CDU und SPD" "SPD, CDU und Grüne" ...
##   ..$ Bundes­rats-stimmen                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        : chr [1:17] "6" "6" "4" "4" ...
##   ..$ Fläche(km²)[13]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            : chr [1:17] "35.748" "70.542" "891" "29.654" ...
##   ..$ Ein-wohner(Mio.)[13]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       : num [1:17] 11.28 13.369 3.755 2.573 0.685 ...
##   ..$ Ein-wohnerje km²[13]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       : chr [1:17] "316" "190" "4.210" "87" ...
##   ..$ Aus­länder(%)[14]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          : num [1:17] 16.1 13.7 19.6 5.2 19 16.8 16.9 4.8 9.9 13.8 ...
##   ..$ Sprachen                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   : chr [1:17] "Deutsch" "Deutsch" "Deutsch" "Deutsch, Niedersorbisch, Niederdeutsch" ...
##  $ : tibble [18 × 10] (S3: tbl_df/tbl/data.frame)
##   ..$ Wappen                                   : logi [1:18] NA NA NA NA NA NA ...
##   ..$ Land                                     : chr [1:18] "BW" "BY" "BE" "BB" ...
##   ..$ BIP (2018)in Mrd. €[17]                  : chr [1:18] "511,4" "625,2" "147,1" "73,7" ...
##   ..$ Pro Kopf (2018)in €[17]                  : chr [1:18] "46.279" "47.946" "40.568" "29.411" ...
##   ..$ EK/Kin €[18]                             : chr [1:18] "19.261" "18.775" "14.797" "14.634" ...
##   ..$ Schulden (2012)in Mrd. €[19]             : chr [1:18] "67,471" "42,794" "61,220" "21,336" ...
##   ..$ Pro Kopf (2012)in €[20]                  : chr [1:18] "6.255" "3.397" "17.482" "8.549" ...
##   ..$ Schulden (31. Dezember 2018)in Mrd. €[21]: chr [1:18] "44,009" "14,613" "54,403" "16,122" ...
##   ..$ Pro Kopf (31. Dezember 2018)in €[21]     : chr [1:18] "3.976" "1.117" "14.926" "6.418" ...
##   ..$ AQ[22]                                   : chr [1:18] "3,9" "3,8" "9,8" "8,2" ...
##  $ : tibble [4 × 4] (S3: tbl_df/tbl/data.frame)
##   ..$ X1: chr [1:4] "Land Baden-WürttembergSeitenverhältnis: 3:5" "Freie Hansestadt Bremen2:3" "Land Niedersachsen2:3" "Freistaat Sachsen3:5"
##   ..$ X2: chr [1:4] "Freistaat Bayern3:5" "Freie und Hansestadt Hamburg2:3" "Land Nordrhein-Westfalen3:5" "Land Sachsen-Anhalt3:5"
##   ..$ X3: chr [1:4] "Land Berlin3:5" "Land Hessen3:5" "Land Rheinland-Pfalz2:3" "Land Schleswig-Holstein3:5"
##   ..$ X4: chr [1:4] "Land Brandenburg3:5" "Land Mecklenburg-Vorpommern3:5" "Saarland3:5" "Freistaat Thüringen1:2"
##  $ : tibble [8 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ Fläche in km²            : chr [1:8] ">250.000" "50.000–100.000" "20.000–50.000" "10.000–20.000" ...
##   ..$ WeimarerRepublik         : int [1:8] 1 1 0 5 2 3 3 4
##   ..$ BundesrepublikDeutschland: int [1:8] 0 1 7 4 0 1 0 3
##  $ : tibble [8 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ Einwohner (1925 bzw. 2018): chr [1:8] ">30.000.000" "10.000.000–20.000.000" "5.000.000–10.000.000" "2.000.000–5.000.000" ...
##   ..$ WeimarerRepublik          : int [1:8] 1 0 1 3 3 4 5 2
##   ..$ BundesrepublikDeutschland : int [1:8] 0 3 2 7 2 2 0 0
##  $ : tibble [2 × 3] (S3: tbl_df/tbl/data.frame)
##   ..$ X1: chr [1:2] "Aktuelle Länder:" "Ehemalige Länder:"
##   ..$ X2: chr [1:2] "Baden-Württemberg Baden-Württemberg | Bayern Bayern | Berlin Berlin | Brandenburg Brandenburg | Bremen Bremen |"| __truncated__ "(Süd-)Baden |  Württemberg-Baden |  Württemberg-Hohenzollern"
##   ..$ X3: logi [1:2] NA NA

Notice that this gives you a list of tibbles that were on the website. The conversion to tables isn’t perfect but it’s a great start. Let’s get the first table from that and check out the column names.

colnames(html_tables[[1]]) 
##  [1] "Wappen"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
##  [2] "Land"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [3] "Abk."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
##  [4] "Haupt­stadt"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
##  [5] "bevöl­kerungs-reichste Stadt.mw-parser-output .fussnoten-marke{font-size:0.75rem;font-style:normal;font-variant:normal;font-weight:normal;unicode-bidi:isolate;white-space:nowrap}.mw-parser-output .fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt{padding-left:0.1rem}.mw-parser-output .fussnoten-marke.reference~.fussnoten-marke.reference,.mw-parser-output span.fussnoten-inhalt~span.fussnoten-inhalt{padding-left:0.15rem}.mw-parser-output .fussnoten-block{margin-bottom:0.1rem}.mw-parser-output div.fussnoten-inhalt{display:inline-block;padding-left:0.8rem;text-indent:-0.8rem}.mw-parser-output div.fussnoten-inhalt p,.mw-parser-output div.fussnoten-inhalt dl,.mw-parser-output div.fussnoten-inhalt ol,.mw-parser-output div.fussnoten-inhalt ul{text-indent:0}.mw-parser-output div.fussnoten-inhalt.fussnoten-floatfix{display:block}.mw-parser-output .fussnoten-box{margin-top:0.5rem;padding-left:0.8rem}.mw-parser-output .fussnoten-box,.mw-parser-output div.fussnoten-inhalt{font-size:94%}.mw-parser-output .fussnoten-box div.fussnoten-inhalt,.mw-parser-output span.fussnoten-inhalt,.mw-parser-output .fussnoten-inhalt .reference-text{font-size:inherit}.mw-parser-output .fussnoten-inhalt .reference-text{display:inline}.mw-parser-output .fussnoten-linie{display:inline-block;position:relative;top:-1em;border-top:solid 1px #808080;width:8rem}.mw-parser-output .fussnoten-linie+p,.mw-parser-output .fussnoten-linie+dl,.mw-parser-output .fussnoten-linie+ol,.mw-parser-output .fussnoten-linie+ul,.mw-parser-output .fussnoten-linie+link+div{margin-top:-1em}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target{background-color:#eaf3ff;box-shadow:0 0 0 0.25em #eaf3ff}.mw-parser-output .fussnoten-marke.reference:target,.mw-parser-output .fussnoten-inhalt:target .fussnoten-marke{font-weight:bold}⁠a"
##  [6] "Beitrittzum Bund"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
##  [7] "Regierungs-chef"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
##  [8] "Regierungs-partei(en)"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
##  [9] "Bundes­rats-stimmen"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [10] "Fläche(km²)[13]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [11] "Ein-wohner(Mio.)[13]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [12] "Ein-wohnerje km²[13]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
## [13] "Aus­länder(%)[14]"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
## [14] "Sprachen"

Notice that the first column labels may look right in the console. But when you copy them into a script file, you will notice that there are dumb invisible special characters. So with that in mind, let us simply select the columns that are of interest to us with numbers instead of the column names.

selected_state_dat <- html_tables[[1]] |> 
  select(
    state = 2,
    capital = 4, 
    most_populous_city = 5,
    federal_assembly_votes = 9,
    area_km2 = 10,
    pop_million = 11,
    pop_per_km2 = 12
  )
selected_state_dat
## # A tibble: 17 × 7
##    state  capital most_populous_city federal_assembly_votes area_km2 pop_million
##    <chr>  <chr>   <chr>              <chr>                  <chr>          <dbl>
##  1 Baden… Stuttg… Stuttgart          6                      35.748        11.3  
##  2 Bayern München München            6                      70.542        13.4  
##  3 Berlin —       —                  4                      891            3.76 
##  4 Brand… Potsdam Potsdam            4                      29.654         2.57 
##  5 Bremen Bremen… Bremen             3                      420            0.685
##  6 Hambu… —       —                  3                      755            1.89 
##  7 Hessen Wiesba… Frankfurt am Main  5                      21.116         6.39 
##  8 Meckl… Schwer… Rostock            3                      23.295         1.63 
##  9 Niede… Hannov… Hannover           6                      47.710         8.14 
## 10 Nordr… Düssel… Köln               6                      34.112        18.1  
## 11 Rhein… Mainz   Mainz              4                      19.858         4.16 
## 12 Saarl… Saarbr… Saarbrücken        3                      2.571          0.993
## 13 Sachs… Dresden Leipzig            4                      18.450         4.09 
## 14 Sachs… Magdeb… Halle (Saale)      4                      20.459         2.19 
## 15 Schle… Kiel    Kiel               4                      15.804         2.95 
## 16 Thüri… Erfurt  Erfurt             4                      16.202         2.13 
## 17 Bunde… Berlin  Berlin             ⁠b –                    357.588       84.4  
## # ℹ 1 more variable: pop_per_km2 <chr>

Parsing texts to numbers

Notice how the columns area_km2, federal_assembly_votes and pop_per_km2 are encoded as characters instead of as numbers. The reason for that is that in those columns there are always characters that R doesn’t know how to handle. For one, German numbers that are written as 4.001 mean 4,001 in English terminology. Also, in the last row of the federal_assembly_votes column there’s still a b character which was a footnote from Wikipedia. Let’s clean all of this up.

german_locale <- locale(
  decimal_mark = ',', 
  grouping_mark = '.'
)
selected_state_dat |> 
  mutate(
    federal_assembly_votes = parse_number(
      federal_assembly_votes,
      na = selected_state_dat$federal_assembly_votes[[17]],
      locale = german_locale
    ),
    federal_assembly_votes = if_else(
      is.na(federal_assembly_votes),
      sum(federal_assembly_votes, na.rm = TRUE),
      federal_assembly_votes
    ),
    across(
      c(area_km2, pop_per_km2),
      \(x) parse_number(
        x,
        locale = german_locale
      )
    )
  )
## # A tibble: 17 × 7
##    state  capital most_populous_city federal_assembly_votes area_km2 pop_million
##    <chr>  <chr>   <chr>                               <dbl>    <dbl>       <dbl>
##  1 Baden… Stuttg… Stuttgart                               6    35748      11.3  
##  2 Bayern München München                                 6    70542      13.4  
##  3 Berlin —       —                                       4      891       3.76 
##  4 Brand… Potsdam Potsdam                                 4    29654       2.57 
##  5 Bremen Bremen… Bremen                                  3      420       0.685
##  6 Hambu… —       —                                       3      755       1.89 
##  7 Hessen Wiesba… Frankfurt am Main                       5    21116       6.39 
##  8 Meckl… Schwer… Rostock                                 3    23295       1.63 
##  9 Niede… Hannov… Hannover                                6    47710       8.14 
## 10 Nordr… Düssel… Köln                                    6    34112      18.1  
## 11 Rhein… Mainz   Mainz                                   4    19858       4.16 
## 12 Saarl… Saarbr… Saarbrücken                             3     2571       0.993
## 13 Sachs… Dresden Leipzig                                 4    18450       4.09 
## 14 Sachs… Magdeb… Halle (Saale)                           4    20459       2.19 
## 15 Schle… Kiel    Kiel                                    4    15804       2.95 
## 16 Thüri… Erfurt  Erfurt                                  4    16202       2.13 
## 17 Bunde… Berlin  Berlin                                 69   357588      84.4  
## # ℹ 1 more variable: pop_per_km2 <dbl>

Finally, we can also clean up the text in the capital column where we transform "Bremen (de facto)" to "Bremen". And we can transform "Bundes­republik Deutschland" to "Deutschland" in the state column. This will let us join nicely with our second data set that we’re going to scrape from the interwebs.

cleaned_state_dat <- selected_state_dat |> 
  mutate(
    federal_assembly_votes = parse_number(
      federal_assembly_votes,
      na = selected_state_dat$federal_assembly_votes[[17]],
      locale = german_locale
    ),
    federal_assembly_votes = if_else(
      is.na(federal_assembly_votes),
      sum(federal_assembly_votes, na.rm = TRUE),
      federal_assembly_votes
    ),
    across(
      c(area_km2, pop_per_km2),
      \(x) parse_number(
        x,
        locale = german_locale
      )
    ),
    state = if_else(
      state == "Bundes­republik Deutschland",
      "Deutschland",
      state
    ),
    capital = if_else(
      str_detect(capital, 'Bremen'),
      'Bremen',
      capital
    )
  )

cleaned_state_dat |> 
  glimpse()
## Rows: 17
## Columns: 7
## $ state                  <chr> "Baden-Württemberg", "Bayern", "Berlin", "Brand…
## $ capital                <chr> "Stuttgart", "München", "—", "Potsdam", "Bremen…
## $ most_populous_city     <chr> "Stuttgart", "München", "—", "Potsdam", "Bremen…
## $ federal_assembly_votes <dbl> 6, 6, 4, 4, 3, 3, 5, 3, 6, 6, 4, 3, 4, 4, 4, 4,…
## $ area_km2               <dbl> 35748, 70542, 891, 29654, 420, 755, 21116, 2329…
## $ pop_million            <dbl> 11.280, 13.369, 3.755, 2.573, 0.685, 1.892, 6.3…
## $ pop_per_km2            <dbl> 316, 190, 4210, 87, 1633, 2506, 303, 70, 171, 5…

Scraping data from Statistikportal

Alright, now let’s get GDP data from the website Statistikportal.de

url_gdp <- 'https://www.statistikportal.de/de/ugrdl/ergebnisse/wirtschaft-und-bevoelkerung/bipbws'

read_html(url_gdp) |> 
  html_table(dec = ',') |> 
  glimpse()
## List of 3
##  $ : tibble [19 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Bruttoinlandsprodukt in jeweiligen Preisen 2022 nach Bundesländern: chr [1:19] "Land" "Land" "Baden-\n\t\t\tWürttemberg" "Bayern" ...
##   ..$ Bruttoinlandsprodukt in jeweiligen Preisen 2022 nach Bundesländern: chr [1:19] "2022" "Mill. Euro" "538 948" "666 388" ...
##  $ : tibble [19 × 11] (S3: tbl_df/tbl/data.frame)
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "Land" "Land" "Baden-Württemberg" "Bayern" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "1991" "Index (2015 = 100)" "72,3" "65,1" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2000" "Index (2015 = 100)" "80,7" "77,4" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2010" "Index (2015 = 100)" "89,4" "88,4" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2015" "Index (2015 = 100)" "100" "100" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2017" "Index (2015 = 100)" "104,7" "106,3" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2018" "Index (2015 = 100)" "107,0" "106,7" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2019" "Index (2015 = 100)" "106,6" "108,6" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2020" "Index (2015 = 100)" "101,5" "104,6" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2021" "Index (2015 = 100)" "104,8" "107,5" ...
##   ..$ Bruttoinlandsprodukt (preisbereinigt, verkettet) 1991 – 2022 nach Bundesländern: chr [1:19] "2022" "Index (2015 = 100)" "106,2" "109,8" ...
##  $ : tibble [21 × 7] (S3: tbl_df/tbl/data.frame)
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Land" "Land" "Land" "Land" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Wirtschaft\n\t\t\tinsgesamt\n\t\t\t(A-T)" "Mill. EUR" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Land- und\n\t\t\tForstwirtschaft,\n\t\t\tFischerei\n\t\t\t(A)" "Land- und\n\t\t\tForstwirtschaft,\n\t\t\tFischerei\n\t\t\t(A)" "Mill. EUR" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Produzierendes\n\t\t\tGewerbe\n\t\t\t(B-F)" "Produzierendes\n\t\t\tGewerbe\n\t\t\t(B-F)" "Mill. EUR" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "darunter" "Verarbeitendes\n\t\t\tGewerbe\n\t\t\t(C)" "Mill. EUR" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "darunter" "Baugewerbe\n\t\t\t(F)" "Mill. EUR" ...
##   ..$ Bruttowertschöpfung in jeweiligen Preisen 2022 nach Wirtschaftszweigen und Bundesländern: chr [1:21] "Davon" "Dienstleistungs-\n\t\t\tbereiche\n\t\t\t(G-T)" "Dienstleistungs-\n\t\t\tbereiche\n\t\t\t(G-T)" "Mill. EUR" ...

In this case, we see that column names are always the same because that table has a merged cell on top. So let’s avoid that by telling html_table() to avoid headers. And then let’s grab the second table from the list of extracted tables.

html_tables <- read_html(url_gdp) |> 
  html_table(dec = ',', header = FALSE)
html_tables[[2]]
## # A tibble: 20 × 11
##    X1                X2    X3    X4    X5    X6    X7    X8    X9    X10   X11  
##    <chr>             <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 Bruttoinlandspro… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut… Brut…
##  2 Land              1991  2000  2010  2015  2017  2018  2019  2020  2021  2022 
##  3 Land              Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde… Inde…
##  4 Baden-Württemberg 72,3  80,7  89,4  100   104,7 107,0 106,6 101,5 104,8 106,2
##  5 Bayern            65,1  77,4  88,4  100   106,3 106,7 108,6 104,6 107,5 109,8
##  6 Berlin            77,0  83,7  90,3  100   109,6 113,5 116,8 114,2 117,8 123,6
##  7 Brandenburg       50,2  84,6  93,1  100   104,7 105,2 107,0 104,6 107,2 110,7
##  8 Bremen            86,5  89,1  94,5  100   103,3 103,0 101,5 96,4  102,3 107,5
##  9 Hamburg           76,5  85,8  94,6  100   104,0 103,8 107,1 102,0 105,8 110,5
## 10 Hessen            80,5  91,8  94,8  100   105,0 105,5 107,1 102,0 104,5 106,2
## 11 Mecklenburg-Vorp… 59,2  89,7  94,8  100   105,8 103,8 108,3 104,7 107,2 107,4
## 12 Niedersachsen     78,1  85,1  93,6  100   106,9 108,3 110,6 106,2 107,0 108,2
## 13 Nordrhein-Westfa… 81,6  88,6  94,2  100   103,7 105,1 105,1 101,8 103,6 104,7
## 14 Rheinland-Pfalz   79,5  85,1  91,7  100   102,5 102,6 103,1 99,5  108,2 107,9
## 15 Saarland          84,2  90,8  96,2  100   101,4 100,8 98,8  94,0  95,2  96,8 
## 16 Sachsen           51,2  80,5  90,8  100   104,1 104,9 106,4 102,7 104,6 107,4
## 17 Sachsen-Anhalt    59,5  91,2  97,2  100   102,6 102,1 103,7 101,3 103,6 106,3
## 18 Schleswig-Holste… 81,3  88,7  93,1  100   105,2 105,7 108,2 106,4 107,7 109,1
## 19 Thüringen         47,9  83,1  90,5  100   103,3 103,0 102,9 99,8  101,8 103,3
## 20 Deutschland       73,3  84,5  92,0  100   105,0 106,0 107,1 103,2 105,9 107,8

Alright, let’s get rid of the 1991, 2000 and 2010 column. I only want to start at 2015.

html_tables[[2]]  |> 
  select(-(2:4))
## # A tibble: 20 × 8
##    X1                                  X5    X6    X7    X8    X9    X10   X11  
##    <chr>                               <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 Bruttoinlandsprodukt (preisbereini… Brut… Brut… Brut… Brut… Brut… Brut… Brut…
##  2 Land                                2015  2017  2018  2019  2020  2021  2022 
##  3 Land                                Inde… Inde… Inde… Inde… Inde… Inde… Inde…
##  4 Baden-Württemberg                   100   104,7 107,0 106,6 101,5 104,8 106,2
##  5 Bayern                              100   106,3 106,7 108,6 104,6 107,5 109,8
##  6 Berlin                              100   109,6 113,5 116,8 114,2 117,8 123,6
##  7 Brandenburg                         100   104,7 105,2 107,0 104,6 107,2 110,7
##  8 Bremen                              100   103,3 103,0 101,5 96,4  102,3 107,5
##  9 Hamburg                             100   104,0 103,8 107,1 102,0 105,8 110,5
## 10 Hessen                              100   105,0 105,5 107,1 102,0 104,5 106,2
## 11 Mecklenburg-Vorpommern              100   105,8 103,8 108,3 104,7 107,2 107,4
## 12 Niedersachsen                       100   106,9 108,3 110,6 106,2 107,0 108,2
## 13 Nordrhein-Westfalen                 100   103,7 105,1 105,1 101,8 103,6 104,7
## 14 Rheinland-Pfalz                     100   102,5 102,6 103,1 99,5  108,2 107,9
## 15 Saarland                            100   101,4 100,8 98,8  94,0  95,2  96,8 
## 16 Sachsen                             100   104,1 104,9 106,4 102,7 104,6 107,4
## 17 Sachsen-Anhalt                      100   102,6 102,1 103,7 101,3 103,6 106,3
## 18 Schleswig-Holstein                  100   105,2 105,7 108,2 106,4 107,7 109,1
## 19 Thüringen                           100   103,3 103,0 102,9 99,8  101,8 103,3
## 20 Deutschland                         100   105,0 106,0 107,1 103,2 105,9 107,8

Now, notice that the column names that we actually want are in the second row. The {janitor} package has us covered there.

html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2)
## # A tibble: 18 × 8
##    Land                   `2015`       `2017` `2018` `2019` `2020` `2021` `2022`
##    <chr>                  <chr>        <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
##  1 Land                   Index (2015… Index… Index… Index… Index… Index… Index…
##  2 Baden-Württemberg      100          104,7  107,0  106,6  101,5  104,8  106,2 
##  3 Bayern                 100          106,3  106,7  108,6  104,6  107,5  109,8 
##  4 Berlin                 100          109,6  113,5  116,8  114,2  117,8  123,6 
##  5 Brandenburg            100          104,7  105,2  107,0  104,6  107,2  110,7 
##  6 Bremen                 100          103,3  103,0  101,5  96,4   102,3  107,5 
##  7 Hamburg                100          104,0  103,8  107,1  102,0  105,8  110,5 
##  8 Hessen                 100          105,0  105,5  107,1  102,0  104,5  106,2 
##  9 Mecklenburg-Vorpommern 100          105,8  103,8  108,3  104,7  107,2  107,4 
## 10 Niedersachsen          100          106,9  108,3  110,6  106,2  107,0  108,2 
## 11 Nordrhein-Westfalen    100          103,7  105,1  105,1  101,8  103,6  104,7 
## 12 Rheinland-Pfalz        100          102,5  102,6  103,1  99,5   108,2  107,9 
## 13 Saarland               100          101,4  100,8  98,8   94,0   95,2   96,8  
## 14 Sachsen                100          104,1  104,9  106,4  102,7  104,6  107,4 
## 15 Sachsen-Anhalt         100          102,6  102,1  103,7  101,3  103,6  106,3 
## 16 Schleswig-Holstein     100          105,2  105,7  108,2  106,4  107,7  109,1 
## 17 Thüringen              100          103,3  103,0  102,9  99,8   101,8  103,3 
## 18 Deutschland            100          105,0  106,0  107,1  103,2  105,9  107,8

This leaves us an extra first row. So let’s remove that.

html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2) |> 
  slice(-1)
## # A tibble: 17 × 8
##    Land                   `2015` `2017` `2018` `2019` `2020` `2021` `2022`
##    <chr>                  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
##  1 Baden-Württemberg      100    104,7  107,0  106,6  101,5  104,8  106,2 
##  2 Bayern                 100    106,3  106,7  108,6  104,6  107,5  109,8 
##  3 Berlin                 100    109,6  113,5  116,8  114,2  117,8  123,6 
##  4 Brandenburg            100    104,7  105,2  107,0  104,6  107,2  110,7 
##  5 Bremen                 100    103,3  103,0  101,5  96,4   102,3  107,5 
##  6 Hamburg                100    104,0  103,8  107,1  102,0  105,8  110,5 
##  7 Hessen                 100    105,0  105,5  107,1  102,0  104,5  106,2 
##  8 Mecklenburg-Vorpommern 100    105,8  103,8  108,3  104,7  107,2  107,4 
##  9 Niedersachsen          100    106,9  108,3  110,6  106,2  107,0  108,2 
## 10 Nordrhein-Westfalen    100    103,7  105,1  105,1  101,8  103,6  104,7 
## 11 Rheinland-Pfalz        100    102,5  102,6  103,1  99,5   108,2  107,9 
## 12 Saarland               100    101,4  100,8  98,8   94,0   95,2   96,8  
## 13 Sachsen                100    104,1  104,9  106,4  102,7  104,6  107,4 
## 14 Sachsen-Anhalt         100    102,6  102,1  103,7  101,3  103,6  106,3 
## 15 Schleswig-Holstein     100    105,2  105,7  108,2  106,4  107,7  109,1 
## 16 Thüringen              100    103,3  103,0  102,9  99,8   101,8  103,3 
## 17 Deutschland            100    105,0  106,0  107,1  103,2  105,9  107,8

Did you notice that the 2016 data is missing? Maybe you did. Maybe not. But let’s make this really obvious.

html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2) |> 
  slice(-1) |> 
  mutate(
    `2016` = NA_character_,
    .before = 3
  ) 
## # A tibble: 17 × 9
##    Land                  `2015` `2016` `2017` `2018` `2019` `2020` `2021` `2022`
##    <chr>                 <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr> 
##  1 Baden-Württemberg     100    <NA>   104,7  107,0  106,6  101,5  104,8  106,2 
##  2 Bayern                100    <NA>   106,3  106,7  108,6  104,6  107,5  109,8 
##  3 Berlin                100    <NA>   109,6  113,5  116,8  114,2  117,8  123,6 
##  4 Brandenburg           100    <NA>   104,7  105,2  107,0  104,6  107,2  110,7 
##  5 Bremen                100    <NA>   103,3  103,0  101,5  96,4   102,3  107,5 
##  6 Hamburg               100    <NA>   104,0  103,8  107,1  102,0  105,8  110,5 
##  7 Hessen                100    <NA>   105,0  105,5  107,1  102,0  104,5  106,2 
##  8 Mecklenburg-Vorpomme… 100    <NA>   105,8  103,8  108,3  104,7  107,2  107,4 
##  9 Niedersachsen         100    <NA>   106,9  108,3  110,6  106,2  107,0  108,2 
## 10 Nordrhein-Westfalen   100    <NA>   103,7  105,1  105,1  101,8  103,6  104,7 
## 11 Rheinland-Pfalz       100    <NA>   102,5  102,6  103,1  99,5   108,2  107,9 
## 12 Saarland              100    <NA>   101,4  100,8  98,8   94,0   95,2   96,8  
## 13 Sachsen               100    <NA>   104,1  104,9  106,4  102,7  104,6  107,4 
## 14 Sachsen-Anhalt        100    <NA>   102,6  102,1  103,7  101,3  103,6  106,3 
## 15 Schleswig-Holstein    100    <NA>   105,2  105,7  108,2  106,4  107,7  109,1 
## 16 Thüringen             100    <NA>   103,3  103,0  102,9  99,8   101,8  103,3 
## 17 Deutschland           100    <NA>   105,0  106,0  107,1  103,2  105,9  107,8

Cool, everything is clear to us now. But not necessarily to R. Notice how all columns are encoded as characters. R doesn’t know that most columns should be numbers. That’s not great for calculations. So let’s fix that.

html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2) |> 
  slice(-1) |> 
  mutate(
    `2016` = NA_character_,
    across(
      -Land, 
      \(x) parse_number(x, locale = german_locale)
    ),
    .before = 3
  )
## # A tibble: 17 × 9
##    Land                  `2015` `2016` `2017` `2018` `2019` `2020` `2021` `2022`
##    <chr>                  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 Baden-Württemberg        100     NA   105.   107   107.   102.   105.   106. 
##  2 Bayern                   100     NA   106.   107.  109.   105.   108.   110. 
##  3 Berlin                   100     NA   110.   114.  117.   114.   118.   124. 
##  4 Brandenburg              100     NA   105.   105.  107    105.   107.   111. 
##  5 Bremen                   100     NA   103.   103   102.    96.4  102.   108. 
##  6 Hamburg                  100     NA   104    104.  107.   102    106.   110. 
##  7 Hessen                   100     NA   105    106.  107.   102    104.   106. 
##  8 Mecklenburg-Vorpomme…    100     NA   106.   104.  108.   105.   107.   107. 
##  9 Niedersachsen            100     NA   107.   108.  111.   106.   107    108. 
## 10 Nordrhein-Westfalen      100     NA   104.   105.  105.   102.   104.   105. 
## 11 Rheinland-Pfalz          100     NA   102.   103.  103.    99.5  108.   108. 
## 12 Saarland                 100     NA   101.   101.   98.8   94     95.2   96.8
## 13 Sachsen                  100     NA   104.   105.  106.   103.   105.   107. 
## 14 Sachsen-Anhalt           100     NA   103.   102.  104.   101.   104.   106. 
## 15 Schleswig-Holstein       100     NA   105.   106.  108.   106.   108.   109. 
## 16 Thüringen                100     NA   103.   103   103.    99.8  102.   103. 
## 17 Deutschland              100     NA   105    106   107.   103.   106.   108.

Sweet! Now we can rename the Land column to state so that it matches with the column name from our Wikipedia data set. And once that is done, let us rearrange our data so that we can easily summarise our data by state.

html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2) |> 
  slice(-1) |> 
  mutate(
    `2016` = NA_character_,
    across(
      -Land, 
      \(x) parse_number(x, locale = german_locale)
    ),
    .before = 3
  ) |> 
  rename(state = Land) |> 
  pivot_longer(
    cols = -state,
    names_to = 'year',
    values_to = 'gdp'
  )
## # A tibble: 136 × 3
##    state             year    gdp
##    <chr>             <chr> <dbl>
##  1 Baden-Württemberg 2015   100 
##  2 Baden-Württemberg 2016    NA 
##  3 Baden-Württemberg 2017   105.
##  4 Baden-Württemberg 2018   107 
##  5 Baden-Württemberg 2019   107.
##  6 Baden-Württemberg 2020   102.
##  7 Baden-Württemberg 2021   105.
##  8 Baden-Württemberg 2022   106.
##  9 Bayern            2015   100 
## 10 Bayern            2016    NA 
## # ℹ 126 more rows

That’s a fantastic format that summarise() can handle. So let’s use it to collect all of the gdp data in vectors.

gdp_series <- html_tables[[2]]  |> 
  select(-(2:4)) |> 
  janitor::row_to_names(row_number = 2) |> 
  slice(-1) |> 
  mutate(
    `2016` = NA_character_,
    across(
      -Land, 
      \(x) parse_number(x, locale = german_locale)
    ),
    .before = 3
  ) |> 
  rename(state = Land) |> 
  pivot_longer(
    cols = -state,
    names_to = 'year',
    values_to = 'gdp'
  ) |> 
  summarise(
    gdp = list(gdp),
    .by = state
  )
gdp_series
## # A tibble: 17 × 2
##    state                  gdp      
##    <chr>                  <list>   
##  1 Baden-Württemberg      <dbl [8]>
##  2 Bayern                 <dbl [8]>
##  3 Berlin                 <dbl [8]>
##  4 Brandenburg            <dbl [8]>
##  5 Bremen                 <dbl [8]>
##  6 Hamburg                <dbl [8]>
##  7 Hessen                 <dbl [8]>
##  8 Mecklenburg-Vorpommern <dbl [8]>
##  9 Niedersachsen          <dbl [8]>
## 10 Nordrhein-Westfalen    <dbl [8]>
## 11 Rheinland-Pfalz        <dbl [8]>
## 12 Saarland               <dbl [8]>
## 13 Sachsen                <dbl [8]>
## 14 Sachsen-Anhalt         <dbl [8]>
## 15 Schleswig-Holstein     <dbl [8]>
## 16 Thüringen              <dbl [8]>
## 17 Deutschland            <dbl [8]>

Combine the data sets

We’re almost done with assembling the data for our table. What we still need are the URLs to the images of the coat of arms of the states. No fancy trick there. Just going on to Wikipedia and copying all the URLs into a vector.

img_urls <- c(
  'https://upload.wikimedia.org/wikipedia/commons/0/0f/Lesser_coat_of_arms_of_Baden-W%C3%BCrttemberg.svg',
  'https://upload.wikimedia.org/wikipedia/commons/d/d2/Bayern_Wappen.svg',
  'https://upload.wikimedia.org/wikipedia/commons/8/8c/DEU_Berlin_COA.svg',
  'https://upload.wikimedia.org/wikipedia/commons/a/a2/DEU_Brandenburg_COA.svg',
  'https://upload.wikimedia.org/wikipedia/commons/6/64/Bremen_Wappen%28Mittel%29.svg',
  'https://upload.wikimedia.org/wikipedia/commons/5/5d/DEU_Hamburg_COA.svg',
  'https://upload.wikimedia.org/wikipedia/commons/c/cd/Coat_of_arms_of_Hesse.svg',
  'https://upload.wikimedia.org/wikipedia/commons/7/74/Coat_of_arms_of_Mecklenburg-Western_Pomerania_%28great%29.svg',
  'https://upload.wikimedia.org/wikipedia/de/0/01/Wappen_von_Niedersachsen.svg',
  'https://upload.wikimedia.org/wikipedia/commons/1/1b/Coat_of_arms_of_North_Rhine-Westphalia.svg',
  'https://upload.wikimedia.org/wikipedia/commons/8/89/Coat_of_arms_of_Rhineland-Palatinate.svg',
  'https://upload.wikimedia.org/wikipedia/commons/8/8e/Wappen_des_Saarlands.svg',
  'https://upload.wikimedia.org/wikipedia/commons/5/5f/Coat_of_arms_of_Saxony.svg',
  'https://upload.wikimedia.org/wikipedia/commons/5/53/Wappen_Sachsen-Anhalt.svg',
  'https://upload.wikimedia.org/wikipedia/commons/0/02/DEU_Schleswig-Holstein_COA.svg',
  'https://upload.wikimedia.org/wikipedia/commons/0/08/Coat_of_arms_of_Thuringia.svg',
  'https://upload.wikimedia.org/wikipedia/commons/d/da/Coat_of_arms_of_Germany.svg'
)

cleaned_state_dat |> 
  mutate(
    img = img_urls,
    .before = 1
  )
## # A tibble: 17 × 8
##    img          state capital most_populous_city federal_assembly_votes area_km2
##    <chr>        <chr> <chr>   <chr>                               <dbl>    <dbl>
##  1 https://upl… Bade… Stuttg… Stuttgart                               6    35748
##  2 https://upl… Baye… München München                                 6    70542
##  3 https://upl… Berl… —       —                                       4      891
##  4 https://upl… Bran… Potsdam Potsdam                                 4    29654
##  5 https://upl… Brem… Bremen  Bremen                                  3      420
##  6 https://upl… Hamb… —       —                                       3      755
##  7 https://upl… Hess… Wiesba… Frankfurt am Main                       5    21116
##  8 https://upl… Meck… Schwer… Rostock                                 3    23295
##  9 https://upl… Nied… Hannov… Hannover                                6    47710
## 10 https://upl… Nord… Düssel… Köln                                    6    34112
## 11 https://upl… Rhei… Mainz   Mainz                                   4    19858
## 12 https://upl… Saar… Saarbr… Saarbrücken                             3     2571
## 13 https://upl… Sach… Dresden Leipzig                                 4    18450
## 14 https://upl… Sach… Magdeb… Halle (Saale)                           4    20459
## 15 https://upl… Schl… Kiel    Kiel                                    4    15804
## 16 https://upl… Thür… Erfurt  Erfurt                                  4    16202
## 17 https://upl… Deut… Berlin  Berlin                                 69   357588
## # ℹ 2 more variables: pop_million <dbl>, pop_per_km2 <dbl>

And now that cleaned_state_dat is complete, we can join it with gdp_series.

german_stats <- cleaned_state_dat |> 
  mutate(
    img = img_urls,
    .before = 1
  ) |> 
  left_join(gdp_series, by = 'state')
german_stats
## # A tibble: 17 × 9
##    img          state capital most_populous_city federal_assembly_votes area_km2
##    <chr>        <chr> <chr>   <chr>                               <dbl>    <dbl>
##  1 https://upl… Bade… Stuttg… Stuttgart                               6    35748
##  2 https://upl… Baye… München München                                 6    70542
##  3 https://upl… Berl… —       —                                       4      891
##  4 https://upl… Bran… Potsdam Potsdam                                 4    29654
##  5 https://upl… Brem… Bremen  Bremen                                  3      420
##  6 https://upl… Hamb… —       —                                       3      755
##  7 https://upl… Hess… Wiesba… Frankfurt am Main                       5    21116
##  8 https://upl… Meck… Schwer… Rostock                                 3    23295
##  9 https://upl… Nied… Hannov… Hannover                                6    47710
## 10 https://upl… Nord… Düssel… Köln                                    6    34112
## 11 https://upl… Rhei… Mainz   Mainz                                   4    19858
## 12 https://upl… Saar… Saarbr… Saarbrücken                             3     2571
## 13 https://upl… Sach… Dresden Leipzig                                 4    18450
## 14 https://upl… Sach… Magdeb… Halle (Saale)                           4    20459
## 15 https://upl… Schl… Kiel    Kiel                                    4    15804
## 16 https://upl… Thür… Erfurt  Erfurt                                  4    16202
## 17 https://upl… Deut… Berlin  Berlin                                 69   357588
## # ℹ 3 more variables: pop_million <dbl>, pop_per_km2 <dbl>, gdp <list>

Get state descriptions from Wikipedia

Now the only thing that’s missing is the state descriptions. For that, we can scrape the first paragraph from Wikipedia. Let’s do that with one example first. Let’s use the URL to the English Wiki of Bavaria.

url <- 'https://en.wikipedia.org/wiki/Baden-W%C3%BCrttemberg'

Then, we can get all the paragraphs from this page by looking for <p>-tags. That’s HTML notation to denote a paragraph. With the {rvest} package, all we have to do is to read the URL just like before and then pass that to the html_elements() function.

paragraphs <- read_html(url) |> 
  html_elements('p')
paragraphs
## {xml_nodeset (69)}
##  [1] <p class="mw-empty-elt">\n</p>
##  [2] <p><b>Baden-Württemberg</b> (<span class="rt-commentedText nowrap"><span ...
##  [3] <p>What is now Baden-Württemberg was formerly the historical territories ...
##  [4] <p>Baden-Württemberg is especially known for its strong economy with var ...
##  [5] <p>The <a href="/wiki/Sobriquet" title="Sobriquet">sobriquet</a> <span t ...
##  [6] <p>Baden-Württemberg is formed from the historical territories of <a hre ...
##  [7] <p>In 100 AD, the <a href="/wiki/Roman_Empire" title="Roman Empire">Roma ...
##  [8] <p>The Holy Roman Empire was later established. The majority of people i ...
##  [9] <p>In the late 18th and early 19th century, <a href="/wiki/K%C3%BCnzelsa ...
## [10] <p>In the late 19th and early 20th centuries, numerous people emigrated  ...
## [11] <p>At the beginning of the 20th century, the territory of modern-day Bad ...
## [12] <p>Following <a href="/wiki/Adolf_Hitler" title="Adolf Hitler">Adolf Hit ...
## [13] <p>After World War II, the <a href="/wiki/Allies_of_World_War_II" title= ...
## [14] <p>In 1949, each state became a founding member of the <a href="/wiki/We ...
## [15] <p>There were still opponents to the merger of Baden and Württemberg, ho ...
## [16] <p>Baden-Württemberg shares borders with the German states of <a href="/ ...
## [17] <p>Most of the major cities of Baden-Württemberg straddle the banks of t ...
## [18] <p>The <a href="/wiki/Rhine" title="Rhine">Rhine</a> (German: <i lang="d ...
## [19] <p>The <a href="/wiki/Danube" title="Danube">Danube</a> is conventionall ...
## [20] <p>The forests in this region are home to common pests such as <i><a hre ...
## ...

This will give us a bunch of HTML text. If we want to turn this into clean text, we just apply the html_text() function. Then, let us look at the first three entries of the resulting vector.

html_text(paragraphs)[1:3] 
## [1] "\n"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
## [2] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6].mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"
## [3] "What is now Baden-Württemberg was formerly the historical territories of Baden, Prussian Hohenzollern, and Württemberg. Baden-Württemberg became a state of West Germany in April 1952 through the merger of South Baden, Württemberg-Baden, and Württemberg-Hohenzollern. These three states had been artificially created by the Allies after World War II out of the existing traditional states by their separation over different occupation zones.\n"

It appears as if the text we’re looking for is in the second entry. But is has a whole lot of gibberish in it too. Things like:

.mw-parser-output .IPA-label-small{font-size:85%}
.mw-parser-output .references
.IPA-label-small,.mw-parser-output .infobox
.IPA-label-small,.mw-parser-output .navbox
.IPA-label-small{font-size:100%}

So let’s get rid of that. First, let’s look for the text .mw-parser-output and remove that.

html_text(paragraphs)[2] |> 
  str_remove_all('\\.mw-parser-output')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6] .IPA-label-small{font-size:85%} .references .IPA-label-small, .infobox .IPA-label-small, .navbox .IPA-label-small{font-size:100%}German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

Here, we have used \\. to look for a literal . character. Otherwise, . would mean any character in regular expressions. But this doesn’t remove everything. We want to remove everything from .mw-parser-output to what is in the curly brackets {} (like {font-size:85%}). So let’s check if we can remove JUST the brackets.

html_text(paragraphs)[2] |> 
  str_remove_all('\\{.+\\}')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6].mw-parser-output .IPA-label-smallGerman: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

Here, we have used three parts in our regular expression.

  1. Literal curly brackets \\{ and \\}
  2. Any character . between the brackets but
  3. not just a single character but arbitrarily many (+)

So with that, we could break down what we want to remove into three chunks:

  1. Start at \\.mw-parser-output
  2. End at \\{.+\\}
  3. Delete those and everything between (.+)

So let’s throw this all together:

html_text(paragraphs)[2] |> 
  str_remove_all('\\.mw-parser-output.+\\{.+\\}')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;[6]German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria).[7] As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

Cool. Similarly, we can get rid of these footnotes brackets that Wikipedia has (e.g. [6]). They are described by opening and closing brackets with numbers between them. Numbers are denoted by \\d.

html_text(paragraphs)[2] |> 
  str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
  str_remove_all("\\[\\d+\\]") 
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019[update] across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

Ahh there’s also an [update] tag in there. Let’s throw that out too. We can just tell the regular expression to look for either numbers \\d or for the word update by grouping them together with () and putting a | between them.

html_text(paragraphs)[2] |> 
  str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
  str_remove_all("\\[(\\d+|update)\\]") 
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

And for good measure, let’s get rid of any single-character brackets like [a] as well (in case they pop up for other states.)

html_text(paragraphs)[2] |> 
  str_remove_all('\\.mw-parser-output.+\\{.+\\}') |>
  str_remove_all("\\[(\\d+|update|.)\\]") 
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ⓘ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km2 (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm.\n"

Alright, that was the hard part. Let’s finish off with the simple stuff. Let’s remove things like \\n and . And let’s also replace km2 with km².

html_text(paragraphs)[2] |> 
    str_remove_all('\\.mw-parser-output.+\\{.+\\}') |> 
    str_remove_all("\\[(\\d+|update|.)\\]") |> 
    str_remove_all("\\n") |> 
    str_remove_all('ⓘ') |> 
    str_replace_all('km2', 'km²')
## [1] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ), commonly shortened to BW or BaWü, is a German state (Land) in Southwest Germany, east of the Rhine, which forms the southern part of Germany's western border with France. With more than 11.07 million inhabitants as of 2019 across a total area of nearly 35,752 km² (13,804 sq mi), it is the third-largest German state by both area (behind Bavaria and Lower Saxony) and population (behind North Rhine-Westphalia and Bavaria). As a federated state, Baden-Württemberg is a partly-sovereign parliamentary republic. The largest city in Baden-Württemberg is the state capital of Stuttgart, followed by Mannheim and Karlsruhe. Other major cities are Freiburg im Breisgau, Heidelberg, Heilbronn, Pforzheim, Reutlingen, Tübingen, and Ulm."

Nice. Looks like we have a text cleaning recipe. Let’s wrap that into a function so that we can apply that to all state URLs.

get_state_description <- function(url) {
  paragraphs <- read_html(url) |> 
    html_elements('p') 
  
  
  paragraph_nmbr <- 2
  ## Earlier versions of this post had
  ## paragraph_nmbr <- 2 if (str_detect(url, 'Hesse')) 1 else 2
  ## That was due to the the relevant info being in the first paragraph for Hesse. 
  ## But that changed now so the code got easier.
  paragraphs[paragraph_nmbr] |> 
    html_text() |> 
    str_remove_all("\\[(\\d+|update|.)\\]") |> 
    str_remove_all('\\.mw-parser-output.+\\{.+\\}') |> 
    # str_remove_all("\\.mw-parser-output.*?\\{.*?\\}") |> 
    str_remove_all("\\n") |> 
    str_remove_all('ⓘ') |> 
    str_replace_all('km2', 'km²')
}

There’s a function. Time to iterate with it. Let’s collect all of the URLs for different states in a vector and then use it in conjunction with map_chr() to iterate over the vector.

state_urls <- c(
  'https://en.wikipedia.org/wiki/Baden-W%C3%BCrttemberg',
  'https://en.wikipedia.org/wiki/Bavaria',
  'https://en.wikipedia.org/wiki/Berlin',
  'https://en.wikipedia.org/wiki/Brandenburg',
  'https://en.wikipedia.org/wiki/Bremen',
  'https://en.wikipedia.org/wiki/Hamburg',
  'https://en.wikipedia.org/wiki/Hesse',
  'https://en.wikipedia.org/wiki/Mecklenburg-Vorpommern',
  'https://en.wikipedia.org/wiki/Lower_Saxony',
  'https://en.wikipedia.org/wiki/North_Rhine-Westphalia',
  'https://en.wikipedia.org/wiki/Rhineland-Palatinate',
  'https://en.wikipedia.org/wiki/Saarland',
  'https://en.wikipedia.org/wiki/Saxony',
  'https://en.wikipedia.org/wiki/Saxony-Anhalt',
  'https://en.wikipedia.org/wiki/Schleswig-Holstein',
  'https://en.wikipedia.org/wiki/Thuringia'
) 

state_descriptions <- state_urls |> 
  map_chr(get_state_description)

state_descriptions |> 
  glimpse()
##  chr [1:16] "Baden-Württemberg (/ˌbɑːdən ˈvɜːrtəmbɜːrɡ/ BAH-dən VURT-əm-burg;German: [ˌbaːdn̩ ˈvʏʁtəmbɛʁk] ), commonly shorte"| __truncated__ ...

Excellent! We have all of our data for our interactive table now (Hoooray!) Time to create the table then. But before we create the table, let me throw in a short announcement:


Data Cleaning Master Class

If you enjoyed this blog post so far, then I am sure you’re going to really like my Data Cleaning Master Class. It teaches you everything you need to know to clean up messy data sets in no time. It teaches way more techniques than the ones I have shown you already and it covers

  • a variety of file formats (including pesky JSON & Excel files),
  • text cleaning strategies (including the almighty regular expressions, i.e. RegEx) and
  • handling time and date data (which is always tedious when you don’t know the right functions.)

So if you want to transform messy data into insights quickly, then check out the course page today.


Turning the data into a table

Alright, let’s start building the table. For that, we first create a data set that doesn’t have the last row with the totals in it.

german_stats_wo_total <- german_stats |> 
  slice(-17) 
german_stats_wo_total
## # A tibble: 16 × 9
##    img          state capital most_populous_city federal_assembly_votes area_km2
##    <chr>        <chr> <chr>   <chr>                               <dbl>    <dbl>
##  1 https://upl… Bade… Stuttg… Stuttgart                               6    35748
##  2 https://upl… Baye… München München                                 6    70542
##  3 https://upl… Berl… —       —                                       4      891
##  4 https://upl… Bran… Potsdam Potsdam                                 4    29654
##  5 https://upl… Brem… Bremen  Bremen                                  3      420
##  6 https://upl… Hamb… —       —                                       3      755
##  7 https://upl… Hess… Wiesba… Frankfurt am Main                       5    21116
##  8 https://upl… Meck… Schwer… Rostock                                 3    23295
##  9 https://upl… Nied… Hannov… Hannover                                6    47710
## 10 https://upl… Nord… Düssel… Köln                                    6    34112
## 11 https://upl… Rhei… Mainz   Mainz                                   4    19858
## 12 https://upl… Saar… Saarbr… Saarbrücken                             3     2571
## 13 https://upl… Sach… Dresden Leipzig                                 4    18450
## 14 https://upl… Sach… Magdeb… Halle (Saale)                           4    20459
## 15 https://upl… Schl… Kiel    Kiel                                    4    15804
## 16 https://upl… Thür… Erfurt  Erfurt                                  4    16202
## # ℹ 3 more variables: pop_million <dbl>, pop_per_km2 <dbl>, gdp <list>

Basic Table

Then, we can pass that data to the reactable() function.

library(reactable)
german_stats_wo_total |> 
  reactable()

This doesn’t look nice at all. We have lots of tweaking to do. First, let’s get rid of the pagination. I want to have all rows visible in the table.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE
  )

Default column settings

Then, we can set the default settings for every column. By default we cant to have

  • the content left-aligned (horizontally) and center-aligned (vertically),
  • Column names in title case that use white space instead of _
  • Bold text as well as a thick border for the column names
german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      )
    )
  )

Then, we can add a footer to all columns by grabbing the last row of the german_stats data set. Remember? This data set has the totals in the last row. And while we’re at it, we can also set a bit of stylings for the footers.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    )
  )

These settings aren’t perfect and we have to modify them for individual columns. But it’s a nice baseline. Now we only have to go through every column one by one to apply custom changes.

Setting the image column

Let’s start with the img column. There, we want to take the urls and wrap them into <img> tags. Since the reactable output is HTML, this will cause our images to appear from their online source. To create the tags we will need the {htmltools} package.

Further, we’ll adjust the width of that column and get rid of the column name altogether. And just like in the default settings, we have to treat the footer of the table separately.

library(htmltools)
german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      )
    )
  )

Fix widths of name columns

For the columns state, capital and most_populous_city there’s not much to do. In these cases, we might only want do set the column width.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      ),
      
      state = colDef(width = 125),
      capital = colDef(width = 115),
      most_populous_city = colDef(width = 160)
    )
  )

Turn numbers into bubbles

Next, let us change the federal_assembly_votes column. Easy changes will once again involve the header name, the alignment and the width of the column. But that’s not the important part here.

What is important is the fact that we turn the numbers into bubbles. For that all we have to do is to load the {reactablefmtr} package and use its bubble_grid function inside the cell argument.

library(reactablefmtr)
## 
## Attaching package: 'reactablefmtr'
## The following object is masked from 'package:ggplot2':
## 
##     margin
main_color <- '#104E8B' # define a main color

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      ),
      
      state = colDef(width = 125),
      capital = colDef(width = 115),
      most_populous_city = colDef(width = 160),
      
      federal_assembly_votes = colDef(
        header = 'Bundesrat Votes',
        align = 'center',
        width = 140,
        cell = bubble_grid(
          german_stats_wo_total |> 
            select(federal_assembly_votes),
          min_value = 1,
          max_value = 10,
          colors = c(
            colorspace::lighten(main_color, 0.9),
            main_color
          )
        )
      )
      
    )
  )

Turn numbers into bar charts

Next, we’re going to do a very similar thing with the columns area_km2, pop_million, pop_per_km2. This time, though, we’re going to use the data_bars() function from the {reactablefmtr} package.

This will involve having to call data_bars() more or less three times with mostly the same arguments. Thus, I wrap the logic into a function to avoid a bit of code duplication.

bar_chart <- function(column) {
  formatting_function <- if (column == 'area_km2') {
    scales::label_number(
      suffix = ' km²',
      big.mark = ','
    )
  } else {
    scales::label_comma(
      accuracy = if (column == 'pop_per_km2') 1 else 0.01
    )
  }
  
  bar_max_value <- if (column %in% c('area_km2', 'pop_million')) {
    german_stats[[17, column]]
  } else {
    NULL
  }
  data_bars(
    data = german_stats_wo_total |> select(any_of(column)),
    text_position = 'above',
    bar_height = 20,
    number_fmt = formatting_function,
    fill_color = main_color,
    max_value = bar_max_value
  )
}

With that function we can modify the cell argument of the corresponding columns. And while we’re at it, we can also fix the width, alignment and footer of these columns.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      ),
      
      state = colDef(width = 125),
      capital = colDef(width = 115),
      most_populous_city = colDef(width = 160),
      
      federal_assembly_votes = colDef(
        header = 'Bundesrat Votes',
        align = 'center',
        width = 140,
        cell = bubble_grid(
          german_stats_wo_total |> 
            select(federal_assembly_votes),
          min_value = 1,
          max_value = 10,
          colors = c(
            colorspace::lighten(main_color, 0.9),
            main_color
          )
        )
      ),
      
      area_km2 = colDef(
        header = 'Area',
        align = 'left',
        cell = bar_chart('area_km2'),
        width = 110,
        footer = function(value) {
          german_stats$area_km2[17] |> 
            scales::label_number(
              suffix = ' km²',
              big.mark = ','
            )()
        }
      ),
      pop_million = colDef(
        header = 'in millions',
        cell = bar_chart('pop_million'),
        footer = german_stats$pop_million[[17]] |> 
          scales::comma(accuracy = 0.01),
        width = 110
      ),
      pop_per_km2 = colDef(
        header = 'per km²',
        cell = bar_chart('pop_per_km2'),
        width = 110
      )
      
      
      
    )
  )

Turn numbers into line charts

Sweet! We’re getting somewhere. We have bars and circles already. Time to add sparklines.

This time, I don’t want to use {reactablefmtr} for that. I want to use my own custom charts that I create with {ggplot2}. But there are two problems with that:

  1. {ggplot2} creates static image files that I would have to export and then reimport that into our tables using <img> tags again.
  2. {ggplot2} charts are not interactive and I want to have hover tool tips

Thankfully, the solution for both of these problems is one and the same thing. Namely, {ggiraph}. This package is my favorite way to create interactive charts with R. If you’ve never worked with that package, I have a blog post and YT video about that:

Anyway, let’s create a function that depends on a column’s values, row indices and name. We’re going to stick that function into the cell argument of the gdp column. In this function, we can start with a interactive chart without tooltips.

But we have to be careful. Since all charts are drawn separately for each row, we have to ensure that the axes limits are the same for all line charts. That’s where we’ll use scale_y_continuous().

library(ggiraph)

spark_line <- function(value, row_index, column_name) {
  years <- 2015:2022
  
  # Create static ggplot with `value` vector
  gg_plt <- ggplot(mapping = aes(x = years, y = value)) +
    geom_ribbon(
      aes(
        x = years[-2],
        y = NULL, 
        ymin = 100, 
        ymax = value[-2]
      ),
      alpha = 0.1,
      fill = main_color
    ) +
    geom_line(linewidth = 2, color = main_color) +
    geom_point(
      color = main_color,
      size = 25,
      na.rm = TRUE
    ) +
    scale_y_continuous(
      expand = expansion(mult = 0.1),
      limits = range(
        german_stats$gdp |> unlist(), 
        na.rm = TRUE
      )
    ) +
    theme_void()
  
  girafe(
    ggobj = gg_plt, 
    width_svg = 16,
    height = 9,
    options = list(
      opts_toolbar(saveaspng = FALSE)
    )
  )
}
# Example with first row
spark_line(german_stats$gdp[[1]], NA, NA)

So with that, we can define the column settings of the GDP columns and ensure that it uses this function. And as always, we have to treat the footer separately.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      ),
      
      state = colDef(width = 125),
      capital = colDef(width = 115),
      most_populous_city = colDef(width = 160),
      
      federal_assembly_votes = colDef(
        header = 'Bundesrat Votes',
        align = 'center',
        width = 140,
        cell = bubble_grid(
          german_stats_wo_total |> 
            select(federal_assembly_votes),
          min_value = 1,
          max_value = 10,
          colors = c(
            colorspace::lighten(main_color, 0.9),
            main_color
          )
        )
      ),
      
      area_km2 = colDef(
        header = 'Area',
        align = 'left',
        cell = bar_chart('area_km2'),
        width = 110,
        footer = function(value) {
          german_stats$area_km2[17] |> 
            scales::label_number(
              suffix = ' km²',
              big.mark = ','
            )()
        }
      ),
      pop_million = colDef(
        header = 'in millions',
        cell = bar_chart('pop_million'),
        footer = german_stats$pop_million[[17]] |> 
          scales::comma(accuracy = 0.01),
        width = 110
      ),
      pop_per_km2 = colDef(
        header = 'per km²',
        cell = bar_chart('pop_per_km2'),
        width = 110
      ),
      
      gdp = colDef(
        header = div(
          'GDP ',
          span(
            style = htmltools::css(
              font_size = '0.75rem'
            ),
            '(Index 2015 = 100)'
          )
        ),
        cell = spark_line,
        footer = \(x) spark_line(
          german_stats$gdp[[17]], NA, NA
        ),
        width = 160
      )
      
      
    )
  )

Add tooltips to spark lines

Now, to make the chart more interesting, we can compute the year-on-year change of the GPD (if possible) and show that in tooltips.

spark_line <- function(value, row_index, column_name) {
  years <- 2015:2022
  rel_diff <- value / lag(value) - 1
  texts <- if_else(
    is.na(rel_diff),
    '',
    scales::percent(rel_diff, accuracy = 0.01)
  )
  text_color <- if_else(
    rel_diff < 0, 
    '#BF1363', 
    '#06A77D'
  )
  
  compare_texts <- map(
    seq_along(texts),
    \(x) {
      if (!is.na(value[[x]])) {
        div(
          style = css(
            width = '250px',
          ),
          div(
            style = htmltools::css(
              font_weight = 600
            ),
            glue::glue(
              'GPD in {years[x]}:'
            )
          ),
          div(
            value[x],
            if (texts[[x]] != '') {
                span(
                  style = htmltools::css(
                    font_size = '1rem'
                  ),
                  '(',
                  span(
                    style = htmltools::css(
                      color = text_color[x],
                      font_weight = 600
                    ),
                    texts[x]
                  ),
                  span(
                    ', year-on-year)'
                  )
                )
            }
          )
        ) |> as.character()
      } else {
        ''
      }
    }
  )
  
  gg_plt <- ggplot(mapping = aes(x = years, y = value)) +
    geom_ribbon(
      aes(
        x = years[-2],
        y = NULL, ymin = 100, ymax = value[-2]),
      alpha = 0.1,
      fill = main_color
    ) +
    geom_line(linewidth = 2, color = main_color) +
    geom_point_interactive(
      aes(
        tooltip = compare_texts
      ),
      color = main_color,
      size = 25,
      na.rm = TRUE
    ) +
    scale_y_continuous(
      expand = expansion(mult = 0.1),
      limits = range(
        german_stats$gdp |> unlist(),
        na.rm = TRUE
      )
    ) +
    theme_void()
  
  girafe(
    ggobj = gg_plt, 
    width_svg = 16,
    height = 9,
    options = list(
      opts_toolbar(saveaspng = FALSE),
      opts_tooltip(
        opacity = 1,
        css = girafe_css(
          css = htmltools::css(
            color = 'black',
            background = 'white',
            padding = '15px',
            font_size = '1.25rem',
            font_family = '"Source Sans Pro", Courier',
            border = glue::glue('2px solid {main_color}'),
            border_radius = '5px'
          )
        )
      )
    )
  )
}
spark_line(german_stats$gdp[[1]], NA, NA)

And now we just have to re-execute the code from before.

Add column groups and nicer font

Phew, that was hard, wasn’t it? Let’s throw in something easy in between. You can think of it as a little break if you want.

Anyway, what we do now is to add a group label for the population columns. And we can add a nicer font.

german_stats_wo_total |> 
  reactable(
    pagination = FALSE,
    
    defaultColDef = colDef(
      vAlign = 'center',
      align = 'left',
      header = \(x) str_replace_all(x, '_', ' ') |> 
        str_to_title(),
      headerStyle = htmltools::css(
        font_weight = 600,
        border_bottom = '2px solid black'
      ),
      footer = function(values, col_name) {
        german_stats[[17, col_name]]
      },
      footerStyle = htmltools::css(
        font_weight = 600,
        border_top = '2px solid black'
      )
    ),
    
    columns = list(
      img = colDef(
        header = '',
        width = 65,
        cell = function(value) {
          tags$img(
            src = value,
            width = 50
          )
        },
        footer = function(value) {
          tags$img(
            src = german_stats$img[17],
            width = 50
          )
        }
      ),

      state = colDef(width = 125),
      capital = colDef(width = 115),
      most_populous_city = colDef(width = 160),
      
      federal_assembly_votes = colDef(
        header = 'Bundesrat Votes',
        align = 'center',
        width = 140,
        cell = bubble_grid(
          german_stats_wo_total |> 
            select(federal_assembly_votes),
          min_value = 1,
          max_value = 10,
          colors = c(
            colorspace::lighten(main_color, 0.9),
            main_color
          )
        )
      ),
      
      area_km2 = colDef(
        header = 'Area',
        align = 'left',
        cell = bar_chart('area_km2'),
        width = 110,
        footer = function(value) {
          german_stats$area_km2[17] |> 
            scales::label_number(
              suffix = ' km²',
              big.mark = ','
            )()
        }
      ),
      pop_million = colDef(
        header = 'in millions',
        cell = bar_chart('pop_million'),
        footer = german_stats$pop_million[[17]] |> 
          scales::comma(accuracy = 0.01),
        width = 110
      ),
      pop_per_km2 = colDef(
        header = 'per km²',
        cell = bar_chart('pop_per_km2'),
        width = 110
      ),
      
      gdp = colDef(
        header = div(
          'GDP ',
          span(
            style = htmltools::css(
              font_size = '0.75rem'
            ),
            '(Index 2015 = 100)'
          )
        ),
        cell = spark_line,
        footer = \(x) spark_line(
          german_stats$gdp[[17]], NA, NA
        ),
        width = 160
      )
    ),
    columnGroups = list(
      colGroup(
        name = 'Population',
        columns = c('pop_million', 'pop_per_km2')
      )
    ),
    style = htmltools::css(
      font_family = '"Source Sans Pro", sans-serif'
    )
  )