Week 2: SOC 223

Author

Andrew Weatherman

Preparation:

Load the data:

olympics <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv")
dplyr::glimpse(olympics)

Rows: 271,116
Columns: 15
$ id     <dbl> 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, …
$ name   <chr> "A Dijiang", "A Lamusi", "Gunnar Nielsen Aaby", "Edgar Lindenau…
$ sex    <chr> "M", "M", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", "M"…
$ age    <dbl> 24, 23, 24, 34, 21, 21, 25, 25, 27, 27, 31, 31, 31, 31, 33, 33,…
$ height <dbl> 180, 170, NA, NA, 185, 185, 185, 185, 185, 185, 188, 188, 188, …
$ weight <dbl> 80, 60, NA, NA, 82, 82, 82, 82, 82, 82, 75, 75, 75, 75, 75, 75,…
$ team   <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands", "…
$ noc    <chr> "CHN", "CHN", "DEN", "DEN", "NED", "NED", "NED", "NED", "NED", …
$ games  <chr> "1992 Summer", "2012 Summer", "1920 Summer", "1900 Summer", "19…
$ year   <dbl> 1992, 2012, 1920, 1900, 1988, 1988, 1992, 1992, 1994, 1994, 199…
$ season <chr> "Summer", "Summer", "Summer", "Summer", "Winter", "Winter", "Wi…
$ city   <chr> "Barcelona", "London", "Antwerpen", "Paris", "Calgary", "Calgar…
$ sport  <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Speed Skating"…
$ event  <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight", …
$ medal  <chr> NA, NA, NA, "Gold", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Define global color variable for plotting:

theme_color <- '#087E8B'
off_black <- '#272932'

Question 1:

gold_medals <- olympics |> dplyr::filter(medal == "Gold")

Question 2:

A common solution to high-density, messy graphs in ggplot2 is to adjust the alpha level. Another, and one that I explore below in tandem with the former, is to clarify your research question. Tracking the age of all Olympic gold medalists with a single plot might be too audacious an ask – not to mention it opens the door for a myriad of confounding variables. Instead, focusing on a subset of events, countries, or years might reveal a more useful trend.

library(dplyr)
library(ggplot2)
library(ggbeeswarm)
library(pilot)

gold_medals |>
  filter(between(year, 1952, 2016) & season == "Summer") |>
  group_by(year) |>
  mutate(avg = mean(age, na.rm = TRUE)) |>
  ggplot() +
  geom_quasirandom(aes(year, age), alpha = 0.08, color = theme_color) +
  geom_line(aes(year, avg), size = 1.2) +
  scale_x_continuous(breaks = seq(1950, 2020, 10)) +
  theme_pilot() +
  labs(
    title = "Olympic gold medalists aren't getting younger",
    subtitle = "Average age of summer gold medalists from 1952-2016",
    x = "Olympic Games Year",
    y = "Age of Medalist",
    caption = "Visualization by @andreweatherman"
  )

Question 3:

us_medals <- gold_medals |>
  filter(noc == "USA") |>
  group_by(year) |>
  summarise(num_medals = n())

The United States’ most successful year:

us_medals |>
  dplyr::slice_max(num_medals)

# A tibble: 1 × 2
   year num_medals
  <dbl>      <int>
1  1984        190

The variance in gold medals won appears to grow towards the end of the data because the United States became more successful in summer events and less so in winter ones.*

* This assumes that the number of summer and winter events remain relatively constant throughout the data. I am not sure if this is true.

Question 4:

events <- c(
  "Gymnastics Men's Individual All-Around",
  "Gymnastics Women's Individual All-Around"
)

gymnastics <- gold_medals |>
  dplyr::filter(event %in% events)

Histogram:

library(ggplot2)
library(pilot)

gymnastics |>
  ggplot(aes(age)) +
  geom_histogram(binwidth = 2, fill = theme_color, color = off_black, alpha = 0.5) +
  geom_density(aes(y = 2 * ..count..), size = 1.2, color = off_black) +
  geom_vline(aes(xintercept = mean(age)), linetype = "dashed", size = 1.2, color = off_black) +
  theme_pilot() +
  labs(y = "count")

This distribution is roughly symmetric and bimodal with a center at ~24 years old and no visible outliers. For questions like these on a homework, I generally leave out titles as the shape is all that we are interested in and the plot is merely a vehicle to that answer. If you want titles, I can start to add those in.

library(dplyr)
library(ggplot2)
library(ggdensity)
library(pilot)

gymnastics |>
  # theme_pilot() fails with labeller in facet_wrap, idk why
  mutate(title = case_when(
    sex == "F" ~ "Women",
    TRUE ~ "Men"
  )) |>
  ggplot(aes(age, fill = sex)) +
  geom_histogram(binwidth = 2, color = off_black) +
  scale_fill_manual(values = c("#9E90A2", "#4D7EA8")) +
  facet_wrap(~title) +
  theme_pilot() +
  theme(legend.position = "none",
        strip.text = element_text(size=10)) +
  labs(
    title = "The age gap in elite gymnastics",
    subtitle = "Ages of Olympic gold-winning all-around gymnasts",
    x = "Age",
    y = "Medalists"
  )

Female gold-winning all-around gymnasts are more likely to be younger than their male counterparts.

Question 5:

Box plots are boring:

events <- c(
  "Gymnastics Men's Individual All-Around",
  "Gymnastics Women's Individual All-Around",
  "Athletics Women's 100 metres",
  "Athletics Men's 100 metres"
)

two_events <- gold_medals |>
  dplyr::filter(event %in% events)

library(ggplot2)
library(ggdist)
library(paletteer)
library(pilot)

x_names <- c('Men 100M', 'Wom. 100M', 'Men All-Around', 'Wom. All-Around')
y_names <- c(seq(150, 180, 10), '190 cm.')

two_events |>
  ggplot(aes(event, height, fill=event)) +
  stat_gradientinterval(.width = 1, point_size = 2, adjust = .2, alpha=0.7) +
  scale_x_discrete(labels = x_names) +
  scale_y_continuous(labels = y_names, breaks=seq(150, 190, 10)) +
  scale_fill_paletteer_d(`"nationalparkcolors::Badlands"`) +
  theme_pilot() +
  theme(legend.position='none',
        panel.grid.major.x = element_blank()) +
  labs(x = '',
       y='',
       title = "Most elite gymnasts are oddly the same height",
       subtitle = 'Height distributions of gold-winning athletes in select Summer Olympic events \nback to 1896')

Question 6:

library(dplyr)

us_medalists <- gold_medals |> 
  # keep only summer games
  filter(noc == "USA" & season == 'Summer') |>
  mutate(decade = paste0(year - year %% 10, 's')) |> 
  filter(decade >= 1960) |> 
  group_by(year, decade, sex) |> 
  summarize(medals = n())

library(ggplot2)
library(pilot)
library(ggtext)

us_medalists |> 
  ggplot(aes(medals, as.factor(year), fill=sex)) +
  geom_bar(position='dodge', stat='identity', size=0.4, color='black') +
  scale_fill_manual(values = c("#9E90A2", "#4D7EA8")) +
  facet_wrap(~decade, scales = 'free_y') +
  theme_pilot() +
  theme(legend.position = 'none',
        axis.text = element_text(size = 8),
        plot.title = element_markdown(size=14)) + 
  labs(y='',
       x = '',
       title = "<span style = 'color: #9E90A2; font-weight: bold;'>Female olympians</span> are closing the American medal gap",
       subtitle = 'Gold medals won by American Olympians in Summer games since 1960')

Female Olympians appear to be narrowing the medal gap in recent Summer games.