library(dplyr)
library(magrittr)
library(ggplot2)
library(pilot)
library(forcats)
Week 6: SOC 223
Preparation
Load libraries:
Load the data:
<- readr::read_csv('https://github.com/NicolasRestrep/223_course/raw/main/Data/tv_ratings.csv') tv
Create data frame for later use:
<- tv |>
comedies_dramas mutate(is_comedy = if_else(stringr::str_detect(genres, "Comedy"),
1,
0)) |>
filter(is_comedy == 1 | genres == "Drama") |>
mutate(genres = if_else(genres == "Drama",
"Drama",
"Comedy"))
Chapter 4:
Question 1:
<- tv |>
tv_long group_by(title) |>
summarise(num_seasons = n()) |>
ungroup() |>
left_join(tv, by = "title")
<- tv_long |>
tv_long filter(num_seasons >= 5)
It is difficult to make any conclusions because of how busy the plot is. A rough estimation would be that shows reach a critical peak within their first five seasons and slowly fall off from there.
|>
tv_long ggplot(aes(seasonNumber, av_rating)) +
geom_line(aes(group=title), alpha=0.15) +
geom_smooth(method = 'gam', color = '#684551', fill='#D5B0AC') +
scale_x_continuous(breaks=c(1, seq(5, 20, 5))) +
theme_pilot() +
labs(x = 'Season Number',
y = 'Average Rating')
Question 2:
40-something genres made the plot too cluttered on my screen to draw any real conclusions, so I opted to take just the first two listed genres for each show and facet the plot accordingly.
Crime and drama shows tend to last the longest, and the ratings generally remain constant over time, but shorter-running shows occasionally see jumps in rating towards the end of their run.
|>
tv_long group_by(titleId) |>
mutate(two_genres = toString(head(strsplit(genres, ',')[[1]], 2))) |>
ggplot(aes(seasonNumber, av_rating)) +
geom_line(aes(group=title), alpha=0.25, color='#684551') +
scale_x_continuous(breaks=c(1, seq(5, 20, 5))) +
facet_wrap(~ two_genres) +
theme_pilot(
facet_title_size = 7,
axis_text_size = 7,
title_size = 14
+
) theme(plot.title.position = 'plot') +
labs(x = 'Season Number',
y = 'Average Rating',
title = 'Long-running action and crime shows age well over time',
subtitle = 'Average ratings for TV shows that aired for five or more seasons',
caption = 'Data from IMDB')
There is only one show that falls under the Drama, Family, and Friends category: Are You Afraid of the Dark?
|>
tv_long filter(genres == 'Drama,Family,Fantasy') |>
::pluck(1) |>
purrrunique()
[1] "Are You Afraid of the Dark?"
Question 3:
Keeping with the theme of the last question, I am restricting this to including just the first two listed genres. 40-plus categories is just a few too many.
coord_flip
inverts the axes. The genre with the top-rated shows is Crime and Drama.
|>
tv filter(av_rating >= 9) |>
group_by(titleId) |>
mutate(two_genres = toString(head(strsplit(genres, ',')[[1]], 2))) |>
ggplot(aes(fct_rev(fct_infreq(two_genres)))) +
geom_bar(fill = '#684551') +
coord_flip() +
theme_pilot(title_size = 14) +
theme(plot.title.position = 'plot') +
labs(x = '',
y = 'Number of Shows',
title = 'Crime and drama shows are critically successful',
subtitle = 'Number of shows by genre that achieved an average IMBD rating of nine or higher',
caption = 'Data by IMBD')
Question 4:
Drama shows are not necessarily rated higher than comedy ones, the mean between the two are roughly the same, but drama show ratings tend to be clustered around the same rating. This might lead to the perception that dramas are rated higher, as fewer are ‘busts’ or ‘hits.’
|>
comedies_dramas ggplot(aes(av_rating, fill=genres)) +
geom_density(alpha=0.6, size=0.85) +
scale_fill_manual(values=c('#7678ED', '#3D348B')) +
theme_pilot() +
theme(plot.title.position = 'plot',
legend.position = c(0.15, 0.8)) +
labs(x='Average Rating',
y = 'Kernal Density Estimate',
fill = 'Series Genre',
title = 'Drama shows are more critically stable than comedy ones',
subtitle = 'Density plot of IMBD rated-television shows that are classified as dramas or comedies',
caption = 'Data from IMBD')
Question 5:
Histograms show us the number of total observations in the data set, while the density plot was not giving us that information. Without seeing this histogram or the data, the reader might assume that the number of shows were around the same.
|>
comedies_dramas ggplot(aes(av_rating, fill=genres)) +
geom_histogram() +
scale_fill_manual(values=c('#7678ED', '#3D348B')) +
theme_pilot()
This is similar to geom_density
and geom_histogram
in that is shows the total observations but is smoothed over (unlike histogram).
I would probably prefer to use geom_freqpoly
. Both the histogram and density plot have their uses, but I think that geom_freqpoly
is a good mix of the two’s biggest advantages. Showing the count is important.
|>
comedies_dramas ggplot(aes(av_rating, color=genres)) +
geom_freqpoly() +
scale_color_manual(values=c('#7678ED', '#3D348B')) +
theme_pilot() +
theme(plot.title.position = 'plot',
legend.position = c(0.13, 0.75)) +
labs(x='Average Rating',
y = 'Total Shows',
color = 'Series Genre',
title = 'Drama shows are more critically stable than comedy ones',
subtitle = 'Frequency plot of IMBD rated-television shows that are classified as dramas or comedies',
caption = 'Data from IMBD')
Question 6:
A heatmap is good when you want to fill by a numeric variable that should be on a gradient. In this particular example, we are able to ‘bin off’ areas of the plot to see the number of shows that fell under each ‘bin’ (viewership and share combination).
|>
comedies_dramas ggplot(aes(av_rating, share)) +
geom_bin_2d() +
facet_wrap(~genres) +
theme_pilot() +
scale_fill_gradient(low='#342C77', high='#A5A6F3',
breaks=seq(0,80,80),labels=c("Fewer Shows","More Shows"),
limits = c(0,80)) +
theme(
legend.position = 'top',
legend.key.width = unit(0.75, 'cm'),
legend.key.height = unit(0.25, 'cm'),
plot.title = element_text(hjust=0.5),
plot.subtitle = element_text(hjust=0.5),
strip.text = ggtext::element_textbox(
size = 10,
color = "white", fill = "#3D348B", box.color = "#4A618C",
halign = 0.5, linetype = 1, r = unit(5, "pt"), width = unit(1, "npc"),
padding = margin(2, 0, 1, 0), margin = margin(3, 3, 3, 3)
)+
) labs(x='Average Rating',
y = 'Share of Viewership',
fill = '',
title = 'Critically successful comedy shows get little viewers',
subtitle = 'Heatmap of viewership and IMBD ratings for comedy and drama shows',
caption='Data from IMBD')
The outlying drama show was the first season of Dekalog in 1990.
|>
comedies_dramas slice(which.max(share)) |>
::pluck(3) purrr
[1] "Dekalog"
Chapter 5:
Preparation
Load the data:
<- readr::read_csv('https://github.com/NicolasRestrep/223_course/raw/main/Data/wncaa.csv') ncaa
Create the ‘winners’ data frame:
<- ncaa |> filter(tourney_finish == 'Champ') %$% {unique(.$school)}
champ_names <- ncaa |>
winners filter(school %in% champ_names) |>
group_by(school) |>
mutate(champ = sum(tourney_finish == 'Champ')) |>
ungroup() |>
mutate(school = case_when(
== 'Southern California' ~ 'USC',
school .default = school
|>
)) left_join(cfbplotR::logo_ref |> select(school, color))
Question 1
|>
ncaa filter(tourney_finish == 'Champ') |>
count(school) |>
# change Southern Cal. to USC
mutate(prop = n / sum(n),
label = ifelse(n == max(n), paste0(n, ' championships won '), paste0(n, ' ')),
school = case_when(
== 'Southern California' ~ 'USC',
school .default = school
|>
)) left_join(cfbplotR::logo_ref |> select(school, color)) |>
ggplot(aes(prop, reorder(school, prop), fill = color)) +
geom_bar(stat='identity') +
geom_text(aes(label=label),
hjust = 1,
fontface = "bold",
family='Avenir Next',
color = "white",
size = 3.5) +
scale_x_continuous(labels = scales::label_percent()) +
scale_fill_identity() +
theme_pilot() +
theme(plot.title.position = 'plot') +
labs(y='',
x = 'Percentage of tournaments won',
title = "UConn and Tennessee dominate women's basketball",
subtitle = 'Proportion of NCAA tournaments won by program (1982-2018)')
Question 2:
|>
winners ggplot(aes(seed, reorder(school, champ), fill=color)) +
geom_boxplot(alpha = .5, size = 0.5, outlier.size = 2) +
scale_fill_identity() +
scale_x_continuous(breaks=c(1, seq(5, 15, 5), 15),
limits = c(1, 15)) +
theme_pilot() +
theme(plot.title.position = 'plot') +
labs(x = 'Seed',
y = '',
title = 'UConn and Tennessee dominate the top seed line',
subtitle = "Distribution of NCAA tournament seeds for previous women's champions (1982-2018). \nTeams are ordered by number of championships won.")
geom_violin
is one of the many fun alternatives to box plots, but I generally prefer something from the {ggdist}
package. For this case where there are not too many observations per team, I would lean towards using the above box plot.
|>
winners ggplot(aes(seed, reorder(school, champ), fill=color)) +
geom_violin(size = 0.6, bw = .2) +
scale_fill_identity() +
scale_x_continuous(breaks=c(1, seq(5, 15, 5), 15),
limits = c(1, 15)) +
theme_pilot() +
theme(plot.title.position = 'plot') +
labs(x = 'Seed',
y = '',
title = 'UConn and Tennessee dominate the top seed line',
subtitle = "Distribution of NCAA tournament seeds for previous women's champions (1982-2018). \nTeams are ordered by number of championships won.")
Question 3:
geom_point
does a poor job at handling clumped data // overplotting. This can be slightly fixed by jittering the points with something like geom_quasirandom
, but even then, it is not nearly as good as a boxplot at showing clumped distribution using one numeric variable.
And at that, plotting one integer exacerbates this issue because an excess of one value might lead to that value ‘bleeding over’ into space where one might expect the next integer to be graphed.
|>
winners ggplot(aes(seed, reorder(school, champ), color=color)) +
::geom_quasirandom(alpha = 0.5) +
ggbeeswarmscale_color_identity() +
scale_x_continuous(breaks=c(1, seq(5, 15, 5), 15),
limits = c(1, 15)) +
theme_pilot() +
theme(plot.title.position = 'plot') +
labs(x = 'Seed',
y = '',
title = 'UConn and Tennessee dominate the top seed line',
subtitle = "Distribution of NCAA tournament seeds for previous women's champions (1982-2018). \nTeams are ordered by number of championships won.")
Question 4:
summarize_if
is superseeded in favor of across
:
<- winners |>
sum_across group_by(school) |>
mutate(champs = n()) |>
summarize(across(where(is.numeric), list(mean=~mean(.x), sd=~sd(.x)))) |>
left_join(winners |> select(school, champ, color)) |>
unique()
So, I added a geom_point
layer on top of geom_pointrange
, which effectively treats the latter as geom_linerange
, as to have access to the size
parameter to scale the observations by number of championships won – a variable I created above. The size
parameter passed through to geom_pointrange
, instead, sizes both the line and the point.
|>
sum_across ggplot(aes(reg_percent_mean, reorder(school, reg_percent_mean))) +
geom_point(aes(size=champs_mean, color = color)) +
geom_pointrange(aes(xmin = reg_percent_mean - reg_percent_sd,
xmax = reg_percent_mean + reg_percent_sd,
color = color)) +
scale_color_identity() +
scale_x_continuous(labels=scales::label_percent(scale=1)) +
theme_pilot(title_size = 12) +
theme(plot.title.position = 'plot',
legend.position = 'none') +
labs(x = 'Average Reg. Season Win Percentage',
y='',
title = "The Huskies are as successful in the reg. season as they are in the post season",
subtitle = "Average regular season win percentage for national championship-winning Division 1 \nwomen's basketball programs from 1982 to 2018. Points sized by titles won.")
The average win percentages are relatively consistent, and when you consider the number of games played in the regular season, the difference in win percentage would likely boil down to just a game or two.
As I mentioned above, adding a geom_point
layer effectively treats geom_pointrange
as geom_linerange
, so just replacing the former with the latter would be how to create the above plot with geom_linerange
.
Question 5:
|>
winners ggplot(aes(reg_percent, full_percent)) +
geom_point() +
geom_abline()
We notice that many points are below the expected line. This lends credence to the idea that a 45-degree prediction line is not a great fit. This is to be expected as the general consensus is that a single-elimination tournament, as is the NCAA tournament, is not a great predictor of true team strength – slightly less-so for the women’s side of basketball but still stands. There is also the confounding variable of schedule strength.
Question 6:
|>
winners mutate(is_champ = if_else(tourney_finish == "Champ", 1, 0),
is_champ = as.factor(is_champ)) |>
ggplot(aes(reg_percent, full_percent)) +
geom_point(aes(color=is_champ)) +
geom_abline()
As to be expected, as champions have inherently ‘overperformed,’ a majority of the observations that lie above the predictor line were championship seasons.
Coding the is_champ
variable as a factor allows ggplot2
to treat the fill aesthetic as a discrete one, not a continuous one. Coding is_champ
as a logical (true/false) or as a character (‘1’, ‘0’; ‘yes’, ‘no’; etc.) would remove the necessity for factorizing is_champ
and remove one line of code.
Question 7:
<- winners %>%
winners mutate(is_champ = if_else(tourney_finish == "Champ", TRUE, FALSE),
difference = full_percent - reg_percent)
|>
winners mutate(
case_when(
== TRUE & reg_percent == min(subset(winners, is_champ == TRUE)$reg_percent) ~ tibble(label = paste0(school, '; ', year), hjust=1.45, vjust=-1.5),
is_champ == min(winners$reg_percent) ~ tibble(label = paste0(school, '; ', year), hjust=-0.5, vjust=0.0)
reg_percent |>
)) ggplot(aes(reg_percent, full_percent, label=label, hjust=hjust, vjust=vjust)) +
geom_smooth(method = 'gam', se = FALSE, color='#0B3142', alpha=0.7) +
geom_point(aes(color=is_champ), alpha=0.7, size=1.8) +
annotate(geom='curve', x=70, y=75, xend=65, yend=85, curvature=0.5, arrow=arrow(length=unit(.2, 'cm'))) +
annotate(geom='curve', x=48, y=45, xend=53, yend=46, arrow=arrow(length=unit(.2, 'cm'))) +
::geom_richtext() +
ggtextscale_x_continuous(labels=scales::label_percent(scale=1)) +
scale_y_continuous(labels=scales::label_percent(scale=1)) +
scale_color_manual(values=c('#81667A', '#048A81')) +
theme_pilot() +
theme(legend.position = 'none',
plot.title = ggtext::element_markdown(size=13),
plot.title.position = 'plot') +
labs(x = 'Reg. Season Win Percentage',
y = 'Final Win Percentage',
title = "<span style='color: #048A81;'>Tournament Champions<span style='color: black;'> vs. <span style='color: #81667A;'>Tournament Losers:<span style='color: black;'> Season Win Percentage",
subtitle = "A look at regular season and total win percentage for women's basketball teams who \nhave won at least one championship from 1982 to 2018")
No, I don’t see anything new that jumps out since the last question.
Question 8:
Surprising to no one who is familiar with women’s basketball, UConn has gone unbeaten six times.
|>
winners filter(full_percent == 100) |>
select(school, full_w, year)
# A tibble: 8 × 3
school full_w year
<chr> <dbl> <dbl>
1 Texas 34 1986
2 UConn 35 1995
3 UConn 39 2002
4 UConn 39 2009
5 UConn 39 2010
6 Baylor 40 2012
7 UConn 40 2014
8 UConn 38 2016