lab2_code

Github repo: https://github.com/wanjiag/EDLD652/tree/main/lab2

1. Start with the google trends data.

# Reshape to long form: one row per date/hurricane pair. Every column except
# `date` is stacked; the former column names land in `hurricane` and the values
# in `score`. `pivot_longer()` is used because it has superseded `gather()`.
google_trends_long <- google_trends %>%
  pivot_longer(-date, names_to = "hurricane", values_to = "score")

# `pivot_longer()` can also pull out just the hurricane name while reshaping:
# `names_pattern` keeps only the text captured between the underscores of each
# column name. This variant is printed for illustration and is not stored.
google_trends %>%
  pivot_longer(
    starts_with("hurricane"),
    names_to = "hurricane",
    values_to = "interest",
    names_pattern = "_(.+)_"
  )

## # A tibble: 148 x 3
##    date       hurricane interest
##    <date>     <chr>        <int>
##  1 2017-08-20 harvey           0
##  2 2017-08-20 irma             0
##  3 2017-08-20 maria            0
##  4 2017-08-20 jose             0
##  5 2017-08-21 harvey           0
##  6 2017-08-21 irma             0
##  7 2017-08-21 maria            0
##  8 2017-08-21 jose             0
##  9 2017-08-22 harvey           0
## 10 2017-08-22 irma             0
## # … with 138 more rows

Visualize the change in trends for each hurricane over time in one plot with three scales. Map color to a discrete scale.

# Line chart: one line per hurricane (discrete color scale), score over time.
p1 <- google_trends_long %>%
  ggplot(aes(date, score, color = hurricane)) +
  geom_line()
p1

# Careful about code styling, otherwise it looks great!

Visualize the trends again, this time with solid shapes (if your first plot was already solid shapes, use lines).

# Solid-shape version: a semi-transparent ribbon per hurricane spanning from 0
# up to its score, outlined in white.
p2 <- google_trends_long %>%
  ggplot(aes(date, score, fill = hurricane)) +
  geom_ribbon(
    aes(ymin = 0, ymax = score),
    color = "white",
    alpha = 0.5
  )
p2

Visualize the same relation, but change the scales so color is mapped to a continuous scale.

# Heat-map version: one row of tiles per hurricane, with score mapped to a
# continuous fill. `scale_fill_viridis_c()` ships with ggplot2, so no namespace
# prefix is needed.
p3 <- google_trends_long %>%
  ggplot(aes(date, hurricane, fill = score)) +
  geom_tile() +
  coord_fixed() +
  scale_fill_viridis_c()
p3

Create a data frame (using tibble() or data.frame()) that includes the dates that each hurricane made landfall.

Make any final modifications to the plot you’d like to communicate the information as effectively as possible

# Landfall date per hurricane. Jose's date is NA in this table, so no marker
# is drawn for it.
landfill <- tibble(
  hurricane = c("Harvey", "Irma", "Jose", "Maria"),
  date = as.Date(c("2017-08-25", "2017-09-10", NA, "2017-09-20"))
)

# Vertical position of the landfall labels: 10 units above the highest score.
# Computed once here so no `$` lookup is needed inside aes(). It is still
# mapped inside aes() below so that the y scale expands to include the labels.
label_y <- max(google_trends_long$score, na.rm = TRUE) + 10

# Annotate the ribbon plot with a grey line and label at each landfall date.
# `na.rm = TRUE` silently drops the row with the missing date instead of
# emitting a missing-value warning from each annotation layer.
p4 <- p2 +
  geom_vline(
    aes(xintercept = date),
    data = landfill,
    color = "darkgrey",
    na.rm = TRUE
  ) +
  scale_fill_discrete(
    name = "Hurricane",
    labels = c("Harvey", "Irma", "Jose", "Maria")
  ) +
  geom_label(
    aes(x = date, y = label_y, label = hurricane),
    data = landfill,
    fill = "darkgrey",
    colour = "white",
    show.legend = FALSE,
    na.rm = TRUE
  ) +
  theme_minimal() +
  theme(legend.position = "top")

p4

2. Replicate the “National cable news networks” plot from the story using the tv_states data. Don’t worry about all the labels. Try to match the colors but don’t worry about getting them exact.

# Reshape to long form with `pivot_longer()` (which supersedes `gather()`) and
# set the factor order of `location` inside the pipeline with `mutate()`, so
# the levels line up with the manual palette below.
tv_states_long <- tv_states %>%
  pivot_longer(-date, names_to = "location", values_to = "score") %>%
  mutate(
    location = factor(location, levels = c("florida", "texas", "puerto_rico"))
  )

# Events marked with dotted vertical lines and a label near the top of the plot.
news_data <- tibble(
  news = c(
    "Harvey\nlandfall", "Irma\nlandfall",
    "Maria\nlandfall", "Las Vegas\nshooting"
  ),
  date = as.Date(c("2017-08-25", "2017-09-10", "2017-09-20", "2017-10-01"))
)

# Hand-placed text annotations naming each filled area (used instead of a
# legend, which is switched off below).
location_text <- tibble(
  location = c("Texas", "Florida", "Puerto Rico"),
  x = as.Date(c("2017-08-29", "2017-09-10", "2017-10-01")),
  y = c(0.9, 1.3, 1.3)
)

# Fill colors, in the order of the `location` factor levels.
color_platte <- c("#FC5185", "#ED713A", "#3FC1C9")

# Layers are added in drawing order: ribbons, event lines, event labels,
# location text. Columns are referenced by bare name inside aes(); `$` is
# never needed there.
ggplot(tv_states_long, aes(x = date, y = score, fill = location)) +
  geom_ribbon(aes(ymin = 0, ymax = score), color = "white") +
  geom_vline(
    aes(xintercept = date),
    data = news_data,
    color = "lightgrey",
    linetype = "dotted"
  ) +
  geom_label(
    aes(x = date, y = 3.8, label = news),
    data = news_data,
    inherit.aes = FALSE,
    color = "grey80",
    label.size = NA
  ) +
  geom_text(
    aes(x, y, label = location),
    data = location_text,
    inherit.aes = FALSE,
    size = 4.5
  ) +
  scale_fill_manual(values = color_platte) +
  ylim(0, 4) +
  labs(
    y = "Share of sentences",
    title = "National cable news networks",
    caption = "Includes Bloomberg, CNBC, CNN, Fox Business, Fox News and MSNBC.\nFiveThirtyEight"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.x = element_blank(),
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0)
  )

3. Use the comic_characters dataset to visualize the proportion of different genders using at least three different methods. With your final visualization, make any additional modifications you think would help reduce the cognitive load necessary to comprehend the figure.

A pie chart that shows the proportion of each gender.

# Share of characters in each `sex` category. `count()` needs no prior
# `group_by()`, and inside `mutate()` the column `n` can be referenced
# directly. Note that `percentage` holds a proportion (n divided by the total).
gender_prop <- comic_characters %>%
  count(sex) %>%
  mutate(percentage = n / sum(n))

# Pie chart: a single stacked bar of the pre-computed shares, wrapped into
# polar coordinates along y. `geom_col()` plots the values as given.
ggplot(gender_prop, aes(x = "", y = percentage, fill = sex)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  theme_void()

The first appearance of characters over time, separated by gender.

# Number of rows in `comic_characters` for each sex/year combination.
gender_prop_year <- count(comic_characters, sex, year)

# One line per sex showing the yearly count.
gender_prop_year %>%
  ggplot(aes(year, n, group = sex, color = sex)) +
  geom_line()

Cumulative distribution (proportion) of each gender. It’s interesting to see that female and NA characters appear at a similar rate within their own categories. However, transgender and genderfluid characters are very limited, which produces the big steps.

# One row per sex/date combination present in the data.
gender_prop_month <- comic_characters %>%
  count(sex, date)

# Empirical cumulative distribution of `date`, one step curve per sex.
# `stat_ecdf()` computes the plotted y value itself from the distribution of
# x, so the count column `n` is not mapped to y (mapping it conflicts with the
# computed value and labels the axis "n"); the axis is labelled explicitly.
# NOTE(review): because the input is pre-counted, each distinct sex/date row
# contributes once regardless of `n` -- confirm this per-date weighting is
# intended rather than one observation per character.
ggplot(gender_prop_month, aes(x = date, color = sex)) +
  stat_ecdf(geom = "step") +
  labs(y = "Cumulative proportion")

# ooh... I like this one

final figure

# Yearly counts per sex. Labels are shortened to their first word (everything
# from the first space onward is dropped) and ordered as an explicit factor.
gender_prop_year <- comic_characters %>%
  count(sex, year) %>%
  mutate(
    sex = factor(
      sub(" .*", "", sex),
      levels = c(
        "Male", "Female", "Agender",
        "Genderless", "Genderfluid", "Transgender"
      )
    )
  )

# Split into the three subsets that are drawn differently below: missing sex
# (dotted line), the four groups drawn as ribbons, and the two groups drawn as
# individual points.
gender_prop_year_na <- gender_prop_year %>%
  filter(is.na(sex))

gender_prop_year_no_na <- gender_prop_year %>%
  filter(sex %in% c("Male", "Female", "Agender", "Genderless")) %>%
  mutate(
    sex = factor(sex, levels = c("Male", "Female", "Agender", "Genderless"))
  )

gender_prop_year_no_na_others <- gender_prop_year %>%
  filter(sex %in% c("Genderfluid", "Transgender"))

# Label positions for the ribbon groups: start from each group's peak count at
# x = 1994, then apply hand-tuned offsets. The adjustments are keyed on `sex`
# rather than on row/column indices so they do not depend on row order.
text <- gender_prop_year_no_na %>%
  group_by(sex) %>%
  summarise(max = as.numeric(max(n))) %>%
  mutate(
    max = case_when(
      sex == "Male" ~ max - 325,
      sex == "Female" ~ max - 125,
      sex == "Agender" ~ 5,
      TRUE ~ max
    ),
    x = if_else(sex == "Genderless", 1979, 1994)
  )

# Layers are added in the same drawing order as before (ribbons, labels,
# dotted NA line, points); this order also determines how the fill scale is
# trained across layers.
ggplot(gender_prop_year_no_na, aes(x = year, y = n, fill = sex)) +
  geom_ribbon(aes(ymin = 0, ymax = n)) +
  geom_label(
    aes(x = x, y = max, label = sex, fill = sex),
    data = text,
    color = "white",
    label.size = NA
  ) +
  geom_line(data = gender_prop_year_na, color = "darkgrey", linetype = "dotted") +
  geom_point(
    aes(color = sex),
    data = gender_prop_year_no_na_others,
    shape = 8,
    size = 3
  ) +
  scale_y_continuous(trans = "log2") +
  scale_color_manual(
    values = c("springgreen", "gold"),
    name = "Least represented groups"
  ) +
  # The viridis scales ship with ggplot2, so the viridis package is not needed.
  scale_fill_viridis_d(option = "A", name = "More than 2 in total") +
  labs(
    y = "Number of Characters Introduced",
    x = "Year",
    caption = "Grey dotted line indicates N.A."
  ) +
  # "none" is the supported way to hide a guide; `FALSE` is deprecated.
  guides(fill = "none") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Pretty!

lab2_code

Wanjia Guo

2/1/2021

1. Start with the google trends data.

2. Replicate the “National cable news networks” plot from the story using the tv_states data. Don’t worry about all the labels. Try to match the colors but don’t worry about getting them exact.

3. Use the comic_characters dataset to visualize the proportion of different genders using at least three different methods. With your final visualization, make any additional modifications you think would help reduce the cognitive load necessary to comprehend the figure.