library(tidyverse)
library(rio)
library(here)
library(ggridges)
library(ungeviz)
library(colorspace)
library(ggtext)
library(gt)
# uploading transit_cost
transit_cost <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

#I made this daniel anderson theme so that I could use less code on my plots
da_theme <- theme_minimal(base_size = 13) +
  theme(panel.background = element_rect(fill = "transparent", colour = NA),
        plot.background = element_rect(fill = "transparent", colour = NA),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_line(color = "gray90"),
        plot.title.position = "plot")

country_codes <- countrycode::codelist %>% 
  select(country_name = country.name.en, country = ecb)

Problem 1

# cleaning and transforming the data

figure_1 <- transit_cost %>%
  mutate(real_cost = as.numeric(real_cost)) %>% 
  group_by(country) %>% 
  summarize(average_transit_cost = 
              mean(real_cost,na.rm = TRUE),
            n = n(),
            se = sd(real_cost, na.rm = TRUE)/sqrt(n)) %>% 
  left_join(country_codes) %>% 
  filter(average_transit_cost != "NaN") %>% 
  filter(country != "UK") # it looks like you didn't use UK in your plot so I took it out too

#if I wanted to keep in the UK I would need to give it a name
# it didn't match because it is in country codes as "GB" 
#figure_1[53, "country_name"] <- "United Kingdom"
#I made this function to make my code more readable (for the next problem but I'm using it here too.) It calculates the left or right confidence interval bounds for average transit cost for a given probability (e.g., for a 95% ci, left = ci(.025), right = ci(.975))

ci <- function(p){
  figure_1$average_transit_cost + qnorm(p)*figure_1$se
}

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
  geom_errorbar(aes(xmin = ifelse(ci(.025) < 0, 0, ci(.025)), #if the ci left bound is negative, make it 0 so you can see it 
                xmax = ci(.975)),
                color = "#6c6c6c",
                width = .3,
                size = .6)+
  geom_point(color = "#4673da",
               size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90")

Problem 2

Density stripes

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
   stat_confidence_density(aes(moe = se),
                          fill = "#4375D3",
                          height = 0.6) +
  geom_point(size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90")

Multiple error bars

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
  geom_errorbar(aes(xmin = ifelse(ci(.025) < 0, 0, ci(.025)), 
                xmax = ci(.975),
                color = "95%"),
                width = .5,
                size = .6)+
  geom_errorbar(aes(xmin = ifelse(ci(.05) < 0, 0, ci(.05)), 
                xmax = ci(.95),
                color = "90%"),
                width = .5,
                size = .6)+
  geom_errorbar(aes(xmin = ifelse(ci(.1) < 0, 0, ci(.1)), 
                xmax = ci(.9),
                color = "80%"),
                width = .5,
                size = .6)+
  geom_point(size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  theme_minimal(base_size = 13) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90") +
  scale_color_manual("Confidence Interval",
                     values = c("#d61200",
                                lighten("#d61200", .3),
                                lighten("#d61200", .6)))

Problem 3

crime <- readr::read_csv(unzip(here("data", "hw2", "crime.zip"), "crime.csv"))

model_data <- crime %>% 
  mutate(neighborhood_id = relevel(factor(NEIGHBORHOOD_ID), ref = "barnum"))

m <- glm(IS_CRIME ~ neighborhood_id, 
         data = model_data,
         family = "binomial")

tidied <- broom::tidy(m)

regis <- tidied %>% 
  filter(term == "neighborhood_idregis")

discretized <- data.frame(
  x = qnorm(ppoints(20),
            mean = regis$estimate,
            sd = regis$std.error)
  ) %>% 
  mutate(regis = ifelse(x <= 0, "#887E96", "#55B190"))
ggplot(discretized, aes(x)) +
  geom_dotplot(aes(fill = regis), binwidth=0.031)+
  geom_vline(xintercept = 0, 
             color = "#C04D49", 
             size = 1.5) + 
  scale_fill_identity(guide = "none") +
  labs(title = "Probability of differential crime rates between neighborhoods",
       subtitle = "<span style = 'color: #887E96'>**Regis**</span> compared to 
       <span style = 'color: #55B190'>**Barnum**</span>",
       x = "Difference in log odds of a crime being committed",
       caption = "Each ball represents 5% probablity") + 
  theme_minimal() +
  theme(plot.subtitle = element_markdown(),
        legend.position = "none",
        axis.title.y = element_blank(),
        axis.text.y = element_blank())

Problem 4

# deal with year
tbl_data <- crime %>%
  mutate(YEAR = case_when(
    grepl("2016", REPORTED_DATE) ~ "2016",
    grepl("2017", REPORTED_DATE) ~ "2017",
    grepl("2018", REPORTED_DATE) ~ "2018",
    grepl("2019", REPORTED_DATE) ~ "2019",
    grepl("2020", REPORTED_DATE) ~ "2020",
    grepl("2021", REPORTED_DATE) ~ "2021")
    ) %>%
  dplyr::select(DISTRICT_ID, OFFENSE_CATEGORY_ID, YEAR) %>%
  count(OFFENSE_CATEGORY_ID, DISTRICT_ID, YEAR) %>%
  pivot_wider(names_from = "YEAR",
              values_from = "n")%>%
  replace(is.na(.), 0)

# deal with offense category
 tbl_data <- tbl_data %>% 
  filter(OFFENSE_CATEGORY_ID == "aggravated-assault" |
           OFFENSE_CATEGORY_ID == "sexual-assault" |
           OFFENSE_CATEGORY_ID == "murder" |
           OFFENSE_CATEGORY_ID == "other-crimes-against-persons") %>% 
  mutate(OFFENSE_CATEGORY_ID = factor(OFFENSE_CATEGORY_ID,
                                      levels = c(
                                        "aggravated-assault", 
                                        "sexual-assault", 
                                        "murder", 
                                        "other-crimes-against-persons"))) %>% 
   mutate(OFFENSE_CATEGORY_ID = case_when(
    OFFENSE_CATEGORY_ID == "aggravated-assault" ~ "Aggravated Assault",
    OFFENSE_CATEGORY_ID == "sexual-assault" ~ "Sexual Assault",
    OFFENSE_CATEGORY_ID == "murder" ~ "Murder",
    OFFENSE_CATEGORY_ID == "other-crimes-against-persons" ~ "Other Crimes Against Persons"))
 
# deal with district 
tbl_data <- tbl_data %>% 
  filter(DISTRICT_ID == "1" |
           DISTRICT_ID == "3" |
           DISTRICT_ID == "5") %>% 
  mutate(DISTRICT_ID = case_when(
    DISTRICT_ID == "1" ~ "District 1",
    DISTRICT_ID == "3" ~ "District 3",
    DISTRICT_ID == "5" ~ "District 5"))

tbl_data %>%
  group_by(DISTRICT_ID) %>%
  arrange(match(OFFENSE_CATEGORY_ID, c("Aggravated Assault", 
                                       "Sexual Assault",
                                      "Murder",
                                      "Other Crimes Against Persons"))) %>% 
  gt() %>% 
  tab_spanner(
    label = "Year",
    columns = vars(`2016`, `2017`, `2018`, `2019`, `2020`, `2021`)
    ) %>% 
  cols_label(OFFENSE_CATEGORY_ID = "Offense") %>% 
  cols_align(align = "left",
             columns = vars(OFFENSE_CATEGORY_ID)) %>% 
  tab_header(
    title = "Crimes Against Persons in Denver: 2014 to Present",
    subtitle = md("*Sample of three districts*")
  ) %>% 
  tab_source_note(
    source_note = md("Denver Crime Data Distributed via [Kaggle](https://www.kaggle.com/paultimothymooney/denver-crime-data)")
  )
Crimes Against Persons in Denver: 2014 to Present
Sample of three districts
Offense Year
2016 2017 2018 2019 2020 2021
District 1
Aggravated Assault 318 369 389 402 397 12
Sexual Assault 106 146 163 145 94 0
Murder 3 4 6 12 19 0
Other Crimes Against Persons 781 828 668 705 614 24
District 3
Aggravated Assault 294 311 326 346 460 20
Sexual Assault 116 147 161 161 148 6
Murder 8 9 10 9 14 1
Other Crimes Against Persons 769 845 678 709 627 20
District 5
Aggravated Assault 223 176 256 305 361 17
Sexual Assault 50 74 90 84 98 5
Murder 9 6 8 6 14 0
Other Crimes Against Persons 449 451 423 514 467 18
Denver Crime Data Distributed via Kaggle