library(tidyverse)
library(rio)
library(here)
library(ggridges)
library(ungeviz)
library(colorspace)
library(ggtext)
library(gt)

# uploading transit_cost
transit_cost <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-05/transit_cost.csv')

#I made this daniel anderson theme so that I could use less code on my plots
da_theme <- theme_minimal(base_size = 13) +
  theme(panel.background = element_rect(fill = "transparent", colour = NA),
        plot.background = element_rect(fill = "transparent", colour = NA),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.x = element_line(color = "gray90"),
        plot.title.position = "plot")

country_codes <- countrycode::codelist %>% 
  select(country_name = country.name.en, country = ecb)

Problem 1

# cleaning and transforming the data

figure_1 <- transit_cost %>%
  mutate(real_cost = as.numeric(real_cost)) %>% 
  group_by(country) %>% 
  summarize(average_transit_cost = 
              mean(real_cost,na.rm = TRUE),
            n = n(),
            se = sd(real_cost, na.rm = TRUE)/sqrt(n)) %>% 
  left_join(country_codes) %>% 
  filter(average_transit_cost != "NaN") %>% 
  filter(country != "UK") # it looks like you didn't use UK in your plot so I took it out too

#if I wanted to keep in the UK I would need to give it a name
# it didn't match because it is in country codes as "GB" 
#figure_1[53, "country_name"] <- "United Kingdom"

#I made this function to make my code more readable (for the next problem but I'm using it here too.) It calculates the left or right confidence interval bounds for average transit cost for a given probability (e.g., for a 95% ci, left = ci(.025), right = ci(.975))

ci <- function(p){
  figure_1$average_transit_cost + qnorm(p)*figure_1$se
}

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
  geom_errorbar(aes(xmin = ifelse(ci(.025) < 0, 0, ci(.025)), #if the ci left bound is negative, make it 0 so you can see it 
                xmax = ci(.975)),
                color = "#6c6c6c",
                width = .3,
                size = .6)+
  geom_point(color = "#4673da",
               size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90")

Problem 2

Density stripes

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
   stat_confidence_density(aes(moe = se),
                          fill = "#4375D3",
                          height = 0.6) +
  geom_point(size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90")

Multiple error bars

figure_1 %>% 
  ggplot(aes(x = average_transit_cost, y = fct_reorder(country_name, average_transit_cost))) +
  geom_errorbar(aes(xmin = ifelse(ci(.025) < 0, 0, ci(.025)), 
                xmax = ci(.975),
                color = "95%"),
                width = .5,
                size = .6)+
  geom_errorbar(aes(xmin = ifelse(ci(.05) < 0, 0, ci(.05)), 
                xmax = ci(.95),
                color = "90%"),
                width = .5,
                size = .6)+
  geom_errorbar(aes(xmin = ifelse(ci(.1) < 0, 0, ci(.1)), 
                xmax = ci(.9),
                color = "80%"),
                width = .5,
                size = .6)+
  geom_point(size = 2) +
  labs(x = "Real cost (In millions of dollars)",
       y = "Country",
       title = "Cost to build transit systems vary across countries",
       caption = "Data provided through #tidytuesday by the Transit Costs Project") +
  scale_x_continuous(expand = c(0,0),
                    limits = c(0, 95000),
                     breaks = c(seq(from = 0, to = 75000, by = 15000)), 
                     labels = scales::comma) +
  theme_minimal(base_size = 13) +
  da_theme +
  geom_vline(xintercept = 0, color = "gray90") +
  scale_color_manual("Confidence Interval",
                     values = c("#d61200",
                                lighten("#d61200", .3),
                                lighten("#d61200", .6)))

Problem 3

crime <- readr::read_csv(unzip(here("data", "hw2", "crime.zip"), "crime.csv"))

model_data <- crime %>% 
  mutate(neighborhood_id = relevel(factor(NEIGHBORHOOD_ID), ref = "barnum"))

m <- glm(IS_CRIME ~ neighborhood_id, 
         data = model_data,
         family = "binomial")

tidied <- broom::tidy(m)

regis <- tidied %>% 
  filter(term == "neighborhood_idregis")

discretized <- data.frame(
  x = qnorm(ppoints(20),
            mean = regis$estimate,
            sd = regis$std.error)
  ) %>% 
  mutate(regis = ifelse(x <= 0, "#887E96", "#55B190"))

ggplot(discretized, aes(x)) +
  geom_dotplot(aes(fill = regis), binwidth=0.031)+
  geom_vline(xintercept = 0, 
             color = "#C04D49", 
             size = 1.5) + 
  scale_fill_identity(guide = "none") +
  labs(title = "Probability of differential crime rates between neighborhoods",
       subtitle = "<span style = 'color: #887E96'>**Regis**</span> compared to 
       <span style = 'color: #55B190'>**Barnum**</span>",
       x = "Difference in log odds of a crime being committed",
       caption = "Each ball represents 5% probablity") + 
  theme_minimal() +
  theme(plot.subtitle = element_markdown(),
        legend.position = "none",
        axis.title.y = element_blank(),
        axis.text.y = element_blank())

Problem 4

# deal with year
tbl_data <- crime %>%
  mutate(YEAR = case_when(
    grepl("2016", REPORTED_DATE) ~ "2016",
    grepl("2017", REPORTED_DATE) ~ "2017",
    grepl("2018", REPORTED_DATE) ~ "2018",
    grepl("2019", REPORTED_DATE) ~ "2019",
    grepl("2020", REPORTED_DATE) ~ "2020",
    grepl("2021", REPORTED_DATE) ~ "2021")
    ) %>%
  dplyr::select(DISTRICT_ID, OFFENSE_CATEGORY_ID, YEAR) %>%
  count(OFFENSE_CATEGORY_ID, DISTRICT_ID, YEAR) %>%
  pivot_wider(names_from = "YEAR",
              values_from = "n")%>%
  replace(is.na(.), 0)

# deal with offense category
 tbl_data <- tbl_data %>% 
  filter(OFFENSE_CATEGORY_ID == "aggravated-assault" |
           OFFENSE_CATEGORY_ID == "sexual-assault" |
           OFFENSE_CATEGORY_ID == "murder" |
           OFFENSE_CATEGORY_ID == "other-crimes-against-persons") %>% 
  mutate(OFFENSE_CATEGORY_ID = factor(OFFENSE_CATEGORY_ID,
                                      levels = c(
                                        "aggravated-assault", 
                                        "sexual-assault", 
                                        "murder", 
                                        "other-crimes-against-persons"))) %>% 
   mutate(OFFENSE_CATEGORY_ID = case_when(
    OFFENSE_CATEGORY_ID == "aggravated-assault" ~ "Aggravated Assault",
    OFFENSE_CATEGORY_ID == "sexual-assault" ~ "Sexual Assault",
    OFFENSE_CATEGORY_ID == "murder" ~ "Murder",
    OFFENSE_CATEGORY_ID == "other-crimes-against-persons" ~ "Other Crimes Against Persons"))
 
# deal with district 
tbl_data <- tbl_data %>% 
  filter(DISTRICT_ID == "1" |
           DISTRICT_ID == "3" |
           DISTRICT_ID == "5") %>% 
  mutate(DISTRICT_ID = case_when(
    DISTRICT_ID == "1" ~ "District 1",
    DISTRICT_ID == "3" ~ "District 3",
    DISTRICT_ID == "5" ~ "District 5"))

tbl_data %>%
  group_by(DISTRICT_ID) %>%
  arrange(match(OFFENSE_CATEGORY_ID, c("Aggravated Assault", 
                                       "Sexual Assault",
                                      "Murder",
                                      "Other Crimes Against Persons"))) %>% 
  gt() %>% 
  tab_spanner(
    label = "Year",
    columns = vars(`2016`, `2017`, `2018`, `2019`, `2020`, `2021`)
    ) %>% 
  cols_label(OFFENSE_CATEGORY_ID = "Offense") %>% 
  cols_align(align = "left",
             columns = vars(OFFENSE_CATEGORY_ID)) %>% 
  tab_header(
    title = "Crimes Against Persons in Denver: 2014 to Present",
    subtitle = md("*Sample of three districts*")
  ) %>% 
  tab_source_note(
    source_note = md("Denver Crime Data Distributed via [Kaggle](https://www.kaggle.com/paultimothymooney/denver-crime-data)")
  )

Crimes Against Persons in Denver: 2014 to Present
Sample of three districts
Offense	Year
Offense	2016	2017	2018	2019	2020	2021
District 1
Aggravated Assault	318	369	389	402	397	12
Sexual Assault	106	146	163	145	94	0
Murder	3	4	6	12	19	0
Other Crimes Against Persons	781	828	668	705	614	24
District 3
Aggravated Assault	294	311	326	346	460	20
Sexual Assault	116	147	161	161	148	6
Murder	8	9	10	9	14	1
Other Crimes Against Persons	769	845	678	709	627	20
District 5
Aggravated Assault	223	176	256	305	361	17
Sexual Assault	50	74	90	84	98	5
Murder	9	6	8	6	14	0
Other Crimes Against Persons	449	451	423	514	467	18
Denver Crime Data Distributed via Kaggle

Homework 2

Sarah Dimakis, Wanjia Guo, & Meg Siritzky

Problem 1

Problem 2

Density stripes

Multiple error bars

Problem 3

Problem 4