# Mean module presentation length for every gender x region cell. The mean is
# taken per student first and then across students, so each student
# contributes a single value to the cell average. The join has no explicit
# `by`, so it matches on whatever columns the two tables share.
df1 <- right_join(courses.csv, studentInfo.csv) %>%
  group_by(gender, region, id_student) %>%
  summarise(m = mean(module_presentation_length)) %>%
  group_by(gender, region) %>%
  summarise(m = mean(m))
## Joining, by = c("code_module", "code_presentation")
## `summarise()` has grouped output by 'gender', 'region'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# Heat map of the cell means in df1: gender on the x axis, region on the
# y axis, and the fill colour encoding the mean presentation length.
ggplot(data = df1, mapping = aes(x = gender, y = region, fill = m)) +
  geom_tile() +
  viridis::scale_fill_viridis() +
  coord_fixed()
Female students generally take classes with a shorter module presentation length than male students, with the exception of female students in Ireland. Moreover, students from Scotland appear to take longer classes than students from other regions.
# I would (a) avoid using periods in object names, because they can be
# confused with S3 method names, and (b) keep all characters lower case to
# keep things simpler
# As a general rule, you should also not exceed 80 characters for any given
# line. You can set a ruler in RStudio to help with this. Similarly, please
# include a line break after every %>%
# Assessments whose type is "TMA", reduced to the id, type, module and
# presentation columns.
TMA.assessments.id <- assessments.csv %>%
  filter(assessment_type == "TMA") %>%
  select(id_assessment, assessment_type, code_module, code_presentation)
# Attach the rows of studentAssessment.csv to every TMA assessment. A left
# join keeps assessments that have no matching student rows; the join keys
# are inferred from the shared column names.
TMA.score <- left_join(TMA.assessments.id, studentAssessment.csv)
## Joining, by = "id_assessment"
# Sanity check before joining: print every (module, presentation, student)
# combination that occurs more than once in studentInfo.csv. An empty result
# means that combination identifies a row uniquely.
filter(
  count(studentInfo.csv, code_module, code_presentation, id_student),
  n > 1
)
## # A tibble: 0 x 4
## # … with 4 variables: code_module <chr>, code_presentation <chr>,
## # id_student <dbl>, n <int>
# Combine TMA scores with student information. An inner join keeps only the
# rows present in both tables; the join keys are inferred from the shared
# column names.
TMA.student.info <- inner_join(TMA.score, studentInfo.csv)
## Joining, by = c("code_module", "code_presentation", "id_student")
# Why inner join?
# Mean studied credits and mean score per age band, reshaped to long form
# (one row per age band and measure) so that both measures can be mapped to
# a single colour aesthetic when plotting.
# pivot_longer() replaces the superseded gather(); the resulting columns
# (age_band, type, score) are the same, only the row order differs.
df2 <- TMA.student.info %>%
  group_by(age_band) %>%
  summarise(
    m_credits = mean(studied_credits, na.rm = TRUE),
    m_score = mean(score, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = -age_band,
    names_to = "type",
    values_to = "score"
  )
# Please try to move to pivot_longer, which has superseded gather
# Similar to %>% , please include a line break after each +
# One line per measure across age bands. The legend labels are matched by
# position to the values of `type`, which sort as m_credits, m_score.
ggplot(data = df2, mapping = aes(x = age_band, y = score, color = type)) +
  geom_point() +
  geom_line(mapping = aes(group = type)) +
  scale_color_brewer(
    name = "Type",
    labels = c("Mean Credits", "Mean Scores"),
    palette = "Set1"
  ) +
  labs(x = "Age", y = "Scores") +
  theme_minimal()
In higher age bands, students tend to take fewer credits but earn higher TMA scores.
# Number of students per gender and region whose id_student never appears in
# TMA.score. The anti join removes all students with TMA scores; the first
# count() collapses repeated rows for the same student, and the second counts
# the remaining students in each gender x region cell.
students.no.TMA.score <- anti_join(
  studentInfo.csv,
  TMA.score,
  by = "id_student"
) %>%
  count(gender, region, id_student) %>%
  count(gender, region)
# Horizontal dodged bars of the student counts by region, split by gender.
# `n` already holds the counts, so the bar length is taken from it directly.
ggplot(
  data = students.no.TMA.score,
  mapping = aes(x = region, y = n, fill = gender)
) +
  geom_col(position = position_dodge()) +
  scale_fill_brewer(palette = "Dark2") +
  labs(x = "Region", y = "Number of Students Did Not Take TMA") +
  coord_flip() +
  theme_classic()
# Mean score for every gender x highest education x final result cell,
# ignoring missing scores. summarise() drops only the last grouping level, so
# the result stays grouped by gender and highest_education.
avg.TMA <- group_by(
  TMA.student.info,
  gender, highest_education, final_result
) %>%
  summarise(mean = mean(score, na.rm = TRUE))
## `summarise()` has grouped output by 'gender', 'highest_education'. You can override using the `.groups` argument.
# Ideally you should stick with the tidyverse, e.g.
# Convert the two categorical columns to factors with an explicit level
# order, so that facets and axis categories follow this order instead of
# alphabetical order.
avg.TMA <- avg.TMA %>%
  mutate(
    final_result = factor(
      final_result,
      levels = c("Fail", "Withdrawn", "Pass", "Distinction")
    ),
    highest_education = factor(
      highest_education,
      levels = c(
        "No Formal quals",
        "Lower Than A Level",
        "A Level or Equivalent",
        "HE Qualification",
        "Post Graduate Qualification"
      )
    )
  )
# avg.TMA$final_result = factor(avg.TMA$final_result, levels = c("Fail", "Withdrawn", "Pass", "Distinction"))
# avg.TMA$highest_education = factor(avg.TMA$highest_education, levels = c("No Formal quals", "Lower Than A Level", "A Level or Equivalent", "HE Qualification", "Post Graduate Qualification"))
# Dumbbell-style chart: for each education level, a grey segment joins the
# mean scores of the gender groups, with one panel per final result. The two
# manual colours are assigned by position to the values of `gender`.
ggplot(
  data = avg.TMA,
  mapping = aes(x = mean, y = highest_education, color = gender)
) +
  geom_line(
    mapping = aes(group = highest_education),
    size = 3,
    color = "dimgrey"
  ) +
  geom_point(size = 4) +
  scale_color_manual(values = c("lightblue", "dodgerblue3")) +
  facet_wrap(~final_result, ncol = 1) +
  labs(
    title = "Average TMA Scores",
    subtitle = "Results displayed by Education, gender, and Final Result Designation",
    x = "Average Score",
    y = "Highest Education",
    caption = "Data from Kuzilek, Hlosta, & Zdrahal (2017). \nhttps://analyse.kmi.open.ac.uk/open_dataset"
  ) +
  theme_minimal()
#theme(plot.title= element_text(hjust = -0.6)) + theme(plot.subtitle = element_text(hjust = -0.6))
# Looks great!
# Add a logical `diff` column that is TRUE where the female mean exceeds the
# male mean within an education level x final result cell, then return to
# long form (one row per cell and gender) for plotting.
# - ungroup() first: avg.TMA still carries grouping from its summarise()
#   step, and the reshape should not depend on it.
# - pivot_wider()/pivot_longer() replace the superseded spread()/gather().
# - The comparison already yields TRUE/FALSE/NA, so no ifelse() is needed.
new.avg.TMA <- avg.TMA %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = mean) %>%
  mutate(diff = `F` - `M` > 0) %>%
  pivot_longer(
    cols = -c(highest_education, final_result, diff),
    names_to = "gender",
    values_to = "mean"
  )
# Dumbbell-style chart with one panel per final result. The connecting
# segment is coloured by `diff` (which gender has the higher mean) and the
# points are filled by gender; shape 21 is used so that `fill` applies.
ggplot(data = new.avg.TMA, mapping = aes(x = mean, y = highest_education)) +
  geom_line(
    mapping = aes(group = highest_education, color = diff),
    size = 3
  ) +
  geom_point(
    mapping = aes(fill = gender),
    size = 5,
    shape = 21,
    stroke = 0
  ) +
  facet_wrap(~final_result, ncol = 1) +
  scale_fill_manual(
    values = c("F" = "lightskyblue2", "M" = "dodgerblue3"),
    name = "Gender",
    labels = c("Female", "Male")
  ) +
  scale_color_manual(
    values = c("TRUE" = "pink", "FALSE" = "slategray2"),
    name = "Higher Mean",
    breaks = c(TRUE, FALSE),
    labels = c("Female", "Male")
  ) +
  labs(
    title = "Average TMA Scores",
    subtitle = "Results displayed by Education, gender, and Final Result Designation",
    x = "Average Score",
    y = "Highest Education",
    caption = "Data from Kuzilek, Hlosta, & Zdrahal (2017).\nhttps://analyse.kmi.open.ac.uk/open_dataset"
  ) +
  # theme() must come after theme_minimal(), which would otherwise reset the
  # legend position.
  theme_minimal() +
  theme(legend.position = "bottom")
# Wonderful! + 1