This is the methodology used behind the story: Wealth and grades: Compare Connecticut’s school districts.
Visit the repo for the data used in this analysis or visit Stanford’s Center for Education Policy Analysis for even more thorough data sets.
Much of this analysis would not exist without the excellent work from TheUpshot at The New York Times. Their visualization of the data in Money, Race and Success: How Your School District Compares was very inspiring.
What this analysis does:
library(dplyr)
library(RCurl)
# devtools::install_github("hadley/readxl")
library(readxl)
library(ggplot2)
library(knitr)
library(DT)
## District level means in grade equivalent units. There is one observations per district; values are averaged across years, grades and subjects.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(pooled%20year,%20grade%20and%20sub).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx")
# download.file(url, loc.download, mode="wb")
grades <- read_excel("data/district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx", sheet = 1)
colnames(grades) <- make.names(colnames(grades))
## Subsetting dataframe to just Connecticut schools
# Feel free to switch the CT abbreviation for whatever school you're interested in
ct_grades <- subset(grades, location.state=="CT")
ct_grades_sub <- ct_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]
colnames(ct_grades_sub) <- c("district", "average.grade")
kable(head(ct_grades_sub))
district | average.grade |
---|---|
UNIFIED SCHOOL DISTRICT #2 | -4.3008 |
ANDOVER SCHOOL DISTRICT | 1.6717 |
ANSONIA SCHOOL DISTRICT | -0.3795 |
AREA COOPERATIVE EDUCATIONAL | -0.4335 |
ASHFORD SCHOOL DISTRICT | 0.5294 |
AVON SCHOOL DISTRICT | 2.5961 |
These are estimates of district-level average achievement. Scores of zero mean the district is at the national average. One-unit below zero means that students in the district are one grade level behind the average; one-unit above zero means that students in the district are one grade level above the average.
# District level covariates (socioeconomic, demographic, school level data). There are multiple observations per district; one for each year and grade.
# This is a 500 mb file so expect to sit a round a while
#url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20covariates%20by%20year%20and%20grade%20(long%20file).csv"
#loc.download <- paste0(getwd(), "/data/", "district-covariates-by-year-and-grade-long-file.csv")
#download.file(url, loc.download, mode="wb")
soc <- read.csv("data/district-covariates-by-year-and-grade-long-file.csv")
us_grades <- subset(grades, location.state!="CT")
ct_soc <- subset(soc, stateabb=="CT")
ct_soc_6th <- subset(ct_soc, grade==6)
ct_soc_6th_2014 <- subset(ct_soc_6th, year==2014)
ct_6th_income <- ct_soc_6th_2014[c("leaname", "inc50all", "totenrl")]
colnames(ct_6th_income) <- c("district", "median.income", "students.in.grade")
kable(head(ct_6th_income))
district | median.income | students.in.grade | |
---|---|---|---|
60321 | ANDOVER SCHOOL DISTRICT | 86817.71 | 46 |
60355 | ANSONIA SCHOOL DISTRICT | 60535.68 | 180 |
60391 | ASHFORD SCHOOL DISTRICT | 81249.75 | 33 |
60427 | AVON SCHOOL DISTRICT | 133213.95 | 288 |
60453 | BARKHAMSTED SCHOOL DISTRICT | 104464.11 | 36 |
60487 | BERLIN SCHOOL DISTRICT | 101136.31 | 214 |
There are so many other variables to explore in the data set above but we limited it to these two columns. Here’s Stanford’s Codebook.
ct_scatter <- left_join(ct_6th_income, ct_grades_sub)
## Joining by: "district"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
ct_scatter <- subset(ct_scatter, !is.na(average.grade))
ct_scatter <- subset(ct_scatter, !is.na(median.income))
ct_scatter <- ct_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
ct_scatter$average.grade <- round(ct_scatter$average.grade, 1)
ct_scatter$median.income <- round(ct_scatter$median.income, 0)
p <- ggplot(ct_scatter, aes(median.income, average.grade))
p + geom_point(aes(size=students.in.grade), alpha=.5) +
theme_bw() +
xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
ggtitle("Educational attainment in CT school districts")
The scatter plot above shows that the higher the median family income in a school district, the higher the average grade equivalent tends to be.
Students in school districts where families live closer to poverty are, on average, four grade levels behind their counterparts in the wealthiest school districts of Connecticut.
## district level means in grade equivalent units. There are multiple observations per district; one for each year, grade and subject.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(separate%20sheets%20year%20and%20grade).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx")
# download.file(url, loc.download, mode="wb")
grades <- read_excel("data/district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx", sheet = 1)
colnames(grades) <- make.names(colnames(grades))
ct_grades2 <- subset(grades, location.state=="CT")
ct_grades2 <- ct_grades2[c("education.agency.name", "Estimated.District.Mean.in.ela..grade.equivalent.std..gs.", "Estimated.District.Mean.in.math..grade.equivalent.std..gs.")]
colnames(ct_grades2) <- c("district", "math", "ela")
ct_scatter_table <- left_join(ct_scatter, ct_grades2)
ct_scatter_table <- ct_scatter_table[c("district", "average.grade", "math", "ela", "median.income", "students.in.grade")]
datatable(ct_scatter_table)
us_soc <- subset(soc, stateabb!="CT")
us_soc_6th <- subset(us_soc, grade==6)
us_soc_6th_2014 <- subset(us_soc_6th, year==2014)
us_6th_income <- us_soc_6th_2014[c("leaname", "inc50all", "totenrl")]
us_grades_sub <- us_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]
colnames(us_6th_income) <- c("district", "median.income", "students.in.grade")
colnames(us_grades_sub) <- c("district", "average.grade")
us_scatter <- left_join(us_6th_income, us_grades_sub)
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
us_scatter <- subset(us_scatter, !is.na(average.grade))
us_scatter <- subset(us_scatter, !is.na(median.income))
us_scatter <- us_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
us_scatter$average.grade <- round(us_scatter$average.grade, 1)
us_scatter$median.income <- round(us_scatter$median.income, 0)
ct_scatter$where <- "CT"
us_scatter$where <- "US"
all_scatter <- rbind(us_scatter, ct_scatter)
ggplot(all_scatter,
aes(x = median.income, y = average.grade)) + scale_x_log10() +
geom_point(aes(size = students.in.grade), pch = 21, show.legend = TRUE, alpha=.8) +
scale_size_continuous(range=c(1,40)) +
aes(fill = where) + theme_bw() +
xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
ggtitle("Educational attainment in U.S. school districts")