This is the methodology used behind the story: Wealth and grades: Compare Connecticut’s school districts.

Visit the repo for the data used in this analysis or visit Stanford’s Center for Education Policy Analysis for even more thorough data sets.

Much of this analysis would not exist without the excellent work from TheUpshot at The New York Times. Their visualization of the data in Money, Race and Success: How Your School District Compares was very inspiring.

What this analysis does:

Loading the packages

library(dplyr)
library(RCurl)
# devtools::install_github("hadley/readxl")
library(readxl)
library(ggplot2)
library(knitr)
library(DT)

Bringing in the data

## District level means in grade equivalent units. There is one observations per district; values are averaged across years, grades and subjects.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(pooled%20year,%20grade%20and%20sub).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx")
# download.file(url, loc.download, mode="wb")

grades <- read_excel("data/district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx", sheet = 1)
colnames(grades) <- make.names(colnames(grades))

## Subsetting dataframe to just Connecticut schools
# Feel free to switch the CT abbreviation for whatever school you're interested in

ct_grades <- subset(grades, location.state=="CT")
ct_grades_sub <- ct_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]
colnames(ct_grades_sub) <- c("district", "average.grade")
kable(head(ct_grades_sub))
district average.grade
UNIFIED SCHOOL DISTRICT #2 -4.3008
ANDOVER SCHOOL DISTRICT 1.6717
ANSONIA SCHOOL DISTRICT -0.3795
AREA COOPERATIVE EDUCATIONAL -0.4335
ASHFORD SCHOOL DISTRICT 0.5294
AVON SCHOOL DISTRICT 2.5961

These are estimates of district-level average achievement. Scores of zero mean the district is at the national average. One-unit below zero means that students in the district are one grade level behind the average; one-unit above zero means that students in the district are one grade level above the average.

# District level covariates (socioeconomic, demographic, school level data). There are multiple observations per district; one for each year and grade.
# This is a 500 mb file so expect to sit a round a while

#url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20covariates%20by%20year%20and%20grade%20(long%20file).csv"
#loc.download <- paste0(getwd(), "/data/", "district-covariates-by-year-and-grade-long-file.csv")
#download.file(url, loc.download, mode="wb")
soc <- read.csv("data/district-covariates-by-year-and-grade-long-file.csv")

us_grades <- subset(grades, location.state!="CT")

ct_soc <- subset(soc, stateabb=="CT")
ct_soc_6th <- subset(ct_soc, grade==6)
ct_soc_6th_2014 <- subset(ct_soc_6th, year==2014)

ct_6th_income <- ct_soc_6th_2014[c("leaname", "inc50all", "totenrl")]

colnames(ct_6th_income) <- c("district", "median.income", "students.in.grade")
kable(head(ct_6th_income))
district median.income students.in.grade
60321 ANDOVER SCHOOL DISTRICT 86817.71 46
60355 ANSONIA SCHOOL DISTRICT 60535.68 180
60391 ASHFORD SCHOOL DISTRICT 81249.75 33
60427 AVON SCHOOL DISTRICT 133213.95 288
60453 BARKHAMSTED SCHOOL DISTRICT 104464.11 36
60487 BERLIN SCHOOL DISTRICT 101136.31 214

There are so many other variables to explore in the data set above but we limited it to these two columns. Here’s Stanford’s Codebook.


Visualizing Connecticut’s educational attainment disparity

ct_scatter <- left_join(ct_6th_income, ct_grades_sub)
## Joining by: "district"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
ct_scatter <- subset(ct_scatter, !is.na(average.grade))
ct_scatter <- subset(ct_scatter, !is.na(median.income))

ct_scatter <- ct_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
ct_scatter$average.grade <- round(ct_scatter$average.grade, 1)
ct_scatter$median.income <- round(ct_scatter$median.income, 0)

p <- ggplot(ct_scatter, aes(median.income, average.grade))
p + geom_point(aes(size=students.in.grade), alpha=.5) + 
  theme_bw() +
  xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
  ggtitle("Educational attainment in CT school districts")

The scatter plot above shows that the higher the median family income in a school district, the higher the average grade equivalent tends to be.

Students in school districts where families live closer to poverty are, on average, four grade levels behind their counterparts in the wealthiest school districts of Connecticut.


Explore the data

## district level means in grade equivalent units. There are multiple observations per district; one for each year, grade and subject.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(separate%20sheets%20year%20and%20grade).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx")
# download.file(url, loc.download, mode="wb")

grades <- read_excel("data/district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx", sheet = 1)

colnames(grades) <- make.names(colnames(grades))
ct_grades2 <- subset(grades, location.state=="CT")
ct_grades2 <- ct_grades2[c("education.agency.name", "Estimated.District.Mean.in.ela..grade.equivalent.std..gs.", "Estimated.District.Mean.in.math..grade.equivalent.std..gs.")]
colnames(ct_grades2) <- c("district", "math", "ela")

ct_scatter_table <- left_join(ct_scatter, ct_grades2)
ct_scatter_table <- ct_scatter_table[c("district", "average.grade", "math", "ela", "median.income", "students.in.grade")]
datatable(ct_scatter_table)

Visualizing U.S. and CT school districts

us_soc <- subset(soc, stateabb!="CT")
us_soc_6th <- subset(us_soc, grade==6)
us_soc_6th_2014 <- subset(us_soc_6th, year==2014)

us_6th_income <- us_soc_6th_2014[c("leaname", "inc50all", "totenrl")]

us_grades_sub <- us_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]

colnames(us_6th_income) <- c("district", "median.income", "students.in.grade")
colnames(us_grades_sub) <- c("district", "average.grade")

us_scatter <- left_join(us_6th_income, us_grades_sub)
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
us_scatter <- subset(us_scatter, !is.na(average.grade))
us_scatter <- subset(us_scatter, !is.na(median.income))

us_scatter <- us_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
us_scatter$average.grade <- round(us_scatter$average.grade, 1)
us_scatter$median.income <- round(us_scatter$median.income, 0)


ct_scatter$where <- "CT"
us_scatter$where <- "US"
all_scatter <- rbind(us_scatter, ct_scatter)

ggplot(all_scatter,
       aes(x = median.income, y = average.grade)) + scale_x_log10() +
  geom_point(aes(size = students.in.grade), pch = 21, show.legend = TRUE, alpha=.8) +
  scale_size_continuous(range=c(1,40)) +
  aes(fill = where) + theme_bw() +
  xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
  ggtitle("Educational attainment in U.S. school districts")