Examining socioeconomic disparity in Connecticut school districts

This is the methodology used behind the story: Wealth and grades: Compare Connecticut’s school districts.

Visit the repo for the data used in this analysis or visit Stanford’s Center for Education Policy Analysis for even more thorough data sets.

Much of this analysis would not exist without the excellent work from TheUpshot at The New York Times. Their visualization of the data in Money, Race and Success: How Your School District Compares was very inspiring.

What this analysis does:

Bring in local data from Stanford CEPA (comment out lines to download directly from CEPA)
Join a couple of their data sets to focus on income, students in grade, and grade score equivalents
Visualize disparity in Connecticut school districts
Create a filterable table for all CT school districts
Visualize disparity in U.S. school districts and surface CT districts

Loading the packages

library(dplyr)
library(RCurl)
# devtools::install_github("hadley/readxl")
library(readxl)
library(ggplot2)
library(knitr)
library(DT)

Bringing in the data

## District level means in grade equivalent units. There is one observations per district; values are averaged across years, grades and subjects.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(pooled%20year,%20grade%20and%20sub).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx")
# download.file(url, loc.download, mode="wb")

grades <- read_excel("data/district-means-grade-equivalent-std-gs-pooled-year-grade-and-sub.xlsx", sheet = 1)
colnames(grades) <- make.names(colnames(grades))

## Subsetting dataframe to just Connecticut schools
# Feel free to switch the CT abbreviation for whatever school you're interested in

ct_grades <- subset(grades, location.state=="CT")
ct_grades_sub <- ct_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]
colnames(ct_grades_sub) <- c("district", "average.grade")
kable(head(ct_grades_sub))

district	average.grade
UNIFIED SCHOOL DISTRICT #2	-4.3008
ANDOVER SCHOOL DISTRICT	1.6717
ANSONIA SCHOOL DISTRICT	-0.3795
AREA COOPERATIVE EDUCATIONAL	-0.4335
ASHFORD SCHOOL DISTRICT	0.5294
AVON SCHOOL DISTRICT	2.5961

These are estimates of district-level average achievement. Scores of zero mean the district is at the national average. One-unit below zero means that students in the district are one grade level behind the average; one-unit above zero means that students in the district are one grade level above the average.

# District level covariates (socioeconomic, demographic, school level data). There are multiple observations per district; one for each year and grade.
# This is a 500 mb file so expect to sit a round a while

#url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20covariates%20by%20year%20and%20grade%20(long%20file).csv"
#loc.download <- paste0(getwd(), "/data/", "district-covariates-by-year-and-grade-long-file.csv")
#download.file(url, loc.download, mode="wb")
soc <- read.csv("data/district-covariates-by-year-and-grade-long-file.csv")

us_grades <- subset(grades, location.state!="CT")

ct_soc <- subset(soc, stateabb=="CT")
ct_soc_6th <- subset(ct_soc, grade==6)
ct_soc_6th_2014 <- subset(ct_soc_6th, year==2014)

ct_6th_income <- ct_soc_6th_2014[c("leaname", "inc50all", "totenrl")]

colnames(ct_6th_income) <- c("district", "median.income", "students.in.grade")
kable(head(ct_6th_income))

	district	median.income	students.in.grade
60321	ANDOVER SCHOOL DISTRICT	86817.71	46
60355	ANSONIA SCHOOL DISTRICT	60535.68	180
60391	ASHFORD SCHOOL DISTRICT	81249.75	33
60427	AVON SCHOOL DISTRICT	133213.95	288
60453	BARKHAMSTED SCHOOL DISTRICT	104464.11	36
60487	BERLIN SCHOOL DISTRICT	101136.31	214

There are so many other variables to explore in the data set above but we limited it to these two columns. Here’s Stanford’s Codebook.

Visualizing Connecticut’s educational attainment disparity

ct_scatter <- left_join(ct_6th_income, ct_grades_sub)

## Joining by: "district"

## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector

ct_scatter <- subset(ct_scatter, !is.na(average.grade))
ct_scatter <- subset(ct_scatter, !is.na(median.income))

ct_scatter <- ct_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
ct_scatter$average.grade <- round(ct_scatter$average.grade, 1)
ct_scatter$median.income <- round(ct_scatter$median.income, 0)

p <- ggplot(ct_scatter, aes(median.income, average.grade))
p + geom_point(aes(size=students.in.grade), alpha=.5) + 
  theme_bw() +
  xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
  ggtitle("Educational attainment in CT school districts")

The scatter plot above shows that the higher the median family income in a school district, the higher the average grade equivalent tends to be.

Students in school districts where families live closer to poverty are, on average, four grade levels behind their counterparts in the wealthiest school districts of Connecticut.

Explore the data

## district level means in grade equivalent units. There are multiple observations per district; one for each year, grade and subject.
# url <- "https://stacks.stanford.edu/file/druid:db586ns4974/district%20means%20grade%20equivalent%20std%20(gs)%20(separate%20sheets%20year%20and%20grade).xlsx"
# loc.download <- paste0(getwd(), "/data/", "district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx")
# download.file(url, loc.download, mode="wb")

grades <- read_excel("data/district-means-grade-equivalent-std-gs-separate-sheets-year-and-grade.xlsx", sheet = 1)

colnames(grades) <- make.names(colnames(grades))
ct_grades2 <- subset(grades, location.state=="CT")
ct_grades2 <- ct_grades2[c("education.agency.name", "Estimated.District.Mean.in.ela..grade.equivalent.std..gs.", "Estimated.District.Mean.in.math..grade.equivalent.std..gs.")]
colnames(ct_grades2) <- c("district", "math", "ela")

ct_scatter_table <- left_join(ct_scatter, ct_grades2)
ct_scatter_table <- ct_scatter_table[c("district", "average.grade", "math", "ela", "median.income", "students.in.grade")]
datatable(ct_scatter_table)

Visualizing U.S. and CT school districts

us_soc <- subset(soc, stateabb!="CT")
us_soc_6th <- subset(us_soc, grade==6)
us_soc_6th_2014 <- subset(us_soc_6th, year==2014)

us_6th_income <- us_soc_6th_2014[c("leaname", "inc50all", "totenrl")]

us_grades_sub <- us_grades[c("education.agency.name", "average.test.score..math.ela.pooled..in.grade.equiv")]

colnames(us_6th_income) <- c("district", "median.income", "students.in.grade")
colnames(us_grades_sub) <- c("district", "average.grade")

us_scatter <- left_join(us_6th_income, us_grades_sub)

## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector

us_scatter <- subset(us_scatter, !is.na(average.grade))
us_scatter <- subset(us_scatter, !is.na(median.income))

us_scatter <- us_scatter[c("district", "median.income", "average.grade", "students.in.grade")]
us_scatter$average.grade <- round(us_scatter$average.grade, 1)
us_scatter$median.income <- round(us_scatter$median.income, 0)


ct_scatter$where <- "CT"
us_scatter$where <- "US"
all_scatter <- rbind(us_scatter, ct_scatter)

ggplot(all_scatter,
       aes(x = median.income, y = average.grade)) + scale_x_log10() +
  geom_point(aes(size = students.in.grade), pch = 21, show.legend = TRUE, alpha=.8) +
  scale_size_continuous(range=c(1,40)) +
  aes(fill = where) + theme_bw() +
  xlab("Parents' socioeconomic status") + ylab("Grades above or below average") +
  ggtitle("Educational attainment in U.S. school districts")