This is an exploratory analysis looking at the relationships between various data sets:

  1. Department of Health stats on the number of children who have tested for high levels of lead in their blood
  2. U.S. Environmental Protection Agency data on public drinking water systems in CT that tested for elevated levels of lead
  3. U.S. Census data on housing and poverty. Combined, they can predict lead exposure risk.

This analysis is the basis for the TrendCT.org story Which neighborhoods have the highest risk of lead poisoning in CT? Not all of what was produced here ended up in the story. We encourage visitors to look over our calculations and expand upon our analysis.


First, let’s load up a bunch of packages for data wrangling, geospatial analysis, visualizations, and a neat package that interfaces with the Census API (censusapi R package).

library(dplyr)
library(scales)
require(rgdal)
require(ggmap)
require(Cairo)
require(maptools)
library(ggplot2)
#devtools::install_github("hrecht/censusapi")
library(censusapi)
source("keys.R")
library(stringr)
library(rvest)
library(tidyr)
library(corrgram)
library(gridExtra)

Alright, setting up the

gpclibPermit()
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
## Setting up the town tracts from the ctgeo towns shapefile in the maps folder
## Other state town (county subdivisions) border shapefiles can be downloaded from the Census
## https://www.census.gov/cgi-bin/geo/shapefiles/index.php
towntracts <- readOGR(dsn="maps", layer="ctgeo")
## OGR data source with driver: ESRI Shapefile 
## Source: "maps", layer: "ctgeo"
## with 169 features
## It has 6 fields
towntracts <- fortify(towntracts, region="NAME10")

# Bringing in data 
# This data is a cleaned up spreadsheet from a previous story:
# http://trendct.org/2016/01/27/children-screened-for-lead-poisoning-in-connecticut/

# The original data set can be found on the CT Department of Health website:
# http://www.ct.gov/dph/cwp/view.asp?a=3140&q=387576

screenings_total <- read.csv("data/confirmed.town.total.csv", stringsAsFactors=FALSE)

## Note: BLL stands for blood lead levels. There are different groupings: 5, 10, 15, and 20.
## Health officials must be notified if a child has five or more micrograms of lead per deciliter of blood (µg/dL) 

# BLL5 total children
bll5_total <- screenings_total[c("Town", "BLL5")]
colnames(bll5_total) <- c("id", "bll5")

# joining the data to the formatted shapefile
bll5_total_map <- left_join(towntracts, bll5_total)

dtm5_total <- ggplot() +
  geom_polygon(data = bll5_total_map, aes(x=long, y=lat, group=group, fill=bll5), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Lead-poisoned children by town (BLL 5 and up)", fill="")

# BLL10 total children
bll10_total <- screenings_total[c("Town", "BLL10")]
colnames(bll10_total) <- c("id", "bll10")

bll10_total_map <- left_join(towntracts, bll10_total)

dtm10_total <- ggplot() +
  geom_polygon(data = bll10_total_map, aes(x=long, y=lat, group=group, fill=bll10), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Lead-poisoned children by town (BLL 10 and up)", fill="")

# BLL15 total children
bll15_total <- screenings_total[c("Town", "BLL15")]
colnames(bll15_total) <- c("id", "bll15")

bll15_total_map <- left_join(towntracts, bll15_total)

dtm15_total <- ggplot() +
  geom_polygon(data = bll15_total_map, aes(x=long, y=lat, group=group, fill=bll15), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="lead-poisoned children by town (BLL 15 and up)", fill="")

# BLL20  total children
bll20_total <- screenings_total[c("Town", "BLL20")]
colnames(bll20_total) <- c("id", "bll20")

bll20_total_map <- left_join(towntracts, bll20_total)

dtm20_total <- ggplot() +
  geom_polygon(data = bll20_total_map, aes(x=long, y=lat, group=group, fill=bll20), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Lead-poisoned children by town (BLL 20 and up)", fill="")

# Output of the totals map
grid.arrange(dtm5_total,dtm10_total,dtm15_total,dtm20_total, ncol=2)