This analysis is the basis for the TrendCT.org story Which neighborhoods have the highest risk of lead poisoning in CT? Not all of what was produced here ended up in the story. We encourage visitors to look over our calculations and expand upon our analysis.
First, let’s load up a bunch of packages for data wrangling, geospatial analysis, visualizations, and a neat package that interfaces with the Census API (censusapi R package).
library(dplyr)
library(scales)
require(rgdal)
require(ggmap)
require(Cairo)
require(maptools)
library(ggplot2)
#devtools::install_github("hrecht/censusapi")
library(censusapi)
source("keys.R")
library(stringr)
library(rvest)
library(tidyr)
library(corrgram)
library(gridExtra)
Alright, setting up the
gpclibPermit()
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
## Setting up the town tracts from the ctgeo towns shapefile in the maps folder
## Other state town (county subdivisions) border shapefiles can be downloaded from the Census
## https://www.census.gov/cgi-bin/geo/shapefiles/index.php
towntracts <- readOGR(dsn="maps", layer="ctgeo")
## OGR data source with driver: ESRI Shapefile
## Source: "maps", layer: "ctgeo"
## with 169 features
## It has 6 fields
towntracts <- fortify(towntracts, region="NAME10")
# Bringing in data
# This data is a cleaned up spreadsheet from a previous story:
# http://trendct.org/2016/01/27/children-screened-for-lead-poisoning-in-connecticut/
# The original data set can be found on the CT Department of Health website:
# http://www.ct.gov/dph/cwp/view.asp?a=3140&q=387576
screenings_total <- read.csv("data/confirmed.town.total.csv", stringsAsFactors=FALSE)
## Note: BLL stands for blood lead levels. There are different groupings: 5, 10, 15, and 20.
## Health officials must be notified if a child has five or more micrograms of lead per deciliter of blood (µg/dL)
# BLL5 total children
bll5_total <- screenings_total[c("Town", "BLL5")]
colnames(bll5_total) <- c("id", "bll5")
# joining the data to the formatted shapefile
bll5_total_map <- left_join(towntracts, bll5_total)
dtm5_total <- ggplot() +
geom_polygon(data = bll5_total_map, aes(x=long, y=lat, group=group, fill=bll5), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Lead-poisoned children by town (BLL 5 and up)", fill="")
# BLL10 total children
bll10_total <- screenings_total[c("Town", "BLL10")]
colnames(bll10_total) <- c("id", "bll10")
bll10_total_map <- left_join(towntracts, bll10_total)
dtm10_total <- ggplot() +
geom_polygon(data = bll10_total_map, aes(x=long, y=lat, group=group, fill=bll10), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Lead-poisoned children by town (BLL 10 and up)", fill="")
# BLL15 total children
bll15_total <- screenings_total[c("Town", "BLL15")]
colnames(bll15_total) <- c("id", "bll15")
bll15_total_map <- left_join(towntracts, bll15_total)
dtm15_total <- ggplot() +
geom_polygon(data = bll15_total_map, aes(x=long, y=lat, group=group, fill=bll15), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="lead-poisoned children by town (BLL 15 and up)", fill="")
# BLL20 total children
bll20_total <- screenings_total[c("Town", "BLL20")]
colnames(bll20_total) <- c("id", "bll20")
bll20_total_map <- left_join(towntracts, bll20_total)
dtm20_total <- ggplot() +
geom_polygon(data = bll20_total_map, aes(x=long, y=lat, group=group, fill=bll20), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Lead-poisoned children by town (BLL 20 and up)", fill="")
# Output of the totals map
grid.arrange(dtm5_total,dtm10_total,dtm15_total,dtm20_total, ncol=2)
Let’s try it again with percent of children
# This data is a cleaned up spreadsheet from a previous project
# The original data set can be found on the CT Department of Health website:
# http://www.ct.gov/dph/cwp/view.asp?a=3140&q=387576
screenings_percent <- read.csv("data/confirmed.town.percent.csv", stringsAsFactors=FALSE)
# BLL5 percent
bll5_percent <- screenings_percent[c("Town", "BLL5")]
colnames(bll5_percent) <- c("id", "bll5")
bll5_percent_map <- left_join(towntracts, bll5_percent)
dtm5_percent <- ggplot() +
geom_polygon(data = bll5_percent_map, aes(x=long, y=lat, group=group, fill=bll5), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="% of lead-poisoned children by town (BLL 5 +)", fill="")
# BLL10 percent
bll10_percent <- screenings_percent[c("Town", "BLL10")]
colnames(bll10_percent) <- c("id", "bll10")
bll10_percent_map <- left_join(towntracts, bll10_percent)
dtm10_percent <- ggplot() +
geom_polygon(data = bll10_percent_map, aes(x=long, y=lat, group=group, fill=bll10), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Percent of lead-poisoned children by town (BLL 10 +)", fill="")
# BLL15 percent
bll15_percent <- screenings_percent[c("Town", "BLL15")]
colnames(bll15_percent) <- c("id", "bll15")
bll15_percent_map <- left_join(towntracts, bll15_percent)
dtm15_percent <- ggplot() +
geom_polygon(data = bll15_percent_map, aes(x=long, y=lat, group=group, fill=bll15), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="% of lead-poisoned children by town (BLL 15 +)", fill="")
# BLL20 percent
bll20_percent <- screenings_percent[c("Town", "BLL20")]
colnames(bll20_percent) <- c("id", "bll20")
bll20_percent_map <- left_join(towntracts, bll20_percent)
dtm20_percent <- ggplot() +
geom_polygon(data = bll20_percent_map, aes(x=long, y=lat, group=group, fill=bll20), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="% of lead-poisoned children by town (BLL 20 +)", fill="")
# Outputting the maps
grid.arrange(dtm5_percent,dtm10_percent,dtm15_percent,dtm20_percent, ncol=2)
# This data is a cleaned up spreadsheet from a previous project
# The original data set can be found on the CT Department of Health website:
# http://www.ct.gov/dph/cwp/view.asp?a=3140&q=387576
screened <- read.csv("data/children.screened.csv", stringsAsFactors=FALSE)
screened_total <- screened[c("Town", "Children.Under.6.Screened")]
colnames(screened_total) <- c("id", "Children.Under.6.Screened")
screened_total_map <- left_join(towntracts, screened_total)
dtm_s <- ggplot() +
geom_polygon(data = screened_total_map, aes(x=long, y=lat, group=group, fill=Children.Under.6.Screened), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Reds", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Children screened for lead-poisoning by town", fill="")
screened_percent <- screened[c("Town", "Percent.9m.2yrs.Screened")]
colnames(screened_percent) <- c("id", "Percent.9m.2yrs.Screened")
screened_percent_map <- left_join(towntracts, screened_percent)
dtm_p <- ggplot() +
geom_polygon(data = screened_percent_map, aes(x=long, y=lat, group=group, fill=Percent.9m.2yrs.Screened), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Reds", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="% of children screened by town", fill="")
grid.arrange(dtm_s,dtm_p, ncol=2)
What’s going on here
housing_towns <- getCensus(name="acs5",
vintage=2014,
key=censuskey,
vars=c("NAME", "B25034_001E", "B25034_002E", "B25034_003E",
"B25034_004E","B25034_005E","B25034_006E","B25034_007E",
"B25034_008E","B25034_009E","B25034_010E", "B17004_001E"
),
region="county subdivision:*", regionin="state:09")
housing_towns <- subset(housing_towns, county.subdivision!="00000")
housing_towns$id <- sub(" town.*","",housing_towns$NAME)
old <- housing_towns
# Calculating a housing score based on weighted categorizations of home ages
old$age_39 <- old$B25034_010E * .68
old$age40_59 <- (old$B25034_009E + old$B25034_008E) * .43
old$age60_79 <- (old$B25034_007E + old$B25034_006E) * .08
old$age80_99 <- (old$B25034_005E + old$B25034_004E) * .03
old$age00_10 <- (old$B25034_003E + old$B25034_002E) * 0
old$total <- old$B25034_001E
old$risk_sum <- old$age_39 + old$age40_59 + old$age60_79 + old$age80_99 + old$age80_99 + old$age00_10
old$risk <- old$risk_sum/old$total*100
poverty_towns <- getCensus(name="acs5",
vintage=2014,
key=censuskey,
vars=c("NAME", "B06012_001E", "B06012_002E"
),
region="county subdivision:*", regionin="state:09")
poverty_towns <- subset(poverty_towns, county.subdivision!="00000")
poverty_towns$id <- sub(" town.*","",housing_towns$NAME)
# Determining percent of those living beneath the poverty line
poverty_towns$percent_poverty <- poverty_towns$B06012_002E/poverty_towns$B06012_001E * 100
# Z score
# (x-mean(x))/sd(x)
poverty_towns$z_poverty <- (poverty_towns$percent_poverty - mean(poverty_towns$percent_poverty))/sd(poverty_towns$percent_poverty)
old$z_housing <- (old$risk - mean(old$risk))/sd(old$risk)
# Calcuating weighted figures
poverty_towns$weighted_poverty <- poverty_towns$z_poverty * .42
old$weighted_housing <- old$z_housing * .58
old <- old[c("id", "risk", "z_housing", "weighted_housing")]
df <- left_join(poverty_towns, old)
df$riskscore_raw <- df$weighted_housing + df$weighted_poverty
df <- mutate(df, quantile = ntile(riskscore_raw, 10))
## mapping
gpclibPermit()
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
towntracts <- readOGR(dsn="maps", layer="ctgeo")
## OGR data source with driver: ESRI Shapefile
## Source: "maps", layer: "ctgeo"
## with 169 features
## It has 6 fields
towntracts_only <- towntracts
towntracts <- fortify(towntracts, region="NAME10")
df_town_map <- left_join(towntracts, df)
dtm_df1 <- ggplot() +
geom_polygon(data = df_town_map , aes(x=long, y=lat, group=group, fill=quantile), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Estimated lead risk by town", fill="")
dtm_df1
# Comparing results
comparing <- df[c("id", "quantile", "riskscore_raw", "percent_poverty", "risk")]
comparing <- left_join(comparing, bll5_total)
comparing <- left_join(comparing, bll10_total)
comparing <- left_join(comparing, bll15_total)
comparing <- left_join(comparing, bll20_total)
colnames(bll5_percent) <- c("id", "bll5.percent")
comparing <- left_join(comparing, bll5_percent)
colnames(bll10_percent) <- c("id", "bll10.percent")
comparing <- left_join(comparing, bll10_percent)
colnames(bll15_percent) <- c("id", "bll15.percent")
comparing <- left_join(comparing, bll15_percent)
colnames(bll20_percent) <- c("id", "bll20.percent")
comparing <- left_join(comparing, bll20_percent)
#dtm_df1$quantile
#dtm_df1$riskscore_raw
corrgram(comparing,order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie)
# This data set is from another story
# Methodology on how to get the raw data is at
# https://github.com/trendct/data/tree/master/2016/03/lead-analysis#how-to-get-the-list-of-lead-violations-from-the-epa
ct <- read.csv("data/lead_ale_samples_ct.csv", stringsAsFactors=FALSE)
# The data set doesn't have the business address. But we can get it from another data set
# -- this one is a list of all public drinking systems in Connecticut
summary <- read.csv("data/water_system_summary.csv", stringsAsFactors=FALSE)
# Join it to the dataset with elevated lead action levels
ct <- left_join(ct, summary, by="PWS.ID")
# Creating a new column with the link-- fortunately the EPA's URL structure is based on a few variables we already have
ct$link <- paste0("https://oaspub.epa.gov/enviro/sdw_report_v3.first_table?pws_id=",
ct$PWS.ID, "&state=CT&source=",
ct$Primary.Source.Code, "&population=",
ct$Population.Served.Count)
# Looking at towns served by drinking water systems with elevated lead violations
served <- ct %>%
group_by(Cities.Served) %>%
dplyr::summarise(Total=n())
served_single <- served[!grepl(",", served$Cities.Served),]
served_single <- served_single[!grepl("-", served_single$Cities.Served),]
colnames(served_single) <- c("id", "Total")
served_single$id <- str_to_title(served_single$id)
comparing <- left_join(comparing, served_single)
cor(comparing$riskscore_raw, comparing$Total, use = "complete")
## [1] -0.1164858
comparing$Total[is.na(comparing$Total)] <- 0
cor(comparing$riskscore_raw, comparing$Total)
## [1] -0.2631888
# scrape addresses of listed drinking water headquarter by visiting each individual drinking water page
# creating a temp column
ct$address.city <- ","
# Visiting each link in the row, finding the third row in the first table and getting the string found there
ct <- read.csv("data/ct_dataframe.csv", stringsAsFactors=FALSE)
## The following chunk of code might take more time than you're willing to spend since it's scraping 500+ websites
## That's why I brought in the dataframe above
## If you'd like to unskip, just comment out the line above and uncomment the following lines where SKIPPING OPTION begins
### BEGIN SKIPPING OPTION
#for (i in 1:nrow(ct)) {
# i_url <- ct$link[i] %>% read_html()
# ct$address.city[i] <- i_url %>% html_node("tr:nth-child(3) td") %>%
# html_text()
#}
# Cleaning up the address names to isolate the city
#ct$town.city <- sub(",.*","",ct$address.city)
#ct$state <- sub(".*,","",ct$address.city)
#ct$state <- sub(" .*$","",ct$state)
### END SKIPPING OPTION
# Ignoring drinking water systems outside of Connecticut-- there's quite a few in New York and Mass.
ct_only <- subset(ct, state=="CT")
hq <- ct %>%
group_by(town.city) %>%
dplyr::summarise(Total=n())
# Bringing in a custom package I wrote that cleans up town names, renames hamlets and villages
library(ctnamecleaner)
hq2 <- ctnamecleaner(town.city, hq)
## Your file with fixed town names has been exported.
## Unfortunately, no matches were found for 11 They can be found in your folder. The file is called no_matches.csv
hq3 <- hq2[!is.na(hq2$real.town.name),]
hq4 <- hq3 %>%
group_by(real.town.name) %>%
dplyr::summarise(Total=sum(Total))
colnames(hq4) <- c("id", "town.hqs")
comparing <- left_join(comparing, hq4)
Based on the town where the drinking water system indicated it was located.
cor(comparing$riskscore_raw, comparing$town.hqs, use = "complete")
## [1] -0.1466271
#comparing$town.hqs[is.na(comparing$town.hqs)] <- 0
#cor(comparing$riskscore_raw, comparing$town.hqs)
## cleaner towns served instances
## Had to bring this out of R to clean by hand. Sorry!
## Now I'm bringing it back in
tserv <- read.csv("data/towns_served.csv", stringsAsFactors=FALSE)
tserv$Towns <- str_to_title(tserv$Towns)
colnames(tserv) <- c("id", "towns.served")
comparing <- left_join(comparing, tserv)
cor(comparing$riskscore_raw, comparing$towns.served, use = "complete")
## [1] -0.1561462
comparing$towns.served[is.na(comparing$towns.served)] <- 0
cor(comparing$riskscore_raw, comparing$towns.served)
## [1] -0.2575911
cor(comparing$bll20.percent, comparing$towns.served)
## [1] 0.01783099
cor(comparing$bll20.percent, comparing$town.hqs)
## [1] NA
Alright, there is not much of a correlation between number of elevated lead water violations from drinking water systems in the towns they serve.
served_count_map <- left_join(towntracts, comparing)
dtm_served <- ggplot() +
geom_polygon(data = served_count_map, aes(x=long, y=lat, group=group, fill=towns.served), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Towns served", fill="")
dtm_hqs <- ggplot() +
geom_polygon(data = served_count_map, aes(x=long, y=lat, group=group, fill=town.hqs), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="System address", fill="")
## Outputting the results
grid.arrange(dtm_served,dtm_hqs, ncol=2)
Towns based on locations drinking water systems indicated they were headquartered.
ppbs_towns <- ct %>%
group_by(town.city) %>%
summarize(median.ppb=median(Sample.Measure..mg.L.)*1000, average.ppb=mean(Sample.Measure..mg.L.)*1000)
ppbs_towns_clean <- ctnamecleaner(town.city, ppbs_towns)
## Your file with fixed town names has been exported.
## Unfortunately, no matches were found for 11 They can be found in your folder. The file is called no_matches.csv
ppbs_towns_clean <- ppbs_towns_clean[!is.na(ppbs_towns_clean$real.town.name),]
ppbs_towns_clean <- ppbs_towns_clean %>%
group_by(real.town.name) %>%
summarize(median.ppb=median(median.ppb), average.ppb=mean(average.ppb))
colnames(ppbs_towns_clean) <- c("id", "median.ppb.hq", "average.ppb.hq")
comparing <- left_join(comparing, ppbs_towns_clean)
ppb_map_hq <- left_join(towntracts, comparing)
dtm_median <- ggplot() +
geom_polygon(data = ppb_map_hq, aes(x=long, y=lat, group=group, fill=median.ppb.hq), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Median PPB by town", fill="")
dtm_avg <- ggplot() +
geom_polygon(data = ppb_map_hq, aes(x=long, y=lat, group=group, fill=average.ppb.hq), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Average PPB by town", fill="")
grid.arrange(dtm_median,dtm_avg, ncol=2)
And is there a correlation between the risk score and median/average ppb in water systems with elevated levels of lead reported?
cor(comparing$riskscore_raw, comparing$median.ppb.hq, use = "complete")
## [1] 0.02246281
cor(comparing$riskscore_raw, comparing$average.ppb.hq, use = "complete")
## [1] 0.03310123
comparing$median.ppb.hq[is.na(comparing$median.ppb.hq)] <- 0
comparing$average.ppb.hq[is.na(comparing$average.ppb.hq)] <- 0
cor(comparing$riskscore_raw, comparing$median.ppb.hq)
## [1] -0.0335078
cor(comparing$riskscore_raw, comparing$average.ppb.hq)
## [1] -0.03319068
Nope.
This time in towns based on locations drinking water systems indicated they were served.
ppbs_served <- ct %>%
group_by(Cities.Served) %>%
summarize(median.ppb=median(Sample.Measure..mg.L.)*1000, average.ppb=mean(Sample.Measure..mg.L.)*1000)
ppbs_served_clean <- ctnamecleaner(Cities.Served, ppbs_served)
## Your file with fixed town names has been exported.
## Unfortunately, no matches were found for 7 They can be found in your folder. The file is called no_matches.csv
ppbs_served_clean <- ppbs_served_clean[!is.na(ppbs_served_clean$real.town.name),]
ppbs_served_clean <- ppbs_served_clean %>%
group_by(real.town.name) %>%
summarize(median.ppb=median(median.ppb), average.ppb=mean(average.ppb))
colnames(ppbs_served_clean) <- c("id", "median.ppb.served", "average.ppb.served")
comparing <- left_join(comparing, ppbs_served_clean)
ppb_map_hq <- left_join(towntracts, comparing)
dtm_median_served <- ggplot() +
geom_polygon(data = ppb_map_hq, aes(x=long, y=lat, group=group, fill=median.ppb.served), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Median PPB by town served", fill="")
dtm_avg_served <- ggplot() +
geom_polygon(data = ppb_map_hq, aes(x=long, y=lat, group=group, fill=average.ppb.served), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Average PPB by town served", fill="")
grid.arrange(dtm_median_served,dtm_avg_served, ncol=2)
And is there a correlation between the risk score and median/average ppb in water systems with elevated levels of lead reported?
(Based on towns served)
cor(comparing$riskscore_raw, comparing$median.ppb.served, use = "complete")
## [1] 0.2089186
cor(comparing$riskscore_raw, comparing$average.ppb.served, use = "complete")
## [1] 0.2685803
comparing$median.ppb.served[is.na(comparing$median.ppb.served)] <- 0
comparing$average.ppb.served[is.na(comparing$average.ppb.served)] <- 0
cor(comparing$riskscore_raw, comparing$median.ppb.served)
## [1] -0.1116948
cor(comparing$riskscore_raw, comparing$average.ppb.served)
## [1] 0.02986775
Doesn’t look like it.
Adding factorization to the towns might yield more insights.
## urban or rural?
designation <- read.csv("data/urban_or_rural.csv", stringsAsFactors=FALSE)
town_des <- designation[c("NAME10", "Type", "perc_urban")]
colnames(town_des) <-c("id", "Type", "perc_urban")
comparing <- left_join(comparing, town_des)
big_df <- comparing
big_df_0 <- big_df
big_df_0[is.na(big_df_0)] <- 0
big_df_na <- big_df
big_df_na[big_df_na==0] <- NA
raw_scatter1_0 <- big_df_0[c("id", "riskscore_raw", "bll5.percent")]
raw_scatter1_0 <- raw_scatter1_0 %>%
gather("Type", "Score", 2:3)
p <- ggplot(big_df_na, aes(riskscore_raw, bll5.percent))
p + geom_point(aes(colour = factor(Type))) + labs(title = "Percent of children with lead detected in blood versus risk score by town type", x = "Risk score", y="% of children with lead detected in blood stream")
cor(big_df_na$riskscore_raw, big_df_na$bll5.percent, use="complete")
## [1] 0.2978781
p <- ggplot(big_df_na, aes(riskscore_raw, bll5))
p + geom_point(aes(colour = Type)) + labs(title = "Number of children with lead detected in blood versus risk score by town type", x = "Risk score", y="Children with lead detected in blood stream")
cor(big_df_0$riskscore_raw, big_df_0$bll5)
## [1] 0.571366
p <- ggplot(big_df_0, aes(bll5.percent, quantile))+ labs(title = "Number of children with lead detected in blood versus risk score by town type", x = "% of children with lead detected in blood stream", y="Risk score")
p + geom_point(aes(colour = factor(Type)))
cor(big_df_0$quantile, big_df_0$bll5.percent)
## [1] 0.3326064
corrgram(big_df_na,order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie)
Interesting.
p <- ggplot(big_df_na, aes(factor(Type), quantile))
p + geom_boxplot() + labs(title = "Risk scores by town type")
p <- ggplot(big_df_0, aes(factor(Type), bll5.percent))
p + geom_boxplot() + labs(title = "Percent of children with lead detected in blood by town type")
p <- ggplot(big_df_na, aes(factor(Type), median.ppb.served))
p + geom_boxplot() + labs(title = "Median ppb detected in some town's drinking water")
# final maps
big_df_na_map <- left_join(towntracts, big_df_na, by="id")
map1 <- ggplot() +
geom_polygon(data =big_df_na_map, aes(x=long, y=lat, group=group, fill=riskscore_raw), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Lead risk score", fill="")
map2 <- ggplot() +
geom_polygon(data =big_df_na_map, aes(x=long, y=lat, group=group, fill=bll5), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Lead-poisoned children by town (BLL 5 and up)", fill="")
map3 <- ggplot() +
geom_polygon(data =big_df_na_map, aes(x=long, y=lat, group=group, fill=towns.served), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Median PPB detected in drinking water systems", fill="")
map4 <- ggplot() +
geom_polygon(data =big_df_na_map, aes(x=long, y=lat, group=group, fill=median.ppb.served), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", trans="reverse", palette = "Blues", breaks=pretty_breaks(n=10)) +
theme_nothing(legend=TRUE) +
labs(title="Frequency of elevated action lead levels detected in drinking water", fill="")
grid.arrange(map2, map1, map3, map4, ncol=2)
Large urban areas like New Haven and Hartford have some of the highest risks for lead exposure, and the data on children who test positive for high lead levels in their blood reflects that. But the formula indicates there’s also high risk in pockets of towns like Mansfield, Norwich and Danbury.
After applying the formula to towns and comparing it to the number of children reported to have high lead levels, we found a strong correlation between the estimate and the actuality (a coefficient of about .57).
According to our analysis, there’s a closer link between lead poisoning and old buildings or poverty than to drinking water systems with high lead levels.
This is in line with findings by the Connecticut Department of Health in 2013.
Though the tragedy in Flint, Mich., where thousands of children drank toxic water in their homes and schools, has increased awareness of lead in public drinking water systems, Connecticut officials did not trace any of the 140 cases of lead-poisoned children it investigated in 2013 back to drinking water.
Most cases were attributed to paint- and dust-related hazards.
Read the full story at TrendCT.org.