This is the methodology used for the story: The impact of HIV in Connecticut and across the country

Visit the repo for the data used in this analysis.

Data for this analysis was provided by AIDSVu.org, compiled from the Centers for Disease Control & Prevention, the U.S. Census, and local health departments.

What’s in this script

Loading all the libraries
library(readxl)
library(corrplot)
library(dplyr)
library(stringr)
library(dplyr)
library(tidyr)
library(rgeos)
library(maptools)
library(ggplot2)   # devtools::install_github("hadley/ggplot2") only if you want subtitles/captions
library(ggalt)
library(ggthemes)
library(albersusa) # devtools::install_github("hrbrmstr/albersusa")
library(viridis)
library(scales)
library(knitr)
Data importing and preparation
y2013 <- read_excel("data/AIDSVu_County_Prev_2013.xlsx", sheet=1, skip=2)
y2014 <- read_excel("data/AIDSVu_County_NewDX_2014.xlsx", sheet=1, skip=2)

colnames(y2013) <- make.names(colnames(y2013))
colnames(y2014) <- make.names(colnames(y2014))

y2013$GEO.ID <-  ifelse(nchar(y2013$GEO.ID) == 4, paste0("0", y2013$GEO.ID), y2013$GEO.ID)
y2014$GEO.ID <-  ifelse(nchar(y2014$GEO.ID) == 4, paste0("0", y2014$GEO.ID), y2014$GEO.ID)


y2013nums <- y2013
y2013nums$State.Abbreviation <- NULL
y2013nums$State <- NULL
y2013nums$County.Name <- NULL
y2013nums$County.Rate.Stability <- NULL
y2013nums$Male.Rate.Stability <- NULL
y2013nums$Female.Rate.Stability <- NULL
y2013nums$Black.Rate.Stability <- NULL
y2013nums$Male.Rate.Stability <- NULL
y2013nums$White.Rate.Stability <- NULL
y2013nums$Hispanic.Rate.Stability <- NULL
y2013nums$Age.13.24.Rate.Stability <- NULL
y2013nums$Age.25.34.Rate.Stability <- NULL
y2013nums$Age.35.44.Rate.Stability <- NULL
y2013nums$Age.45.54.Rate.Stability <- NULL
y2013nums$Age.55..Rate.Stability <- NULL

y2013b <- y2013

State counties average of rate

state_sum <- y2013 %>% 
  group_by(State) %>%
  summarise(Rate=mean(County.Rate))
colnames(state_sum) <- c("County", "Overall.Rate")


state_black <- y2013 %>% 
  group_by(State) %>%
  summarise(Rate=mean(Black.Rate))
colnames(state_black) <- c("County", "Black.Rate")

state_hispanic <- y2013 %>% 
  group_by(State) %>%
  summarise(Rate=mean(Hispanic.Rate))
colnames(state_hispanic) <- c("County", "Hispanic.Rate")

state_white <- y2013 %>% 
  group_by(State) %>%
  summarise(Rate=mean(White.Rate))
colnames(state_white) <- c("County", "White.Rate")

state_all <- left_join(state_sum, state_black)
## Joining by: "County"
state_all <- left_join(state_all, state_hispanic)
## Joining by: "County"
state_all <- left_join(state_all, state_white)
## Joining by: "County"
kable(state_all)
County Overall.Rate Black.Rate Hispanic.Rate White.Rate
Alabama 215.208955 459.686567 88.358209 75.835821
Alaska -4.000000 -4.000000 -4.000000 -4.000000
Arizona 133.800000 270.933333 155.800000 94.066667
Arkansas 138.226667 252.973333 77.973333 70.013333
California 185.448276 599.224138 188.586207 191.344828
Colorado 88.250000 207.109375 70.734375 73.187500
Connecticut 245.125000 992.000000 616.250000 129.875000
Delaware 333.000000 -2.000000 -2.000000 -2.000000
Florida 460.283582 1713.343284 452.328358 212.059701
Georgia 314.672956 803.433962 167.075472 90.088050
Hawaii 121.000000 319.200000 140.000000 230.800000
Idaho 24.045455 -2.000000 -2.000000 -2.000000
Illinois 112.500000 26.029412 7.794118 2.931373
Indiana 76.902174 7.695652 3.000000 2.141304
Iowa 28.101010 54.242424 17.191919 19.383838
Kansas 30.257143 33.304762 12.857143 6.752381
Kentucky 73.941667 60.608333 31.025000 11.325000
Louisiana 332.359375 700.687500 622.109375 131.109375
Maine 70.625000 243.562500 95.750000 62.375000
Maryland 367.958333 1027.375000 277.791667 135.125000
Massachusetts 274.071429 1077.357143 775.214286 176.642857
Michigan 61.289157 221.289157 55.361446 36.722892
Minnesota 38.770115 208.747126 129.850575 20.701149
Mississippi 289.695122 536.329268 95.865854 68.670732
Missouri 78.956522 255.634783 50.269565 54.521739
Montana 9.107143 -1.000000 1.964286 7.250000
Nebraska 16.569893 21.559140 5.129032 1.462366
Nevada 86.647059 414.764706 72.764706 74.529412
New Hampshire 82.600000 -2.000000 -2.000000 -2.000000
New Jersey 428.476190 1522.761905 663.714286 174.047619
New Mexico 96.151515 276.969697 93.181818 70.545455
New York 344.322581 1509.048387 1895.822581 136.274193
North Carolina 242.230000 669.300000 170.500000 87.260000
North Dakota 3.792453 -1.000000 -1.000000 -1.000000
Ohio 96.761364 464.920455 98.772727 65.238636
Oklahoma 74.649351 131.883117 38.675325 58.311688
Oregon 67.638889 130.916667 75.944444 60.333333
Pennsylvania 129.477612 645.462687 397.194030 72.164179
Puerto Rico -4.000000 -4.000000 -4.000000 -4.000000
Rhode Island 140.600000 541.400000 235.400000 102.800000
South Carolina 408.934783 867.152174 251.934783 112.021739
South Dakota -2.000000 -2.000000 -2.000000 -2.000000
Tennessee 118.305263 313.463158 77.789474 75.031579
Texas 137.102362 317.090551 104.023622 61.023622
Utah 25.172414 135.551724 26.000000 20.655172
Vermont 58.285714 157.785714 33.071429 51.428571
Virginia 231.917293 659.383459 109.022556 98.646616
Washington 77.538461 243.923077 89.256410 64.256410
West Virginia 77.290909 264.854545 46.109091 45.236364
Wisconsin 48.666667 184.152778 44.597222 29.791667
Wyoming 32.478261 -2.000000 -2.000000 -2.000000
NA NA NA NA NA

County rate

Can swap out state abbrevation to get the data for any state.

ct_2013 <- subset(y2013, State.Abbreviation=="CT")

ct_2013 <- ct_2013 %>%
  select(County.Name, County.Rate, Black.Rate, White.Rate, Hispanic.Rate)

ct_2013$County.Name <- gsub(" County", "", ct_2013$County.Name)
kable(ct_2013)
County.Name County.Rate Black.Rate White.Rate Hispanic.Rate
Fairfield 343 1228 143 632
Hartford 406 966 157 1156
Litchfield 112 769 97 221
Middlesex 176 920 118 646
New Haven 422 1300 195 847
New London 220 1015 140 507
Tolland 92 365 78 204
Windham 190 1373 111 717

Looking at 2013 diagnoses data

cmap <- fortify(counties_composite(), region="fips")

y2013 <- subset(y2013b, County.Rate>1)

y2013$breaks <- "NA"
y2013$breaks <- ifelse(y2013$County.Rate < 41, "0-40", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 41 & y2013$County.Rate <= 60, "41-60", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 61 & y2013$County.Rate <= 70, "61-70", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 71 & y2013$County.Rate <= 90, "71-90", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 91 & y2013$County.Rate <= 110, "91-110", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 111 & y2013$County.Rate <= 140, "111-140", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 141 & y2013$County.Rate <= 190, "141-190", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 191 & y2013$County.Rate <= 260, "191-260", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 261 & y2013$County.Rate <= 410, "260-410", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Rate >= 411, "411+", y2013$breaks)

y2013$breaks <- factor(y2013$breaks, levels=c("0-40", "41-60", "61-70", "71-90", "91-110", "111-140", "141-190", "191-260", "260-410","411+"))

gg <- ggplot()
gg <- gg + geom_map(data=cmap, map=cmap,
                    aes(x=long, y=lat, map_id=id),
                    color="#2b2b2b", size=0.05, fill=NA)
gg <- gg + geom_map(data=y2013, map=cmap,
                    aes(fill=breaks, map_id=GEO.ID),
                    color="#2b2b2b", size=0.05, na.rm=TRUE)
gg <- gg + scale_fill_manual(values = c('#f7fcf0','#e0f3db','#ccebc5','#a8ddb5','#7bccc4','#4eb3d3','#2b8cbe','#0868ac','#084081', "#081d58"), name="Rate")
gg <- gg + labs(title="Rates of persons living with diagnosed HIV",
                subtitle="In 2013 (per 100,000 residents)",
                caption="TrendCT.org\nSOURCE: AIDSVu")
gg <- gg + coord_proj(us_laea_proj)
gg <- gg + theme_map(base_family="Arial Narrow")
gg <- gg + theme(legend.position=c(0.8, 0.25))
gg <- gg + theme(plot.title=element_text(face="bold", size=14, margin=margin(b=6)))
gg <- gg + theme(plot.subtitle=element_text(size=10, margin=margin(b=-14)))
gg

y2013 <- subset(y2013b, County.Cases>=1)

y2013$breaks <- "NA"
y2013$breaks <- ifelse(y2013$County.Cases >= 5 & y2013$County.Cases <=8 , "5-8", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 9 & y2013$County.Cases <= 12, "9-12", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 13 & y2013$County.Cases <= 18, "13-18", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 19 & y2013$County.Cases <= 24, "19-24", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 25 & y2013$County.Cases <= 36, "25-36", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 37 & y2013$County.Cases <= 56, "37-56", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 57 & y2013$County.Cases <= 96, "57-96", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 97 & y2013$County.Cases <= 166, "97-166", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 167 & y2013$County.Cases <= 518, "167-518", y2013$breaks)
y2013$breaks <- ifelse(y2013$County.Cases >= 519, "519+", y2013$breaks)

y2013$breaks <- factor(y2013$breaks, levels = c("5-8", "9-12", "13-18", "19-24", "25-36", "37-56", "57-96", "97-166", "167-518","519+"))

gg <- ggplot()
gg <- gg + geom_map(data=cmap, map=cmap,
                    aes(x=long, y=lat, map_id=id),
                    color="#2b2b2b", size=0.05, fill=NA)
gg <- gg + geom_map(data=y2013, map=cmap,
                    aes(fill=breaks, map_id=GEO.ID),
                    color="#2b2b2b", size=0.05)
gg <- gg + scale_fill_manual(values = c('#f7fcf0','#e0f3db','#ccebc5','#a8ddb5','#7bccc4','#4eb3d3','#2b8cbe','#0868ac','#084081', "#081d58"), name="Cases")
gg <- gg + labs(title="Number of persons living with diagnosed HIV in 2013",
                caption="TrendCT.org\nSOURCE: AIDSVu")
gg <- gg + coord_proj(us_laea_proj)
gg <- gg + theme_map(base_family="Arial Narrow")
gg <- gg + theme(legend.position=c(0.8, 0.25))
gg <- gg + theme(plot.title=element_text(face="bold", size=14, margin=margin(b=6)))
gg <- gg + theme(plot.subtitle=element_text(size=10, margin=margin(b=-14)))
gg

Looking at 2014 new diagnoses data

y2014b <- y2014

y2014 <- subset(y2014b, New.Diagnoses.Rate>1)

y2014$breaks <- "NA"
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate < 6, "0-6", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 7 & y2014$New.Diagnoses.Rate <= 8, "7-8", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 9 & y2014$New.Diagnoses.Rate <= 9, "9-9", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 10 & y2014$New.Diagnoses.Rate <= 11, "10-11", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 12 & y2014$New.Diagnoses.Rate <= 14, "12-14", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 15 & y2014$New.Diagnoses.Rate <= 17, "15-17", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 18 & y2014$New.Diagnoses.Rate <= 21, "18-21", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 22 & y2014$New.Diagnoses.Rate <= 28, "22-28", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 29 & y2014$New.Diagnoses.Rate <= 40, "29-40", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Rate >= 41, "41+", y2014$breaks)

y2014$breaks <- factor(y2014$breaks, levels=c("0-6", "7-8", "9-9", "10-11", "12-14", "15-17", "18-21", "22-28", "29-40","41+"))



gg <- ggplot()
gg <- gg + geom_map(data=cmap, map=cmap,
                    aes(x=long, y=lat, map_id=id),
                    color="#2b2b2b", size=0.05, fill=NA)
gg <- gg + geom_map(data=y2014, map=cmap,
                    aes(fill=breaks, map_id=GEO.ID),
                    color="#2b2b2b", size=0.05)
gg <- gg + scale_fill_manual(values = c('#fff7ec','#fee8c8','#fdd49e','#fdbb84','#fc8d59','#ef6548','#d7301f','#b30000','#7f0000', '#67000d'), name="Rate")
gg <- gg + labs(title="Rates of persons newly diagnosed with HIV",
                subtitle="In 2014 (per 100,000)",
                caption="TrendCT.org\nSOURCE: AIDSVu")
gg <- gg + coord_proj(us_laea_proj)
gg <- gg + theme_map(base_family="Arial Narrow")
gg <- gg + theme(legend.position=c(0.8, 0.25))
gg <- gg + theme(plot.title=element_text(face="bold", size=14, margin=margin(b=6)))
gg <- gg + theme(plot.subtitle=element_text(size=10, margin=margin(b=-14)))
gg

y2014 <- subset(y2014b, New.Diagnoses.Cases>1)

y2014$breaks <- "NA"
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases == 5, "5", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases == 6, "6", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 7 & y2014$New.Diagnoses.Cases <= 8, "7-8", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 9 & y2014$New.Diagnoses.Cases <= 10, "9-10", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 11 & y2014$New.Diagnoses.Cases <= 13, "11-13", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 14 & y2014$New.Diagnoses.Cases <= 19, "14-19", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 20 & y2014$New.Diagnoses.Cases <= 28, "20-28", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 29 & y2014$New.Diagnoses.Cases <= 49, "29-49", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 50 & y2014$New.Diagnoses.Cases <= 111, "50-111", y2014$breaks)
y2014$breaks <- ifelse(y2014$New.Diagnoses.Cases >= 112, "112+", y2014$breaks)

y2014$breaks <- factor(y2014$breaks, levels=c("5", "6", "7-8", "9-10", "11-13", "14-19", "20-28", "29-49", "50-111","112+"))


gg <- ggplot()
gg <- gg + geom_map(data=cmap, map=cmap,
                    aes(x=long, y=lat, map_id=id),
                    color="#2b2b2b", size=0.05, fill=NA)
gg <- gg + geom_map(data=y2014, map=cmap,
                    aes(fill=breaks, map_id=GEO.ID),
                    color="#2b2b2b", size=0.05)
gg <- gg + scale_fill_manual(values = c('#fff7ec','#fee8c8','#fdd49e','#fdbb84','#fc8d59','#ef6548','#d7301f','#b30000','#7f0000', '#67000d'), name="Cases")
gg <- gg + labs(title="Number of persons newly diagnosed with HIV in 2014",
                caption="TrendCT.org\nSOURCE: AIDSVu")
gg <- gg + coord_proj(us_laea_proj)
gg <- gg + theme_map(base_family="Arial Narrow")
gg <- gg + theme(legend.position=c(0.8, 0.25))
gg <- gg + theme(plot.title=element_text(face="bold", size=14, margin=margin(b=6)))
gg <- gg + theme(plot.subtitle=element_text(size=10, margin=margin(b=-14)))
gg

Looking at 2014 new diagnoses data for Connecticut

ct_2014 <- subset(y2014, State.Abbreviation=="CT")

ct_2014 <- ct_2014 %>%
  select(County.Name, New.Diagnoses.Rate, New.Diagnoses.Cases)

ct_2014$County.Name <- gsub(" County", "", ct_2014$County.Name)

kable(ct_2014)
County.Name New.Diagnoses.Rate New.Diagnoses.Cases
Fairfield 13 101
Hartford 11 81
Middlesex 7 10
New Haven 12 86
New London 7 16
Tolland 4 5