An analysis for the TrendCT story: Increase in gun purchases triggered by Connecticut state legislation
The data is/are from the Buzzfeed repo that scraped the FBI NICS PDFs on the number of firearm checks by month, state, and type.
Check the Trend CT repo for the scripts and data sets that were used and generated. Just want the seasonally adjusted state per capita data? Here’s the csv.
There are many caveats to the data, but the basic thing to take away is that a background check does not necessarily mean a gun sale. The Buzzfeed repo has more details.
This analysis will go over the process used to put together the story. In short, the background checks were normalized for annual population by state, adjusted for seasonality, and then charted.
Population estimates
The historical population data was gathered from Census.gov.
# Packages we'll need
library(knitr)
library(stringr)
library(ggplot2)
library(lubridate)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(seasonal)
us <- read.csv("data/historicalpop-US.csv", stringsAsFactors=FALSE)
states <- read.csv("data/historicalpop-StatesT.csv", stringsAsFactors=FALSE)
kable(head(us))
1998 |
275854104 |
1999 |
279040168 |
2000 |
282162411 |
2001 |
284968955 |
2002 |
287625193 |
2003 |
290107933 |
kable(head(states[,1:5]))
1998 |
4404701 |
619932 |
4883342 |
2626289 |
1999 |
4430141 |
624779 |
5023823 |
2651860 |
2000 |
4452173 |
627963 |
5160586 |
2678588 |
2001 |
4467634 |
633714 |
5273477 |
2691571 |
2002 |
4480089 |
642337 |
5396255 |
2705927 |
2003 |
4503491 |
648414 |
5510364 |
2724816 |
Bringing in the data from the Buzzfeed repo.
nics <- read.csv("data/nics-firearm-background-checks.csv", stringsAsFactors=FALSE)
# Quick trimming up of leading and trailing white spaces
# Using the stringr package
nics$state <- str_trim(nics$state)
kable(head(nics[,1:5]))
2015-11 |
Alabama |
18870 |
23022 |
22650 |
2015-11 |
Alaska |
209 |
3062 |
3209 |
2015-11 |
Arizona |
2303 |
12382 |
9041 |
2015-11 |
Arkansas |
3298 |
6359 |
11611 |
2015-11 |
California |
98452 |
41181 |
35007 |
2015-11 |
Colorado |
4144 |
19784 |
16082 |
Preliminary annual US analysis
# Adding a column for year in the dataframe
# But first have to convert the data into a date-friendly format
nics$date <- as.Date(paste(nics$month,"-01",sep=""))
nics$year <- year(nics$date)
# Creating a new dataframe with the total background checks in the country by year
annual <- data.frame(tapply(nics$totals, nics$year, sum))
# Cleaning up the new dataframe
annual$year <- rownames(annual)
rownames(annual) <- NULL
colnames(annual) <- c("Total", "Year")
annual <- annual[c("Year", "Total")]
# Merging background checks to population dataframe
annual <- merge(annual, us)
# Calculating per capita (1,000 residents)
annual$percapita <- round((annual$Total/annual$US)*1000,2)
# 1998 is an incomplete year, so let's take that out
annual <- annual[-1,]
kable(head(annual))
2 |
1999 |
9043747 |
279040168 |
32.41 |
3 |
2000 |
8427096 |
282162411 |
29.87 |
4 |
2001 |
8820045 |
284968955 |
30.95 |
5 |
2002 |
8367069 |
287625193 |
29.09 |
6 |
2003 |
8402244 |
290107933 |
28.96 |
7 |
2004 |
8579891 |
292805298 |
29.30 |
Charting the total background checks by year
ggplot(data=annual, aes(x=Year,y=Total, group=1)) +
geom_line() +
ggtitle("Background checks for firearms in the US") +
labs(x="Year", y="Total")
Charting the total background checks by year per capita
ggplot(data=annual, aes(x=Year,y=percapita, group=1)) +
geom_line() +
ggtitle("Background checks per capita for firearms in the US") +
labs(x="Year", y="Per 1,000 residents")
Not much difference, right.
Monthly totals for the US
monthly <-data.frame(tapply(nics$totals, nics$date, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
kable(head(monthly))
1998-11-01 |
21176 |
1998-12-01 |
870722 |
1999-01-01 |
585974 |
1999-02-01 |
690215 |
1999-03-01 |
741687 |
1999-04-01 |
638666 |
Monthly background checks for the US per capita
monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)
# Join by annual population from the Census
monthly <- left_join(monthly, us)
## Joining by: "Year"
monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)
us_month <- monthly[c("Month", "percapita")]
colnames(us_month) <- c("Month", "US")
kable(head(us_month))
1998-11-01 |
0.08 |
1998-12-01 |
3.16 |
1999-01-01 |
2.10 |
1999-02-01 |
2.47 |
1999-03-01 |
2.66 |
1999-04-01 |
2.29 |
Monthly totals for Connecticut
ct <- subset(nics, state=="Connecticut")
ctpop <- read.csv("data/historicalpop-CT.csv", stringsAsFactors=FALSE)
ct_monthly<-data.frame(tapply(ct$totals, ct$month, sum))
ct_monthly$date <- rownames(ct_monthly)
rownames(ct_monthly) <- NULL
colnames(ct_monthly) <- c("Total", "Month")
ct_monthly<- ct_monthly[c("Month", "Total")]
kable(head(ct_monthly))
1998-11 |
80 |
1998-12 |
6790 |
1999-01 |
6265 |
1999-02 |
8069 |
1999-03 |
7877 |
1999-04 |
9111 |
Monthly checks per capita: US vs CT
ct_monthly$Month <- as.Date(paste(ct_monthly$Month, "-01", sep=""))
ct_monthly$Month <- ymd(ct_monthly$Month)
ct_monthly$Year <- year(ct_monthly$Month)
ct_monthly <- left_join(ct_monthly, ctpop)
## Joining by: "Year"
ct_monthly$percapita <- round((ct_monthly$Total/ct_monthly$CT)*1000,2)
ct_month <- ct_monthly[c("Month", "percapita")]
colnames(ct_month) <- c("Month", "CT")
ct_us_month <- left_join(us_month, ct_month)
## Joining by: "Month"
# Prepping the dataframe for ggplot
ct_us_month_gg <- gather(ct_us_month, "State", "Per.Capita", 2:3)
## Warning: attributes are not identical across measure variables; they will
## be dropped
ggplot(data=ct_us_month_gg, aes(x=Month,y=Per.Capita, group=State, colour=State)) +
geom_line() +
ggtitle("Background checks per capita for firearms in the US and CT") +
labs(x="Year", y="Per 1,000 residents")
Calculating the background checks per capita by state
# per capita for US one more time
monthly <-data.frame(tapply(nics$totals, nics$month, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
monthly$Month <- as.Date(paste(monthly$Month, "-01", sep=""))
monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)
monthly <- left_join(monthly, us)
monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)
by_month <- monthly[c("Month", "percapita")]
colnames(by_month) <- c("Month", "US")
by_month$Month <- substr(by_month$Month, 1, 7)
# Monthly for all states now
# Restructure data.frame
totals_only <- nics[c("month", "state", "totals")]
spreaded_totals <- spread(totals_only, state, totals)
spreaded_totals <- spreaded_totals[,colSums(is.na(spreaded_totals))<nrow(spreaded_totals)]
spreaded_totals$year <- substr(spreaded_totals$month, 1, 4)
spreaded_totals$year <- as.numeric(spreaded_totals$year)
# Setting up the loop to go through the two dataframes
# - One with annual historical state population
# - One from the NICS for total background checks
states_num <- ncol(states)
states_list <- 2:states_num
# This is the messy loop
for (i in states_list) {
state_name <- colnames(states[i])
temp_df <- states[c("Year", state_name)]
colnames(temp_df) <- c("year", "Population")
state_name <- gsub("\\.", " ", state_name)
test_this <- grepl(state_name, colnames(spreaded_totals))
test_sum <- sum(test_this)
if (test_sum>0) {
# looking just at totals
nics_df <- spreaded_totals[c("month", "year", state_name)]
temp_df <- left_join(temp_df, nics_df)
temp_df$per_capita <- round((temp_df[,4]/temp_df[,2])*1000,2)
temp_df <- temp_df[c("month", "per_capita")]
colnames(temp_df) <- c("Month", state_name)
by_month <- left_join(by_month, temp_df)
}
}
Charting out the state per capita results
test_df <- by_month
test_df$Month <- factor(test_df$Month)
test_df <- gather(test_df, "State", "Per.Capita", 2:53)
## Warning: attributes are not identical across measure variables; they will
## be dropped
ggplot(data=test_df, aes(x=test_df$Month,y=Per.Capita, group=State)) +
geom_line() +
ggtitle("Background checks by state") +
labs(x="Month", y="Per 1,000 residents")
Charting out the state per capita with small multiples
ggplot(data=test_df, aes(x=Month,y=Per.Capita)) +
geom_bar(stat="identity") +
facet_wrap(~State) +
ggtitle("Background checks by state") +
theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
theme(axis.text.x = element_text(angle=90))
US vs. CT adjusted
usct <- us_col[c("Month", "Adjusted")]
colnames(usct) <- c("Month", "US.Adjusted")
ct_adjusted <- dfct[c("Month", "Adjusted")]
colnames(ct_adjusted) <- c("Month", "CT.Adjusted")
usct <- left_join(usct, ct_adjusted)
## Joining by: "Month"
kable(head(usct))
1998-12-01 |
2.19 |
1.83 |
1999-01-01 |
2.25 |
1.89 |
1999-02-01 |
2.45 |
2.30 |
1999-03-01 |
2.52 |
1.97 |
1999-04-01 |
2.53 |
2.43 |
1999-05-01 |
2.65 |
2.46 |
Charting out the seasonally adjusted per capita data
us_seas_only <- us_col[c("Month", "Adjusted")]
us_seas_only$Month <- as.character(us_seas_only$Month)
colnames(us_seas_only) <- c("Month", "US")
us_seas_only$US <- as.numeric(as.character(us_seas_only$US))
months_only <- as.character(us_seas_only$Month)
adj_seas <- by_month
adj_seas_num <- 2:ncol(adj_seas)
adj_seas_list <- colnames(adj_seas)
adj_seas$Month <- as.Date(adj_seas$Month, format="%Y-%m")
for (i in adj_seas_num) {
adj_seas_col <- colnames(adj_seas[i])
seas_df_subset <- adj_seas[c("Month", adj_seas_col)]
subset_series <- ts(seas_df_subset[,2],frequency=12,start=c(1998,11))
subset_seas <- seas(subset_series)
subset_seas_df <- as.data.frame(final(subset_seas))
subset_seas_df <- subset_seas_df[-1,]
months_only_df <- months_only
months_only_df <- as.data.frame(cbind(months_only_df, subset_seas_df))
colnames(months_only_df) <- c("Month", adj_seas_col)
months_only_df$Month <- as.character(months_only_df$Month)
months_only_df[,2] <- as.numeric(as.character(months_only_df[,2]))
months_only_df[,2] <- round(months_only_df[,2], 2)
if (!exists("months_only_all")) {
months_only_all <- left_join(us_seas_only, months_only_df)
} else {
months_only_all <- left_join(months_only_all, months_only_df)
}
}
kable(head(months_only_all[,1:5]))
1998-12-01 |
2.19 |
4.59 |
5.38 |
2.75 |
1999-01-01 |
2.25 |
4.54 |
5.61 |
2.73 |
1999-02-01 |
2.45 |
4.63 |
5.47 |
2.81 |
1999-03-01 |
2.52 |
4.65 |
5.65 |
2.89 |
1999-04-01 |
2.53 |
4.53 |
5.83 |
2.96 |
1999-05-01 |
2.65 |
4.59 |
5.67 |
2.91 |
# Exporting seasonally-adjusted figures to csv
write.csv(months_only_all, "data/states_monthly_adjusted.csv")
# Charting out with ggplot
for_gg <- months_only_all
# Adjusting timestamp just for cosmetic reasons
for_gg$Month <- substring(for_gg$Month, 1,7)
# Prepping the time stamps for ggplot
for_gg$Month <- factor(for_gg$Month)
# Reshaping the dataframe for ggplot and also D3
for_gg <- gather(for_gg, "State", "Per.Capita", 2:53)
## Charting it in one line graph
ggplot(data=for_gg, aes(x=for_gg$Month,y=Per.Capita, group=State)) +
geom_line() +
ggtitle("Background checks by state") +
labs(x="Month", y="Per 1,000 residents")
Charting it using small multiples
ggplot(data=for_gg, aes(x=Month,y=Per.Capita)) +
geom_bar(stat="identity") +
facet_wrap(~State) +
ggtitle(expression(atop("Background checks by state", atop(italic("Adjusted for seasonality"), "")))) +
theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
theme(axis.text.x = element_text(angle=90))
## Warning: Stacking not well defined when ymin != 0
## Warning: Stacking not well defined when ymin != 0
# Exporting for D3
write.table(for_gg, file="seasonally_adjusted.tsv", quote=FALSE, sep='\t', col.names=NA)