An analysis for the TrendCT story: Increase in gun purchases triggered by Connecticut state legislation

The data is/are from the Buzzfeed repo that scraped the FBI NICS PDFs on the number of firearm checks by month, state, and type.

Check the Trend CT repo for the scripts and data sets that were used and generated. Just want the seasonally adjusted state per capita data? Here’s the csv.

There are many caveats to the data, but the basic thing to take away is that a background check does not necessarily mean a gun sale. The Buzzfeed repo has more details.

This analysis will go over the process used to put together the story. In short, the background checks were normalized for annual population by state, adjusted for seasonality, and then charted.

Population estimates

The historical population data was gathered from Census.gov.

# Packages we'll need
library(knitr)
library(stringr)
library(ggplot2)
library(lubridate)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(seasonal)

us <- read.csv("data/historicalpop-US.csv", stringsAsFactors=FALSE)
states <- read.csv("data/historicalpop-StatesT.csv", stringsAsFactors=FALSE)
kable(head(us))
Year US
1998 275854104
1999 279040168
2000 282162411
2001 284968955
2002 287625193
2003 290107933
kable(head(states[,1:5]))
Year Alabama Alaska Arizona Arkansas
1998 4404701 619932 4883342 2626289
1999 4430141 624779 5023823 2651860
2000 4452173 627963 5160586 2678588
2001 4467634 633714 5273477 2691571
2002 4480089 642337 5396255 2705927
2003 4503491 648414 5510364 2724816

Bringing in the data from the Buzzfeed repo.

nics <- read.csv("data/nics-firearm-background-checks.csv", stringsAsFactors=FALSE)
# Quick trimming up of leading and trailing white spaces
# Using the stringr package
nics$state <- str_trim(nics$state)
kable(head(nics[,1:5]))
month state permit handgun long_gun
2015-11 Alabama 18870 23022 22650
2015-11 Alaska 209 3062 3209
2015-11 Arizona 2303 12382 9041
2015-11 Arkansas 3298 6359 11611
2015-11 California 98452 41181 35007
2015-11 Colorado 4144 19784 16082

Preliminary annual US analysis

# Adding a column for year in the dataframe
# But first have to convert the data into a date-friendly format

nics$date <- as.Date(paste(nics$month,"-01",sep=""))
nics$year <- year(nics$date)

# Creating a new dataframe with the total background checks in the country by year
annual <- data.frame(tapply(nics$totals, nics$year, sum))

# Cleaning up the new dataframe
annual$year <- rownames(annual)
rownames(annual) <- NULL
colnames(annual) <- c("Total", "Year")
annual <- annual[c("Year", "Total")]

# Merging background checks to population dataframe
annual <- merge(annual, us)

# Calculating per capita (1,000 residents)
annual$percapita <- round((annual$Total/annual$US)*1000,2)

# 1998 is an incomplete year, so let's take that out
annual <- annual[-1,]

kable(head(annual))
Year Total US percapita
2 1999 9043747 279040168 32.41
3 2000 8427096 282162411 29.87
4 2001 8820045 284968955 30.95
5 2002 8367069 287625193 29.09
6 2003 8402244 290107933 28.96
7 2004 8579891 292805298 29.30

Charting the total background checks by year

ggplot(data=annual, aes(x=Year,y=Total, group=1)) +
  geom_line() +
  ggtitle("Background checks for firearms in the US") +
  labs(x="Year", y="Total")

Charting the total background checks by year per capita

ggplot(data=annual, aes(x=Year,y=percapita, group=1)) +
  geom_line() +
  ggtitle("Background checks per capita for firearms in the US") +
  labs(x="Year", y="Per 1,000 residents")

Not much difference, right.

Monthly totals for the US

monthly <-data.frame(tapply(nics$totals, nics$date, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
kable(head(monthly))
Month Total
1998-11-01 21176
1998-12-01 870722
1999-01-01 585974
1999-02-01 690215
1999-03-01 741687
1999-04-01 638666

Monthly background checks for the US per capita

monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)

# Join by annual population from the Census
monthly <- left_join(monthly, us)
## Joining by: "Year"
monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)

us_month <- monthly[c("Month", "percapita")]
colnames(us_month) <- c("Month", "US")
kable(head(us_month))
Month US
1998-11-01 0.08
1998-12-01 3.16
1999-01-01 2.10
1999-02-01 2.47
1999-03-01 2.66
1999-04-01 2.29

Monthly totals for Connecticut

ct <- subset(nics, state=="Connecticut")
ctpop  <- read.csv("data/historicalpop-CT.csv", stringsAsFactors=FALSE)
ct_monthly<-data.frame(tapply(ct$totals, ct$month, sum))
ct_monthly$date <- rownames(ct_monthly)
rownames(ct_monthly) <- NULL
colnames(ct_monthly) <- c("Total", "Month")
ct_monthly<- ct_monthly[c("Month", "Total")]
kable(head(ct_monthly))
Month Total
1998-11 80
1998-12 6790
1999-01 6265
1999-02 8069
1999-03 7877
1999-04 9111

Monthly checks per capita: US vs CT

ct_monthly$Month <- as.Date(paste(ct_monthly$Month, "-01", sep=""))
ct_monthly$Month <- ymd(ct_monthly$Month)
ct_monthly$Year <- year(ct_monthly$Month)

ct_monthly <- left_join(ct_monthly, ctpop)
## Joining by: "Year"
ct_monthly$percapita <- round((ct_monthly$Total/ct_monthly$CT)*1000,2)

ct_month <- ct_monthly[c("Month", "percapita")]
colnames(ct_month) <- c("Month", "CT")

ct_us_month <- left_join(us_month, ct_month)
## Joining by: "Month"
# Prepping the dataframe for ggplot
ct_us_month_gg <- gather(ct_us_month, "State", "Per.Capita", 2:3)
## Warning: attributes are not identical across measure variables; they will
## be dropped
ggplot(data=ct_us_month_gg, aes(x=Month,y=Per.Capita, group=State, colour=State)) +
  geom_line() +
  ggtitle("Background checks per capita for firearms in the US and CT") +
  labs(x="Year", y="Per 1,000 residents")

Calculating the background checks per capita by state

# per capita for US one more time
monthly <-data.frame(tapply(nics$totals, nics$month, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
monthly$Month <- as.Date(paste(monthly$Month, "-01", sep=""))

monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)

monthly <- left_join(monthly, us)
monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)

by_month <- monthly[c("Month", "percapita")]
colnames(by_month) <- c("Month", "US")

by_month$Month <- substr(by_month$Month, 1, 7)

# Monthly for all states now
# Restructure data.frame 

totals_only <- nics[c("month", "state", "totals")] 

spreaded_totals <- spread(totals_only, state, totals)
spreaded_totals <- spreaded_totals[,colSums(is.na(spreaded_totals))<nrow(spreaded_totals)]
spreaded_totals$year <- substr(spreaded_totals$month, 1, 4)
spreaded_totals$year <- as.numeric(spreaded_totals$year)

# Setting up the loop to go through the two dataframes
# - One with annual historical state population
# - One from the NICS for total background checks

states_num <- ncol(states)
states_list <- 2:states_num

# This is the messy loop
for (i in states_list) {
  state_name <- colnames(states[i])
  temp_df <- states[c("Year", state_name)]
  colnames(temp_df) <- c("year", "Population")
  state_name <- gsub("\\.", " ", state_name)
  test_this <- grepl(state_name, colnames(spreaded_totals))
  test_sum <- sum(test_this)
  if (test_sum>0) {
    # looking just at totals
    nics_df <- spreaded_totals[c("month", "year", state_name)]
    temp_df <- left_join(temp_df, nics_df)
    temp_df$per_capita <- round((temp_df[,4]/temp_df[,2])*1000,2)
    temp_df <- temp_df[c("month", "per_capita")]
    colnames(temp_df) <- c("Month", state_name)
    
    by_month <- left_join(by_month, temp_df)
  }
}

Charting out the state per capita results

test_df <- by_month

test_df$Month <- factor(test_df$Month)

test_df <- gather(test_df, "State", "Per.Capita", 2:53)
## Warning: attributes are not identical across measure variables; they will
## be dropped
ggplot(data=test_df, aes(x=test_df$Month,y=Per.Capita, group=State)) +
  geom_line() +
  ggtitle("Background checks by state") +
  labs(x="Month", y="Per 1,000 residents")

Charting out the state per capita with small multiples

ggplot(data=test_df, aes(x=Month,y=Per.Capita)) +
  geom_bar(stat="identity") +
  facet_wrap(~State) +
  ggtitle("Background checks by state") +
  theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
  theme(axis.text.x = element_text(angle=90)) 

Adjusting CT figures for seasonality

# Visit [seasonal.website](http://www.seasonal.website/seasonal.html) to learn how to set up the seasonal package from the Census
Sys.setenv(X13_PATH = "/Users/andrewtran/Documents/Github/seasonal")

ct_seas <- ts(ct_month[,2],frequency=12,start=c(1998,11))
m <- seas(ct_seas)
plot(m)

# Prepping for export as a spreadsheet
dfct <- as.data.frame(ct_seas)

ct_months_only <- ct_month$Month

dfct <- cbind(ct_months_only, dfct)
colnames(dfct) <- c("Month", "Original")
dfct2 <- data.frame(final(m))
dfct <- cbind(dfct, dfct2)

colnames(dfct) <- c("Month", "Original", "Adjusted")
dfct$Adjusted <- round(dfct$Adjusted, 2)

dfct = dfct[-1,]
kable(head(dfct))
Month Original Adjusted
1998-12 1998-12-01 2.02 1.83
1999-01 1999-01-01 1.85 1.89
1999-02 1999-02-01 2.38 2.30
1999-03 1999-03-01 2.33 1.97
1999-04 1999-04-01 2.69 2.43
1999-05 1999-05-01 2.42 2.46

Adjusting US figures for seasonality

## Prep dataframe 
us_col <- monthly[c("Month", "percapita")]
us_col <- us_col[-1,]

## Turn it into a time series
us_tl <- ts(us_col[,2],frequency=12,start=c(1998,11))

## Adjust it for seasonality
us_m <- seas(us_tl)
plot(us_m)

## new data.frame specifically for US
colnames(us_col) <- c("Month", "Original") 
us_df <- as.data.frame(final(us_m))
us_col <- cbind(us_col, us_df)
colnames(us_col) <- c("Month", "Original", "Adjusted")
us_col$Adjusted <- round(us_col$Adjusted, 2)
kable(head(us_col))
Month Original Adjusted
2 1998-12-01 3.16 2.19
3 1999-01-01 2.10 2.25
4 1999-02-01 2.47 2.45
5 1999-03-01 2.66 2.52
6 1999-04-01 2.29 2.53
7 1999-05-01 2.04 2.65

US vs. CT adjusted

usct <- us_col[c("Month", "Adjusted")]
colnames(usct) <- c("Month", "US.Adjusted")
ct_adjusted <- dfct[c("Month", "Adjusted")]
colnames(ct_adjusted) <- c("Month", "CT.Adjusted")

usct <- left_join(usct, ct_adjusted)
## Joining by: "Month"
kable(head(usct))
Month US.Adjusted CT.Adjusted
1998-12-01 2.19 1.83
1999-01-01 2.25 1.89
1999-02-01 2.45 2.30
1999-03-01 2.52 1.97
1999-04-01 2.53 2.43
1999-05-01 2.65 2.46

Charting out the seasonally adjusted per capita data

us_seas_only <- us_col[c("Month", "Adjusted")]
us_seas_only$Month <- as.character(us_seas_only$Month)
colnames(us_seas_only) <- c("Month", "US")
us_seas_only$US <- as.numeric(as.character(us_seas_only$US))
months_only <- as.character(us_seas_only$Month)

adj_seas <- by_month
adj_seas_num <- 2:ncol(adj_seas)
adj_seas_list <- colnames(adj_seas)

adj_seas$Month <- as.Date(adj_seas$Month, format="%Y-%m")

for (i in adj_seas_num) {
  adj_seas_col <- colnames(adj_seas[i])
  seas_df_subset <- adj_seas[c("Month", adj_seas_col)]
  subset_series <- ts(seas_df_subset[,2],frequency=12,start=c(1998,11))
  subset_seas <- seas(subset_series)
  subset_seas_df <- as.data.frame(final(subset_seas))
  
  subset_seas_df <- subset_seas_df[-1,]
  months_only_df <- months_only
  months_only_df <- as.data.frame(cbind(months_only_df, subset_seas_df))
  colnames(months_only_df) <- c("Month", adj_seas_col)
  months_only_df$Month <- as.character(months_only_df$Month)
  months_only_df[,2] <- as.numeric(as.character(months_only_df[,2]))
  months_only_df[,2] <- round(months_only_df[,2], 2)
  
  if (!exists("months_only_all")) {
    months_only_all <- left_join(us_seas_only, months_only_df)
  } else {
    months_only_all <- left_join(months_only_all, months_only_df)
  }
  
}
kable(head(months_only_all[,1:5]))
Month US Alabama Alaska Arizona
1998-12-01 2.19 4.59 5.38 2.75
1999-01-01 2.25 4.54 5.61 2.73
1999-02-01 2.45 4.63 5.47 2.81
1999-03-01 2.52 4.65 5.65 2.89
1999-04-01 2.53 4.53 5.83 2.96
1999-05-01 2.65 4.59 5.67 2.91
# Exporting seasonally-adjusted figures to csv
write.csv(months_only_all, "data/states_monthly_adjusted.csv")

# Charting out with ggplot 
for_gg <- months_only_all

# Adjusting timestamp just for cosmetic reasons
for_gg$Month <- substring(for_gg$Month, 1,7)

# Prepping the time stamps for ggplot
for_gg$Month <- factor(for_gg$Month)

# Reshaping the dataframe for ggplot and also D3
for_gg <- gather(for_gg, "State", "Per.Capita", 2:53)

## Charting it in one line graph
ggplot(data=for_gg, aes(x=for_gg$Month,y=Per.Capita, group=State)) +
  geom_line() +
  ggtitle("Background checks by state") +
  labs(x="Month", y="Per 1,000 residents")

Charting it using small multiples

ggplot(data=for_gg, aes(x=Month,y=Per.Capita)) +
  geom_bar(stat="identity") +
  facet_wrap(~State) +
  ggtitle(expression(atop("Background checks by state", atop(italic("Adjusted for seasonality"), "")))) +
  theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
  theme(axis.text.x = element_text(angle=90)) 
## Warning: Stacking not well defined when ymin != 0
## Warning: Stacking not well defined when ymin != 0

# Exporting for D3
write.table(for_gg, file="seasonally_adjusted.tsv", quote=FALSE, sep='\t', col.names=NA)