Increase in gun purchases triggered by Connecticut state legislation

An analysis for the TrendCT story: Increase in gun purchases triggered by Connecticut state legislation

The data is/are from the Buzzfeed repo that scraped the FBI NICS PDFs on the number of firearm checks by month, state, and type.

Check the Trend CT repo for the scripts and data sets that were used and generated. Just want the seasonally adjusted state per capita data? Here’s the csv.

There are many caveats to the data, but the basic thing to take away is that a background check does not necessarily mean a gun sale. The Buzzfeed repo has more details.

This analysis will go over the process used to put together the story. In short, the background checks were normalized for annual population by state, adjusted for seasonality, and then charted.

Population estimates

The historical population data was gathered from Census.gov.

# Packages we'll need
library(knitr)
library(stringr)
library(ggplot2)
library(lubridate)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(seasonal)

us <- read.csv("data/historicalpop-US.csv", stringsAsFactors=FALSE)
states <- read.csv("data/historicalpop-StatesT.csv", stringsAsFactors=FALSE)
kable(head(us))

Year	US
1998	275854104
1999	279040168
2000	282162411
2001	284968955
2002	287625193
2003	290107933

kable(head(states[,1:5]))

Year	Alabama	Alaska	Arizona	Arkansas
1998	4404701	619932	4883342	2626289
1999	4430141	624779	5023823	2651860
2000	4452173	627963	5160586	2678588
2001	4467634	633714	5273477	2691571
2002	4480089	642337	5396255	2705927
2003	4503491	648414	5510364	2724816

Bringing in the data from the Buzzfeed repo.

nics <- read.csv("data/nics-firearm-background-checks.csv", stringsAsFactors=FALSE)
# Quick trimming up of leading and trailing white spaces
# Using the stringr package
nics$state <- str_trim(nics$state)
kable(head(nics[,1:5]))

month	state	permit	handgun	long_gun
2015-11	Alabama	18870	23022	22650
2015-11	Alaska	209	3062	3209
2015-11	Arizona	2303	12382	9041
2015-11	Arkansas	3298	6359	11611
2015-11	California	98452	41181	35007
2015-11	Colorado	4144	19784	16082

Preliminary annual US analysis

# Adding a column for year in the dataframe
# But first have to convert the data into a date-friendly format

nics$date <- as.Date(paste(nics$month,"-01",sep=""))
nics$year <- year(nics$date)

# Creating a new dataframe with the total background checks in the country by year
annual <- data.frame(tapply(nics$totals, nics$year, sum))

# Cleaning up the new dataframe
annual$year <- rownames(annual)
rownames(annual) <- NULL
colnames(annual) <- c("Total", "Year")
annual <- annual[c("Year", "Total")]

# Merging background checks to population dataframe
annual <- merge(annual, us)

# Calculating per capita (1,000 residents)
annual$percapita <- round((annual$Total/annual$US)*1000,2)

# 1998 is an incomplete year, so let's take that out
annual <- annual[-1,]

kable(head(annual))

	Year	Total	US	percapita
2	1999	9043747	279040168	32.41
3	2000	8427096	282162411	29.87
4	2001	8820045	284968955	30.95
5	2002	8367069	287625193	29.09
6	2003	8402244	290107933	28.96
7	2004	8579891	292805298	29.30

Charting the total background checks by year

ggplot(data=annual, aes(x=Year,y=Total, group=1)) +
  geom_line() +
  ggtitle("Background checks for firearms in the US") +
  labs(x="Year", y="Total")

Charting the total background checks by year per capita

ggplot(data=annual, aes(x=Year,y=percapita, group=1)) +
  geom_line() +
  ggtitle("Background checks per capita for firearms in the US") +
  labs(x="Year", y="Per 1,000 residents")

Not much difference, right.

Monthly totals for the US

monthly <-data.frame(tapply(nics$totals, nics$date, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
kable(head(monthly))

Month	Total
1998-11-01	21176
1998-12-01	870722
1999-01-01	585974
1999-02-01	690215
1999-03-01	741687
1999-04-01	638666

Monthly background checks for the US per capita

monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)

# Join by annual population from the Census
monthly <- left_join(monthly, us)

## Joining by: "Year"

monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)

us_month <- monthly[c("Month", "percapita")]
colnames(us_month) <- c("Month", "US")
kable(head(us_month))

Month	US
1998-11-01	0.08
1998-12-01	3.16
1999-01-01	2.10
1999-02-01	2.47
1999-03-01	2.66
1999-04-01	2.29

Monthly totals for Connecticut

ct <- subset(nics, state=="Connecticut")
ctpop  <- read.csv("data/historicalpop-CT.csv", stringsAsFactors=FALSE)
ct_monthly<-data.frame(tapply(ct$totals, ct$month, sum))
ct_monthly$date <- rownames(ct_monthly)
rownames(ct_monthly) <- NULL
colnames(ct_monthly) <- c("Total", "Month")
ct_monthly<- ct_monthly[c("Month", "Total")]
kable(head(ct_monthly))

Month	Total
1998-11	80
1998-12	6790
1999-01	6265
1999-02	8069
1999-03	7877
1999-04	9111

Monthly checks per capita: US vs CT

ct_monthly$Month <- as.Date(paste(ct_monthly$Month, "-01", sep=""))
ct_monthly$Month <- ymd(ct_monthly$Month)
ct_monthly$Year <- year(ct_monthly$Month)

ct_monthly <- left_join(ct_monthly, ctpop)

## Joining by: "Year"

ct_monthly$percapita <- round((ct_monthly$Total/ct_monthly$CT)*1000,2)

ct_month <- ct_monthly[c("Month", "percapita")]
colnames(ct_month) <- c("Month", "CT")

ct_us_month <- left_join(us_month, ct_month)

## Joining by: "Month"

# Prepping the dataframe for ggplot
ct_us_month_gg <- gather(ct_us_month, "State", "Per.Capita", 2:3)

## Warning: attributes are not identical across measure variables; they will
## be dropped

ggplot(data=ct_us_month_gg, aes(x=Month,y=Per.Capita, group=State, colour=State)) +
  geom_line() +
  ggtitle("Background checks per capita for firearms in the US and CT") +
  labs(x="Year", y="Per 1,000 residents")

Calculating the background checks per capita by state

# per capita for US one more time
monthly <-data.frame(tapply(nics$totals, nics$month, sum))
monthly$date <- rownames(monthly)
rownames(monthly) <- NULL
colnames(monthly) <- c("Total", "Month")
monthly<- monthly[c("Month", "Total")]
monthly$Month <- as.Date(paste(monthly$Month, "-01", sep=""))

monthly$Month <- ymd(monthly$Month)
monthly$Year <- year(monthly$Month)

monthly <- left_join(monthly, us)
monthly$percapita <- round((monthly$Total/monthly$US)*1000,2)

by_month <- monthly[c("Month", "percapita")]
colnames(by_month) <- c("Month", "US")

by_month$Month <- substr(by_month$Month, 1, 7)

# Monthly for all states now
# Restructure data.frame 

totals_only <- nics[c("month", "state", "totals")] 

spreaded_totals <- spread(totals_only, state, totals)
spreaded_totals <- spreaded_totals[,colSums(is.na(spreaded_totals))<nrow(spreaded_totals)]
spreaded_totals$year <- substr(spreaded_totals$month, 1, 4)
spreaded_totals$year <- as.numeric(spreaded_totals$year)

# Setting up the loop to go through the two dataframes
# - One with annual historical state population
# - One from the NICS for total background checks

states_num <- ncol(states)
states_list <- 2:states_num

# This is the messy loop
for (i in states_list) {
  state_name <- colnames(states[i])
  temp_df <- states[c("Year", state_name)]
  colnames(temp_df) <- c("year", "Population")
  state_name <- gsub("\\.", " ", state_name)
  test_this <- grepl(state_name, colnames(spreaded_totals))
  test_sum <- sum(test_this)
  if (test_sum>0) {
    # looking just at totals
    nics_df <- spreaded_totals[c("month", "year", state_name)]
    temp_df <- left_join(temp_df, nics_df)
    temp_df$per_capita <- round((temp_df[,4]/temp_df[,2])*1000,2)
    temp_df <- temp_df[c("month", "per_capita")]
    colnames(temp_df) <- c("Month", state_name)
    
    by_month <- left_join(by_month, temp_df)
  }
}

Charting out the state per capita results

test_df <- by_month

test_df$Month <- factor(test_df$Month)

test_df <- gather(test_df, "State", "Per.Capita", 2:53)

## Warning: attributes are not identical across measure variables; they will
## be dropped

ggplot(data=test_df, aes(x=test_df$Month,y=Per.Capita, group=State)) +
  geom_line() +
  ggtitle("Background checks by state") +
  labs(x="Month", y="Per 1,000 residents")

Charting out the state per capita with small multiples

ggplot(data=test_df, aes(x=Month,y=Per.Capita)) +
  geom_bar(stat="identity") +
  facet_wrap(~State) +
  ggtitle("Background checks by state") +
  theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
  theme(axis.text.x = element_text(angle=90))

Adjusting CT figures for seasonality

# Visit [seasonal.website](http://www.seasonal.website/seasonal.html) to learn how to set up the seasonal package from the Census
Sys.setenv(X13_PATH = "/Users/andrewtran/Documents/Github/seasonal")

ct_seas <- ts(ct_month[,2],frequency=12,start=c(1998,11))
m <- seas(ct_seas)
plot(m)

# Prepping for export as a spreadsheet
dfct <- as.data.frame(ct_seas)

ct_months_only <- ct_month$Month

dfct <- cbind(ct_months_only, dfct)
colnames(dfct) <- c("Month", "Original")
dfct2 <- data.frame(final(m))
dfct <- cbind(dfct, dfct2)

colnames(dfct) <- c("Month", "Original", "Adjusted")
dfct$Adjusted <- round(dfct$Adjusted, 2)

dfct = dfct[-1,]
kable(head(dfct))

	Month	Original	Adjusted
1998-12	1998-12-01	2.02	1.83
1999-01	1999-01-01	1.85	1.89
1999-02	1999-02-01	2.38	2.30
1999-03	1999-03-01	2.33	1.97
1999-04	1999-04-01	2.69	2.43
1999-05	1999-05-01	2.42	2.46

Adjusting US figures for seasonality

## Prep dataframe 
us_col <- monthly[c("Month", "percapita")]
us_col <- us_col[-1,]

## Turn it into a time series
us_tl <- ts(us_col[,2],frequency=12,start=c(1998,11))

## Adjust it for seasonality
us_m <- seas(us_tl)
plot(us_m)

## new data.frame specifically for US
colnames(us_col) <- c("Month", "Original") 
us_df <- as.data.frame(final(us_m))
us_col <- cbind(us_col, us_df)
colnames(us_col) <- c("Month", "Original", "Adjusted")
us_col$Adjusted <- round(us_col$Adjusted, 2)
kable(head(us_col))

	Month	Original	Adjusted
2	1998-12-01	3.16	2.19
3	1999-01-01	2.10	2.25
4	1999-02-01	2.47	2.45
5	1999-03-01	2.66	2.52
6	1999-04-01	2.29	2.53
7	1999-05-01	2.04	2.65

US vs. CT adjusted

usct <- us_col[c("Month", "Adjusted")]
colnames(usct) <- c("Month", "US.Adjusted")
ct_adjusted <- dfct[c("Month", "Adjusted")]
colnames(ct_adjusted) <- c("Month", "CT.Adjusted")

usct <- left_join(usct, ct_adjusted)

## Joining by: "Month"

kable(head(usct))

Month	US.Adjusted	CT.Adjusted
1998-12-01	2.19	1.83
1999-01-01	2.25	1.89
1999-02-01	2.45	2.30
1999-03-01	2.52	1.97
1999-04-01	2.53	2.43
1999-05-01	2.65	2.46

Charting out the seasonally adjusted per capita data

us_seas_only <- us_col[c("Month", "Adjusted")]
us_seas_only$Month <- as.character(us_seas_only$Month)
colnames(us_seas_only) <- c("Month", "US")
us_seas_only$US <- as.numeric(as.character(us_seas_only$US))
months_only <- as.character(us_seas_only$Month)

adj_seas <- by_month
adj_seas_num <- 2:ncol(adj_seas)
adj_seas_list <- colnames(adj_seas)

adj_seas$Month <- as.Date(adj_seas$Month, format="%Y-%m")

for (i in adj_seas_num) {
  adj_seas_col <- colnames(adj_seas[i])
  seas_df_subset <- adj_seas[c("Month", adj_seas_col)]
  subset_series <- ts(seas_df_subset[,2],frequency=12,start=c(1998,11))
  subset_seas <- seas(subset_series)
  subset_seas_df <- as.data.frame(final(subset_seas))
  
  subset_seas_df <- subset_seas_df[-1,]
  months_only_df <- months_only
  months_only_df <- as.data.frame(cbind(months_only_df, subset_seas_df))
  colnames(months_only_df) <- c("Month", adj_seas_col)
  months_only_df$Month <- as.character(months_only_df$Month)
  months_only_df[,2] <- as.numeric(as.character(months_only_df[,2]))
  months_only_df[,2] <- round(months_only_df[,2], 2)
  
  if (!exists("months_only_all")) {
    months_only_all <- left_join(us_seas_only, months_only_df)
  } else {
    months_only_all <- left_join(months_only_all, months_only_df)
  }
  
}
kable(head(months_only_all[,1:5]))

Month	US	Alabama	Alaska	Arizona
1998-12-01	2.19	4.59	5.38	2.75
1999-01-01	2.25	4.54	5.61	2.73
1999-02-01	2.45	4.63	5.47	2.81
1999-03-01	2.52	4.65	5.65	2.89
1999-04-01	2.53	4.53	5.83	2.96
1999-05-01	2.65	4.59	5.67	2.91

# Exporting seasonally-adjusted figures to csv
write.csv(months_only_all, "data/states_monthly_adjusted.csv")

# Charting out with ggplot 
for_gg <- months_only_all

# Adjusting timestamp just for cosmetic reasons
for_gg$Month <- substring(for_gg$Month, 1,7)

# Prepping the time stamps for ggplot
for_gg$Month <- factor(for_gg$Month)

# Reshaping the dataframe for ggplot and also D3
for_gg <- gather(for_gg, "State", "Per.Capita", 2:53)

## Charting it in one line graph
ggplot(data=for_gg, aes(x=for_gg$Month,y=Per.Capita, group=State)) +
  geom_line() +
  ggtitle("Background checks by state") +
  labs(x="Month", y="Per 1,000 residents")

Charting it using small multiples

ggplot(data=for_gg, aes(x=Month,y=Per.Capita)) +
  geom_bar(stat="identity") +
  facet_wrap(~State) +
  ggtitle(expression(atop("Background checks by state", atop(italic("Adjusted for seasonality"), "")))) +
  theme(plot.title = element_text(family="Trebuchet MS", face="bold", size=20, hjust=0, color="#555555")) +
  theme(axis.text.x = element_text(angle=90))

## Warning: Stacking not well defined when ymin != 0

## Warning: Stacking not well defined when ymin != 0

# Exporting for D3
write.table(for_gg, file="seasonally_adjusted.tsv", quote=FALSE, sep='\t', col.names=NA)