An analysis for the TrendCT story: When do police most often ticket drivers? And for what?

The data is from the Connecticut Racial Profiling Prohibition Project hosted at the CT Data Collaborative which collects traffic incident reports between October 2013 and September 2014.

# Bringing the data in. This is a huge file. Brace yourself.

incidents <- read.csv("http://ctrp3viz.s3.amazonaws.com/data/Connecticut_r1.csv", stringsAsFactors=FALSE)

Which month has the most tickets?

incidents$month <- month(incidents$RealDate, label=TRUE)

c <- ggplot(incidents, aes(x=month))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

Which day of the week gets the most tickets?

c <- ggplot(incidents, aes(x=Day.of.Week))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

Which specific day of the year gets the most tickets?

incidents$calendar <- floor_date(incidents$RealDate, "day")

cal_days <- data.frame(table(incidents$calendar))
colnames(cal_days) <- c("Date", "Tickets")
library(dygraphs)
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# This is to convert the time series into another format called eXtensible Time Series
tix <- xts(cal_days[,-1],order.by=as.POSIXct(cal_days$Date))
colnames(tix) <- "Tickets"
dygraph(tix, main="Tickets by day in Connecticut") %>%
  dyOptions(stepPlot = TRUE, fillGraph=TRUE) %>%
  dyRangeSelector()

10 days that got the most tickets

kable(head(cal_days[order(-cal_days$Tickets),],10))
Date Tickets
334 2014-08-29 3244
236 2014-05-23 3190
348 2014-09-12 2859
340 2014-09-04 2850
277 2014-07-03 2842
233 2014-05-20 2826
237 2014-05-24 2813
341 2014-09-05 2755
234 2014-05-21 2706
354 2014-09-18 2648
#Most day of the month gets the most tickets?
incidents$day_num <- day(incidents$RealDate)
c <- ggplot(incidents, aes(x=day_num))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

# That's not very useful-- 11 months of the year have 30 days while half of that has 31.
# Let's try to be more accurate

# Let's focus on the last week, first week, and everything in between as middle weeks. 
# What's the average ticket per day based on those categories?

incidents$cal_month <- floor_date(incidents$RealDate, "month")
cal_months <- data.frame(table(incidents$cal_month))

incidents$cal_week <- floor_date(incidents$RealDate, "week")

cal_week <- data.frame(table(incidents$cal_week))

incidents$cal_days <- days_in_month(incidents$RealDate)

incidents$day_num <- day(incidents$RealDate)

day_num <- data.frame(sort(table(incidents$day_num), descending=TRUE))

longmonth <- subset(incidents, cal_days==31)
midmonth <- subset(incidents, cal_days==30)
shortmonth <- subset(incidents, cal_days==28)

##31 DAY MONTH ANALYSIS HERE
longmonth_end <- subset(longmonth, (day_num==25 | day_num==26 | day_num==27 | 
                                      day_num==28 | day_num==29 | day_num==30 | day_num==31))

last_seven_long <- nrow(longmonth_end)/7

longmonth_start <- subset(longmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                      day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_long <- nrow(longmonth_start)/7

longmonth_mid <- subset(longmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                        day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                      day_num!=25 & day_num!=26 & day_num!=27 & 
                                      day_num!=28 & day_num!=29 & day_num!=30 & day_num!=31))

mid_seven_long <- nrow(longmonth_mid)/17

Average number of tickets per day in a 31 day month

avg_long <- data.frame(first_seven_long, mid_seven_long, last_seven_long)
colnames(avg_long) <- c("First week", "Middle weeks", "Last week")
kable(avg_long)
First week Middle weeks Last week
10757.86 11375.59 12190.57
##30 DAY MONTH ANALYSIS HERE
midmonth_end <- subset(midmonth, (day_num==24 | day_num==25 | day_num==26 | 
                                      day_num==27 | day_num==28 | day_num==29 | day_num==30))

last_seven_mid <- nrow(midmonth_end)/7

midmonth_start <- subset(midmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                        day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_mid <- nrow(midmonth_start)/7

midmonth_mid <- subset(midmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                      day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                      day_num!=24 & day_num!=25 & day_num!=26 & 
                                      day_num!=27 & day_num!=28 & day_num!=29 & day_num!=30))

mid_seven_mid <- nrow(midmonth_mid)/16

Average number of tickets per day in a 30 day month

avg_mid <- data.frame(first_seven_mid, mid_seven_mid, last_seven_mid)
colnames(avg_mid) <- c("First week", "Middle weeks", "Last week")
kable(avg_mid)
First week Middle weeks Last week
7015.857 6925.562 6554
##FEBRUARY ANALYSIS HERE
shortmonth_end <- subset(shortmonth, (day_num==22 | day_num==23 | day_num==24 | 
                                    day_num==25 | day_num==26 | day_num==27 | day_num==28))

last_seven_short <- nrow(shortmonth_end)/7

shortmonth_start <- subset(shortmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                      day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_short <- nrow(shortmonth_start)/7

shortmonth_mid <- subset(shortmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                    day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                    day_num!=22 & day_num!=23 & day_num!=24 & 
                                    day_num!=25 & day_num!=26 & day_num!=27 & day_num!=28))

mid_seven_short <- nrow(shortmonth_mid)/14

Average number of tickets per day in February

avg_short <- data.frame(first_seven_short, mid_seven_short, last_seven_short)
colnames(avg_short) <- c("First week", "Middle weeks", "Last week")
kable(avg_short)
First week Middle weeks Last week
1251.429 1092 1577

Average (of the averages) number of tickets per day based on time of month

avg_last <- (last_seven_short+last_seven_mid+last_seven_long)/3
avg_mid <- (mid_seven_short+mid_seven_mid+mid_seven_long)/3
avg_start<- (first_seven_short+first_seven_mid+first_seven_long)/3

avg <- data.frame(avg_start, avg_mid, avg_last)
colnames(avg) <- c("First week", "Middle weeks", "Last weeks")
kable(avg)
First week Middle weeks Last weeks
6341.714 6464.384 6773.857

Overall, it looks like the daily average of tickets increases through the month.

Except in months with 30 days of the year. Then the trend reverses.