Traffic tickets in Connecticut overview

An analysis for the TrendCT story: When do police most often ticket drivers? And for what?

The data is from the Connecticut Racial Profiling Prohibition Project hosted at the CT Data Collaborative which collects traffic incident reports between October 2013 and September 2014.

# Bringing the data in. This is a huge file. Brace yourself.

incidents <- read.csv("http://ctrp3viz.s3.amazonaws.com/data/Connecticut_r1.csv", stringsAsFactors=FALSE)

Most popular ticket type

c <- ggplot(incidents, aes(x=StatutoryReasonForStop))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Cleaning up the dates and time
incidents$RealDate <- as.POSIXct(as.Date(incidents$InterventionDateTime, origin="1899-12-30"))
incidents$RealTime <- format(as.POSIXct((incidents$InterventionTime) * 86400, origin = "1970-01-01"), "%H:%M")
incidents$RealTime <- hm(incidents$RealTime)
incidents$Day.of.Week <- as.factor(incidents$Day.of.Week)
levels(incidents$Day.of.Week) <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")

Which month has the most tickets?

incidents$month <- month(incidents$RealDate, label=TRUE)

c <- ggplot(incidents, aes(x=month))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

Which day of the week gets the most tickets?

c <- ggplot(incidents, aes(x=Day.of.Week))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

Which specific day of the year gets the most tickets?

incidents$calendar <- floor_date(incidents$RealDate, "day")

cal_days <- data.frame(table(incidents$calendar))
colnames(cal_days) <- c("Date", "Tickets")
library(dygraphs)
library(xts)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# This is to convert the time series into another format called eXtensible Time Series
tix <- xts(cal_days[,-1],order.by=as.POSIXct(cal_days$Date))
colnames(tix) <- "Tickets"
dygraph(tix, main="Tickets by day in Connecticut") %>%
  dyOptions(stepPlot = TRUE, fillGraph=TRUE) %>%
  dyRangeSelector()

10 days that got the most tickets

kable(head(cal_days[order(-cal_days$Tickets),],10))

	Date	Tickets
334	2014-08-29	3244
236	2014-05-23	3190
348	2014-09-12	2859
340	2014-09-04	2850
277	2014-07-03	2842
233	2014-05-20	2826
237	2014-05-24	2813
341	2014-09-05	2755
234	2014-05-21	2706
354	2014-09-18	2648

#Most day of the month gets the most tickets?
incidents$day_num <- day(incidents$RealDate)
c <- ggplot(incidents, aes(x=day_num))
c <- c + geom_histogram(colour="darkred",fill="white", binwidth=1)
c

# That's not very useful-- 11 months of the year have 30 days while half of that has 31.
# Let's try to be more accurate

# Let's focus on the last week, first week, and everything in between as middle weeks. 
# What's the average ticket per day based on those categories?

incidents$cal_month <- floor_date(incidents$RealDate, "month")
cal_months <- data.frame(table(incidents$cal_month))

incidents$cal_week <- floor_date(incidents$RealDate, "week")

cal_week <- data.frame(table(incidents$cal_week))

incidents$cal_days <- days_in_month(incidents$RealDate)

incidents$day_num <- day(incidents$RealDate)

day_num <- data.frame(sort(table(incidents$day_num), descending=TRUE))

longmonth <- subset(incidents, cal_days==31)
midmonth <- subset(incidents, cal_days==30)
shortmonth <- subset(incidents, cal_days==28)

##31 DAY MONTH ANALYSIS HERE
longmonth_end <- subset(longmonth, (day_num==25 | day_num==26 | day_num==27 | 
                                      day_num==28 | day_num==29 | day_num==30 | day_num==31))

last_seven_long <- nrow(longmonth_end)/7

longmonth_start <- subset(longmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                      day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_long <- nrow(longmonth_start)/7

longmonth_mid <- subset(longmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                        day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                      day_num!=25 & day_num!=26 & day_num!=27 & 
                                      day_num!=28 & day_num!=29 & day_num!=30 & day_num!=31))

mid_seven_long <- nrow(longmonth_mid)/17

Average number of tickets per day in a 31 day month

avg_long <- data.frame(first_seven_long, mid_seven_long, last_seven_long)
colnames(avg_long) <- c("First week", "Middle weeks", "Last week")
kable(avg_long)

First week	Middle weeks	Last week
10757.86	11375.59	12190.57

##30 DAY MONTH ANALYSIS HERE
midmonth_end <- subset(midmonth, (day_num==24 | day_num==25 | day_num==26 | 
                                      day_num==27 | day_num==28 | day_num==29 | day_num==30))

last_seven_mid <- nrow(midmonth_end)/7

midmonth_start <- subset(midmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                        day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_mid <- nrow(midmonth_start)/7

midmonth_mid <- subset(midmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                      day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                      day_num!=24 & day_num!=25 & day_num!=26 & 
                                      day_num!=27 & day_num!=28 & day_num!=29 & day_num!=30))

mid_seven_mid <- nrow(midmonth_mid)/16

Average number of tickets per day in a 30 day month

avg_mid <- data.frame(first_seven_mid, mid_seven_mid, last_seven_mid)
colnames(avg_mid) <- c("First week", "Middle weeks", "Last week")
kable(avg_mid)

First week	Middle weeks	Last week
7015.857	6925.562	6554

##FEBRUARY ANALYSIS HERE
shortmonth_end <- subset(shortmonth, (day_num==22 | day_num==23 | day_num==24 | 
                                    day_num==25 | day_num==26 | day_num==27 | day_num==28))

last_seven_short <- nrow(shortmonth_end)/7

shortmonth_start <- subset(shortmonth, (day_num==1 | day_num==2 | day_num==3 | 
                                      day_num==4 | day_num==5 | day_num==6 | day_num==7))

first_seven_short <- nrow(shortmonth_start)/7

shortmonth_mid <- subset(shortmonth, (day_num!=1 & day_num!=2 & day_num!=3 & 
                                    day_num!=4 & day_num!=5 & day_num!=6 & day_num!=7 &
                                    day_num!=22 & day_num!=23 & day_num!=24 & 
                                    day_num!=25 & day_num!=26 & day_num!=27 & day_num!=28))

mid_seven_short <- nrow(shortmonth_mid)/14

Average number of tickets per day in February

avg_short <- data.frame(first_seven_short, mid_seven_short, last_seven_short)
colnames(avg_short) <- c("First week", "Middle weeks", "Last week")
kable(avg_short)

First week	Middle weeks	Last week
1251.429	1092	1577

Average (of the averages) number of tickets per day based on time of month

avg_last <- (last_seven_short+last_seven_mid+last_seven_long)/3
avg_mid <- (mid_seven_short+mid_seven_mid+mid_seven_long)/3
avg_start<- (first_seven_short+first_seven_mid+first_seven_long)/3

avg <- data.frame(avg_start, avg_mid, avg_last)
colnames(avg) <- c("First week", "Middle weeks", "Last weeks")
kable(avg)

First week	Middle weeks	Last weeks
6341.714	6464.384	6773.857

Traffic tickets in Connecticut overview

Andrew Ba Tran

July 3, 2015

Most popular ticket type

Which month has the most tickets?

Which day of the week gets the most tickets?

Which specific day of the year gets the most tickets?

10 days that got the most tickets

Average number of tickets per day in a 31 day month

Average number of tickets per day in a 30 day month

Average number of tickets per day in February

Average (of the averages) number of tickets per day based on time of month

Overall, it looks like the daily average of tickets increases through the month.

Except in months with 30 days of the year. Then the trend reverses.