An analysis for the TrendCT story: Who gets off with a warning after a traffic stop in Connecticut?

The data is from the Connecticut Racial Profiling Prohibition Project hosted at the CT Data Collaborative which collects traffic incident reports between October 2013 and September 2014.

library(lubridate)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)
library(knitr)
library(stringr)
library(tidyr)
incidents <- read.csv("http://ctrp3viz.s3.amazonaws.com/data/Connecticut_r1.csv", stringsAsFactors=FALSE)

#warnings <- incidents[grepl("Warn",incidents$InterventionDispositionReasonText),]

Observations


OK, let’s take a look at departments and tickets vs warnings

incidents$InterventionDispositionCode <- str_trim(incidents$InterventionDispositionCode)

# Graph 1. Tickets versus warnings
index <- c("I", "M", "N", "U", "V", "W")

values <- c("Ticket", "Ticket", "Ticket", "Ticket", "Warning", "Warning")

# for all
incidents$What <- values[match(incidents$InterventionDispositionCode, index)]

violations <- data.frame(table(incidents$StatutoryReasonForStop, incidents$What))
colnames(violations) <- c("violation", "type", "freq")
violations_df <- violations %>% spread(type, freq)
violations_df$perc_ticket <- round((violations_df$Ticket/(violations_df$Ticket+violations_df$Warning))*100, digits=2)
violations_df$perc_warning <- round((violations_df$Warning/(violations_df$Ticket+violations_df$Warning))*100, digits=2)
datatable(violations_df)

Observations


Stats on each department that gave out warnings

warnings <- data.frame(table(incidents$Department.Name, incidents$InterventionDispositionCode))
colnames(warnings) <- c("Department", "Disposition", "Freq")
warnings_df <- warnings %>% spread(Disposition, Freq)
warnings_df$total <- warnings_df$I + warnings_df$M + warnings_df$N + warnings_df$U + warnings_df$V + warnings_df$W
warnings_df$warnings <- warnings_df$V + warnings_df$W
warnings_df$tickets <- warnings_df$total - warnings_df$warnings

warnings_df$perc_warnings <- round((warnings_df$warnings/warnings_df$total)*100, digits=2)
warnings_df$perc_tickets <- 100 - warnings_df$perc_warnings
warnings_df_table <- warnings_df[c("Department", "total", "tickets", "warnings", "perc_tickets", "perc_warnings")]
write.csv(warnings_df_table, "department_warnings.csv")
datatable(warnings_df_table)

Observations


Interesting. Ok let’s look at age

# by age
incidents_all <- incidents
incidents$SubjectAge <- as.numeric(incidents$SubjectAge)
## Warning: NAs introduced by coercion
# We have to take out the outlier ages, first. There are some weird negative ages and such

incidents <- subset(incidents, SubjectAge > 14 & SubjectAge < 100)

incidents_age <- data.frame(table(incidents$SubjectAge,incidents$What))
colnames(incidents_age) <- c("age", "type", "freq")

incidents_age_df <- incidents_age %>% spread(type, freq)
incidents_age_df$perc_ticket <- round((incidents_age_df$Ticket/(incidents_age_df$Ticket+incidents_age_df$Warning))*100, digits=2)
incidents_age_df$perc_warning <- round((incidents_age_df$Warning/(incidents_age_df$Ticket+incidents_age_df$Warning))*100, digits=2)

warnings_only <- subset(incidents, What=="Warning")
tickets_only <- subset(incidents, What=="Ticket")

ggplot(incidents_age, aes(age, freq, group=type, fill=type)) + ggtitle("Tickets and warnings by percent") + geom_area(position="fill")

ggplot(incidents_age, aes(age, y=freq, group=type, colour=type)) +
  ggtitle("Warnings by age total") +
  geom_line() +
  geom_point()

Does age change between men and women for warnings?

warnings_gender <- data.frame(table(warnings_only$SubjectAge,warnings_only$SubjectSexCode))
colnames(warnings_gender) <- c("age", "gender", "freq")

ggplot(warnings_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Warnings by gender(percent)") + geom_area(position="fill")

warnings_gender_df <- warnings_gender %>% spread(gender, freq)
warnings_gender_df$perc_male <- round((warnings_gender_df$M/(warnings_gender_df$M+warnings_gender_df$F))*100, digits=2)
warnings_gender_df$perc_women <- round((warnings_gender_df$F/(warnings_gender_df$M+warnings_gender_df$F))*100, digits=2)

ggplot(warnings_gender, aes(age, y=freq, group=gender, colour=gender)) +
  ggtitle("Warnings by age total") +
  geom_line() +
  geom_point()

OK, what about tickets by gender?

tickets_gender <- data.frame(table(tickets_only$SubjectAge,tickets_only$SubjectSexCode))
colnames(tickets_gender) <- c("age", "gender", "freq")

tickets_gender_df <- tickets_gender %>% spread(gender, freq)
tickets_gender_df$perc_male <- round((tickets_gender_df$M/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
tickets_gender_df$perc_women <- round((tickets_gender_df$F/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)

ggplot(tickets_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Tickets by gender(percent)") + geom_area(position="fill")

ggplot(tickets_gender, aes(age, y=freq, group=gender, colour=gender)) +
  ggtitle("Tickets by age total") +
  geom_line() +
  geom_point()

tickets_gender <- data.frame(table(tickets_only$SubjectAge,tickets_only$SubjectSexCode))
colnames(tickets_gender) <- c("age", "gender", "freq")

tickets_gender_df <- tickets_gender %>% spread(gender, freq)
tickets_gender_df$perc_male <- round((tickets_gender_df$M/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
tickets_gender_df$perc_women <- round((tickets_gender_df$F/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)

ggplot(tickets_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Tickets by gender(percent)") + geom_area(position="fill")

ggplot(tickets_gender, aes(age, y=freq, group=gender, colour=gender)) +
  ggtitle("Tickets by age total") +
  geom_line() +
  geom_point()

What about race?

incidents$Race <- paste(incidents$SubjectRaceCode, incidents$SubjectEthnicityCode)

index_race <- c("A H", "A M", "A N", "B H", "B M", "B N", 
           "I H", "I M", "I N", "W H", "W M", "W N")

values_race <- c("Hispanic", "Middle Eastern", "Asian", "Hispanic", "Middle Eastern", "Black", 
            "Hispanic", "Middle Eastern", "Indian", "Hispanic", "Middle Eastern", "White")

incidents$Def_Race <- values_race[match(incidents$Race, index_race)]

race_df <- data.frame(table(incidents$Def_Race, incidents$What))
colnames(race_df) <- c("Race", "Type", "Freq")
race_df_spread <- race_df %>% spread(Type, Freq)
race_df_spread$perc_ticket <- round((race_df_spread$Ticket/(race_df_spread$Ticket+race_df_spread$Warning))*100, digits=2)
race_df_spread$perc_warning <- round((race_df_spread$Warning/(race_df_spread$Ticket+race_df_spread$Warning))*100, digits=2)
write.csv(race_df_spread, "race_warnings.csv")
kable(race_df_spread)
Race Ticket Warning perc_ticket perc_warning
Asian 3609 3098 53.81 46.19
Black 44167 31927 58.04 41.96
Hispanic 44493 24884 64.13 35.87
Indian 655 499 56.76 43.24
Middle Eastern 22384 16648 57.35 42.65
White 216325 185910 53.78 46.22