An analysis for the TrendCT story: Who gets off with a warning after a traffic stop in Connecticut?
The data is from the Connecticut Racial Profiling Prohibition Project hosted at the CT Data Collaborative which collects traffic incident reports between October 2013 and September 2014.
library(lubridate)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DT)
library(knitr)
library(stringr)
library(tidyr)
incidents <- read.csv("http://ctrp3viz.s3.amazonaws.com/data/Connecticut_r1.csv", stringsAsFactors=FALSE)
#warnings <- incidents[grepl("Warn",incidents$InterventionDispositionReasonText),]
Observations
incidents$InterventionDispositionCode <- str_trim(incidents$InterventionDispositionCode)
# Graph 1. Tickets versus warnings
index <- c("I", "M", "N", "U", "V", "W")
values <- c("Ticket", "Ticket", "Ticket", "Ticket", "Warning", "Warning")
# for all
incidents$What <- values[match(incidents$InterventionDispositionCode, index)]
violations <- data.frame(table(incidents$StatutoryReasonForStop, incidents$What))
colnames(violations) <- c("violation", "type", "freq")
violations_df <- violations %>% spread(type, freq)
violations_df$perc_ticket <- round((violations_df$Ticket/(violations_df$Ticket+violations_df$Warning))*100, digits=2)
violations_df$perc_warning <- round((violations_df$Warning/(violations_df$Ticket+violations_df$Warning))*100, digits=2)
datatable(violations_df)
Observations
warnings <- data.frame(table(incidents$Department.Name, incidents$InterventionDispositionCode))
colnames(warnings) <- c("Department", "Disposition", "Freq")
warnings_df <- warnings %>% spread(Disposition, Freq)
warnings_df$total <- warnings_df$I + warnings_df$M + warnings_df$N + warnings_df$U + warnings_df$V + warnings_df$W
warnings_df$warnings <- warnings_df$V + warnings_df$W
warnings_df$tickets <- warnings_df$total - warnings_df$warnings
warnings_df$perc_warnings <- round((warnings_df$warnings/warnings_df$total)*100, digits=2)
warnings_df$perc_tickets <- 100 - warnings_df$perc_warnings
warnings_df_table <- warnings_df[c("Department", "total", "tickets", "warnings", "perc_tickets", "perc_warnings")]
write.csv(warnings_df_table, "department_warnings.csv")
datatable(warnings_df_table)
Observations
# by age
incidents_all <- incidents
incidents$SubjectAge <- as.numeric(incidents$SubjectAge)
## Warning: NAs introduced by coercion
# We have to take out the outlier ages, first. There are some weird negative ages and such
incidents <- subset(incidents, SubjectAge > 14 & SubjectAge < 100)
incidents_age <- data.frame(table(incidents$SubjectAge,incidents$What))
colnames(incidents_age) <- c("age", "type", "freq")
incidents_age_df <- incidents_age %>% spread(type, freq)
incidents_age_df$perc_ticket <- round((incidents_age_df$Ticket/(incidents_age_df$Ticket+incidents_age_df$Warning))*100, digits=2)
incidents_age_df$perc_warning <- round((incidents_age_df$Warning/(incidents_age_df$Ticket+incidents_age_df$Warning))*100, digits=2)
warnings_only <- subset(incidents, What=="Warning")
tickets_only <- subset(incidents, What=="Ticket")
ggplot(incidents_age, aes(age, freq, group=type, fill=type)) + ggtitle("Tickets and warnings by percent") + geom_area(position="fill")
ggplot(incidents_age, aes(age, y=freq, group=type, colour=type)) +
ggtitle("Warnings by age total") +
geom_line() +
geom_point()
warnings_gender <- data.frame(table(warnings_only$SubjectAge,warnings_only$SubjectSexCode))
colnames(warnings_gender) <- c("age", "gender", "freq")
ggplot(warnings_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Warnings by gender(percent)") + geom_area(position="fill")
warnings_gender_df <- warnings_gender %>% spread(gender, freq)
warnings_gender_df$perc_male <- round((warnings_gender_df$M/(warnings_gender_df$M+warnings_gender_df$F))*100, digits=2)
warnings_gender_df$perc_women <- round((warnings_gender_df$F/(warnings_gender_df$M+warnings_gender_df$F))*100, digits=2)
ggplot(warnings_gender, aes(age, y=freq, group=gender, colour=gender)) +
ggtitle("Warnings by age total") +
geom_line() +
geom_point()
tickets_gender <- data.frame(table(tickets_only$SubjectAge,tickets_only$SubjectSexCode))
colnames(tickets_gender) <- c("age", "gender", "freq")
tickets_gender_df <- tickets_gender %>% spread(gender, freq)
tickets_gender_df$perc_male <- round((tickets_gender_df$M/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
tickets_gender_df$perc_women <- round((tickets_gender_df$F/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
ggplot(tickets_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Tickets by gender(percent)") + geom_area(position="fill")
ggplot(tickets_gender, aes(age, y=freq, group=gender, colour=gender)) +
ggtitle("Tickets by age total") +
geom_line() +
geom_point()
tickets_gender <- data.frame(table(tickets_only$SubjectAge,tickets_only$SubjectSexCode))
colnames(tickets_gender) <- c("age", "gender", "freq")
tickets_gender_df <- tickets_gender %>% spread(gender, freq)
tickets_gender_df$perc_male <- round((tickets_gender_df$M/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
tickets_gender_df$perc_women <- round((tickets_gender_df$F/(tickets_gender_df$M+tickets_gender_df$F))*100, digits=2)
ggplot(tickets_gender, aes(age, freq, group=gender, fill=gender)) + ggtitle("Tickets by gender(percent)") + geom_area(position="fill")
ggplot(tickets_gender, aes(age, y=freq, group=gender, colour=gender)) +
ggtitle("Tickets by age total") +
geom_line() +
geom_point()
incidents$Race <- paste(incidents$SubjectRaceCode, incidents$SubjectEthnicityCode)
index_race <- c("A H", "A M", "A N", "B H", "B M", "B N",
"I H", "I M", "I N", "W H", "W M", "W N")
values_race <- c("Hispanic", "Middle Eastern", "Asian", "Hispanic", "Middle Eastern", "Black",
"Hispanic", "Middle Eastern", "Indian", "Hispanic", "Middle Eastern", "White")
incidents$Def_Race <- values_race[match(incidents$Race, index_race)]
race_df <- data.frame(table(incidents$Def_Race, incidents$What))
colnames(race_df) <- c("Race", "Type", "Freq")
race_df_spread <- race_df %>% spread(Type, Freq)
race_df_spread$perc_ticket <- round((race_df_spread$Ticket/(race_df_spread$Ticket+race_df_spread$Warning))*100, digits=2)
race_df_spread$perc_warning <- round((race_df_spread$Warning/(race_df_spread$Ticket+race_df_spread$Warning))*100, digits=2)
write.csv(race_df_spread, "race_warnings.csv")
kable(race_df_spread)
Race | Ticket | Warning | perc_ticket | perc_warning |
---|---|---|---|---|
Asian | 3609 | 3098 | 53.81 | 46.19 |
Black | 44167 | 31927 | 58.04 | 41.96 |
Hispanic | 44493 | 24884 | 64.13 | 35.87 |
Indian | 655 | 499 | 56.76 | 43.24 |
Middle Eastern | 22384 | 16648 | 57.35 | 42.65 |
White | 216325 | 185910 | 53.78 | 46.22 |