correlator

An analysis for the TrendCT quiz: Quiz: What correlates with traffic tickets per town?

The traffic ticket data is from the Connecticut Racial Profiling Prohibition Project hosted at the CT Data Collaborative which collects traffic incident reports between October 2013 and September 2014.

The vehicle density and traffic count data is from the Department of Transportation.

Population, town area, commuter count, and median income data is from the US Census.

The business revenue and taxes data is from the Connecticut Data Portal.

The Dunkin’ Donuts data was scraped from the Dunkin’ Donuts website.

Just interested in the big joined Tickets dataframe? Here’s the data.

library(stringr)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ctnamecleaner)
library(tidyr)
library(ggplot2)
library(DT)

Bringing in the data, cleaning it, setting it up for analysis

#Bringing in the traffic count statistics from the Department of Transportation
traffic <- read.csv("data/traffic_count.csv", stringsAsFactors=FALSE)

traffic_towns <- tapply(traffic$ADT, traffic$Town_Name, mean)

towns <- data.frame(traffic_towns)

towns$town <- row.names(towns)
towns <- towns[c("town", "traffic_towns")]
colnames(towns) <- c("town", "ADT Average")
roads <- data.frame(table(traffic$Town_Name))
colnames(roads) <- c("town", "roads")

towns <- left_join(towns, roads)

## Joining by: "town"

## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector

colnames(towns) <- c("town", "ADT", "roads")
towns$ADT <- round(towns$ADT, digits=2)
types <- data.frame(table(traffic$Town_Name, traffic$FClass))
colnames(types) <- c("town", "ftype", "number")
types$ftype <- as.character(types$ftype)
types$ftype <- gsub("1", "f1", types$ftype)
types$ftype <- gsub("2", "f2", types$ftype)
types$ftype <- gsub("3", "f3", types$ftype)
types$ftype <- gsub("4", "f4", types$ftype)
types$ftype <- gsub("5", "f5", types$ftype)
types$ftype <- gsub("6", "f6", types$ftype)
types$ftype <- gsub("7", "f7", types$ftype)
types <- spread(types, ftype, number)

towns <- left_join(towns, types)

## Joining by: "town"

## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector

towns$town <- str_trim(towns$town)
dmvt <- read.csv("data/dmvt.csv", stringsAsFactors=FALSE)
dmvt$town <- str_to_title(dmvt$Town)
dmvt$DVMT <- gsub(",", "", dmvt$DVMT)
dmvt$DVMT <- as.numeric(dmvt$DVMT)
dmvt <- dmvt[,-1]
dmvt <- dmvt[,-1]

towns <- left_join(towns, dmvt)

## Joining by: "town"

#

  # Bringing in square miles per town
  area <- read.csv("data/area.csv", stringsAsFactors=FALSE)
colnames(area) <- c("town", "sqmiles")
area$town <- str_trim(area$town)
towns <- left_join(towns, area)

## Joining by: "town"

# Bringing in population stats
towns <- ctpopulator(town, towns)

## [1] "Checking to see if names match..."

## Joining by: "name2"

towns$density <- towns$pop2013/towns$sqmiles

#

# Bring in retail sales by town
sales <- read.csv("data/salestax.csv", stringsAsFactors=FALSE)
salestx <- subset(sales, NAICS.Industry.Code=="All NAICS Codes")
sales2014 <- subset(salestx, Calendar.Year==2014)
retail <- tapply(sales2014$Retail.Sales.of.Goods, sales2014$Municipality, sum)
write.csv(retail, "data/temp_retail.csv")
retail <- read.csv("data/temp_retail.csv", stringsAsFactors=FALSE)
file.remove("data/temp_retail.csv")

## [1] TRUE

colnames(retail) <- c("town", "retail.tax")
retail <- retail[-1,]
retail$town <- str_trim(retail$town)
towns <- left_join(towns, retail)

## Joining by: "town"

#retail$town <- rownames(retail)
#colnames(retail) <- c("retail.sales", "town")
#retail <- retail[c("town", "retail.sales")]
#towns2 <- left_join(retail, towns)
#towns <- left_join(retail, towns)


taxes <- tapply(sales2014$Total.Tax.Due.at.6.35., sales2014$Municipality, sum)
write.csv(taxes, "data/temp_taxes.csv")
taxes <- read.csv("data/temp_taxes.csv", stringsAsFactors=FALSE)
file.remove("data/temp_taxes.csv")

## [1] TRUE

colnames(taxes) <- c("town", "total.tax")
taxes <- taxes[-1,]
taxes$town <- str_trim(taxes$town)

towns <- left_join(towns, taxes)

## Joining by: "town"

# Bringing in commuters stats from the US Census
commuters <- read.csv("data/commuters.csv", stringsAsFactors=FALSE)

commuters_filtered <- commuters[c("name", "B08006001...Total.", "B08006002...Car..truck..or.van.")]
commuters_filtered <- commuters_filtered[-1,]
commuters_filtered <- commuters_filtered[-1,]

colnames(commuters_filtered) <- c("town","commute_total", "drivers")
commuters_filtered$town <- sub(" town,.*", "", commuters_filtered$town)
commuters_filtered$percent.drivers <- round((commuters_filtered$drivers/commuters_filtered$commute_total)*100, digits=2)
commuters_filtered$town <- str_to_upper(commuters_filtered$town)

towns <- left_join(towns, commuters_filtered)

## Joining by: "town"

towns$percent.drivers.all <- round((towns$drivers/towns$pop2013)*100, digits=2)

  # Bringing in median income data 
  income <- read.csv("data/medianincome.csv", stringsAsFactors=FALSE)
income <- income[,-1]
income <- income[-1,]
colnames(income) <- c("town", "median.income", "error")
income <- income[c("town", "median.income")]
income$town <- sub(" town,.*", "", income$town)
income$town <- str_to_upper(income$town)

towns <- left_join(towns, income)

## Joining by: "town"

# Bringing in the tickets
incidents <- read.csv("http://ctrp3viz.s3.amazonaws.com/data/Connecticut_r1.csv", stringsAsFactors=FALSE)
warnings <- data.frame(table(incidents$Department.Name, incidents$InterventionDispositionCode))
colnames(warnings) <- c("Department", "Disposition", "Freq")
warnings_df <- warnings %>% spread(Disposition, Freq)
warnings_df$total <- warnings_df$I + warnings_df$M + warnings_df$N + warnings_df$U + warnings_df$V + warnings_df$W
warnings_df$warnings <- warnings_df$V + warnings_df$W
warnings_df$tickets <- warnings_df$total - warnings_df$warnings

warnings_df$perc_warnings <- round((warnings_df$warnings/warnings_df$total)*100, digits=2)
warnings_df$perc_tickets <- 100 - warnings_df$perc_warnings
warnings_df_table <- warnings_df[c("Department", "total", "tickets", "warnings", "perc_tickets", "perc_warnings")]

#exclude Stamford because of unreliable data

warnings_df_table <- filter(warnings_df_table, Department!="Stamford")

tix <- warnings_df_table

colnames(tix) <- c("town", "stops", "tickets", "warnings", "perc_tickets", "perc_warnings")
tix$town <- str_to_upper(tix$town)

tix_df <- left_join(towns, tix)

## Joining by: "town"

tix_df <- na.omit(tix_df)

# Bringin police department staffing data
pers <- read.csv("data/police_dept.csv", stringsAsFactors=FALSE)
pers$sworn <- pers$sworn.male+ pers$sworn.female
incidents_by_dept <- data.frame(table(incidents$Department.Name))
colnames(incidents_by_dept) <- c("name", "incidents")
incidents_by_dept$name <- toupper(incidents_by_dept$name)

incidents_by_dept <- left_join(incidents_by_dept, pers)

## Joining by: "name"

incidents_by_dept$inc.rate <- round((incidents_by_dept$incidents/incidents_by_dept$sworn), digits=2)

dept_staff <- incidents_by_dept[c("name", "sworn.male", "sworn.female", "sworn", "total")]
colnames(dept_staff) <- c("town", "sworn.male", "sworn.female", "sworn", "dept.total")

tix_df <- left_join(tix_df, dept_staff)

## Joining by: "town"

tix_only <- tix_df[,25]
tix_save <- tix_df

colnames(tix_df) <- c("town", "Average daily traffic", "Number of roads", 
                      "Number of interstate roads", "Number of arterials: Freeways and expressways",
                      "Number of arterials: Other", "Minor arterial roads", "Major collector roads",
                      "Minor collector roads", "Local roads", "Daily vehicle miles traveled", 
                      "Miles of road", "Mileage", "Town square miles", "Town population", 
                      "Population per square mile", "Retail tax 2014", "Total tax 2014",
                      "Total commuters", "Total drivers", "Drivers as a percent of commuters", 
                      "Drivers as a percent of total population", "Town median income", 
                      "Total traffic stops in 2014", "Total tickets issued in 2014", "Total warnings issued in 2014", 
                      "Percent of tickets given out of all stops", 
                      "Percent of warnings given out of all stops", "Sworn male officers", 
                      "Sworn female officers", "Total sworn officers", "Total police department employees")

write.csv(tix_df, "data/big_tickets_dataframe.csv")
datatable(tix_df)

Generating a list of correlations to tickets by town

tix_list <- 2:ncol(tix_df)
for (i in tix_list) {
  row_name <- colnames(tix_df[i])
  corre <- cor(tix_only, tix_df[,i], use="pairwise.complete.obs")
  loop_array <- data.frame(row_name, corre)
  if (i == 2) {
    array <- loop_array } else 
    { array <- rbind(array, loop_array) }
}
write.csv(array, "data/tickets_correlation_list.csv")
datatable(array)

Plot it out. Comparing tickets

for (i in tix_list) {
title <- paste("Tickets vs ", array$row_name[i-1], ": ", round(array$corre[i-1], digits=2))
p <- ggplot(tix_df, aes(tix_df[,25], tix_df[,i])) + geom_point() + ggtitle(title) + ylab(array$row_name[i-1]) + xlab("Tickets")
plot(p)
}

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

Comparing warnings

warn_only <- tix_df[,26]
for (i in tix_list) {
  row_name <- colnames(tix_df[i])
  corre <- cor(warn_only, tix_df[,i], use="pairwise.complete.obs")
  loop_array <- data.frame(row_name, corre)
  if (i == 2) {
    array <- loop_array } else 
    { array <- rbind(array, loop_array) }
}

for (i in tix_list) {
  title <- paste("Warnings vs ", array$row_name[i-1], ": ", round(array$corre[i-1], digits=2))
  p <- ggplot(tix_df, aes(tix_df[,26], tix_df[,i])) + geom_point() + ggtitle(title) + ylab(array$row_name[i-1]) + xlab("Warnings")
plot(p)
}

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

Comparing traffic stops total

stops_only <- tix_df[,24]
for (i in tix_list) {
  row_name <- colnames(tix_df[i])
  corre <- cor(stops_only, tix_df[,i], use="pairwise.complete.obs")
  loop_array <- data.frame(row_name, corre)
  if (i == 2) {
    array <- loop_array } else 
    { array <- rbind(array, loop_array) }
}

for (i in tix_list) {
  title <- paste("Traffic stops vs ", array$row_name[i-1], ": ", round(array$corre[i-1], digits=2))
  p <- ggplot(tix_df, aes(tix_df[,24], tix_df[,i])) + geom_point() + ggtitle(title) + ylab(array$row_name[i-1]) + xlab("Traffic stops")
plot(p)
}

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

Dunkin Donuts

dunks <- read.csv("data/dunks.csv", stringsAsFactors=FALSE)
dunks <- ctnamecleaner(Towns, dunks, case="upper")

## Joining by: "name2"

## [1] "...All names matched. That's a rare thing."

colnames(dunks) <- c("d.town", "dunkinrate","dunks", "town")
dunks$town <- str_to_upper(dunks$town)
dunkin <- left_join(tix_df, dunks)

## Joining by: "town"

dunkin <- na.omit(dunkin)

dcorr <- cor(dunkin$dunks, dunkin[,25])

  title <- paste("Traffic stops vs Dunkin' Donuts: ", round(dcorr, digits=2))
  p <- ggplot(dunkin, aes(dunkin[,25], dunkin$dunks)) + geom_point() + ggtitle(title) + ylab("Dunkin' Donut stores") + xlab("Tickets")
plot(p)

Correlograms

library(corrgram)
txd1 <- tix_save[c("tickets", "ADT", "roads", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "DVMT", "Lane.Miles", "Mileage")]

corrgram(txd1, order=TRUE, lower.panel=panel.shade,
  upper.panel=panel.pie, text.panel=panel.txt,
  main="Tickets and roads")

txd2 <- tix_save[c("tickets", "sqmiles", "pop2013", "density", "retail.tax", "total.tax", "commute_total", "drivers", "percent.drivers", "percent.drivers.all", "median.income")]

corrgram(txd2, order=TRUE, lower.panel=panel.shade,
  upper.panel=panel.pie, text.panel=panel.txt,
  main="Tickets and taxes and drivers and income")

txd3 <- tix_save[c("tickets", "stops", "warnings", "perc_tickets", "perc_warnings", "sworn.male", "sworn.female", "sworn", "dept.total")]
  
corrgram(txd3, order=TRUE, lower.panel=panel.shade,
  upper.panel=panel.pie, text.panel=panel.txt,
  main="Tickets and warnings and stops and staffing")