We use R at Trend CT not just for data analysis and creating data visualizations, but also for working with spatial analysis and creating geographic graphics.

If you have a data set with latitude and longitude data, it’s easy to just throw it on a map with a dot for every instance. But really what does it serve? You see the intensity of the cluster of dots over the area but that’s it. If there’s no context or explanation it’s a flashy visualization and that’s it.

This tutorial will show you how to dig deeper and tell better stories with location data.

We’ll be working with traffic stop data.

We’ll figure out which town and census tract each stop occurred in and then pull in demographic data from the Census to determine what types of neighborhoods police tend to pull people over more often.

You could conduct this analysis using software like ArcGIS or QGIS but we’re going to be doing it all in R.

It is better to stay in a single environment from data importing, to analysis, to exporting visualizations because the produced scripts make it easier for others (including your future self) to replicate and verify your work in the future.

Difficulty level

Advanced. We’ll be using many packages, including dplyr (for pipes, as well as working with shape files and census data. You should have a basic understanding of these concepts before continuing.

Go through the beginner tutorial to get R and RStudio installed before you begin.

Download the dataset, set the working directory, and follow along. You can also skip straight to the full script.

Importing and preparing the data

# Bring in the data
stops <- read.csv("https://github.com/trendct/walkthroughs/raw/master/spatial-analysis-r-walkthrough/data/hamden_stops.csv", stringsAsFactors=FALSE)

# Check and eliminate the rows that don't have location information 
stops <- stops[!is.na(stops$InterventionLocationLatitude),]

# Add a column to identifying a driver as white or a minority
stops$ethnicity <- ifelse(((stops$SubjectRaceCode ==  "W") & (stops$SubjectEthnicityCode =="N")), "White", "Minority")

Importing the shapefiles

# Bring in the shape files for census tracts

require(rgdal)

# dsn is the folder the shape files are in. layer is the name of the file.
towntracts <- readOGR(dsn="shapes", layer="census_tracts")
## OGR data source with driver: ESRI Shapefile 
## Source: "shapes", layer: "census_tracts"
## with 829 features
## It has 14 fields
# creating a copy
towntracts_only <- towntracts

# turn the shapefile into a dataframe that can be worked on in R
require(maptools)
require(ggplot2)

towntracts <- fortify(towntracts, region="GEOID10")

# So now we have towntracts and towntracts_only
# What's the difference? towntracts is a dataframe and can be seen easily
library(knitr)
kable(head(towntracts))
long lat order hole piece id group
-73.66336 41.04170 1 FALSE 1 09001010101 09001010101.1
-73.66348 41.04147 2 FALSE 1 09001010101 09001010101.1
-73.66362 41.04115 3 FALSE 1 09001010101 09001010101.1
-73.66410 41.04023 4 FALSE 1 09001010101 09001010101.1
-73.66428 41.03983 5 FALSE 1 09001010101 09001010101.1
-73.66451 41.03934 6 FALSE 1 09001010101 09001010101.1
# It's for analyzing data and performing joins and calculations

# While towntracts_only is a Large SpatialPolygonsDataFrame

# It's for rendering the spatial data in R graphically

Mapping the data

# We only need the columns with the latitude and longitude
coords <- stops[c("InterventionLocationLongitude", "InterventionLocationLatitude")]

# Making sure we are working with rows that don't have any blanks
coords <- coords[complete.cases(coords),]

library(sp)

# Letting R know that these are specifically spatial coordinates
sp <- SpatialPoints(coords)

# Applying projections to the coordinates so they match up with the shapefile we're joining them with
# More projections information http://trac.osgeo.org/proj/wiki/GenParms 
proj4string(sp) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
proj4string(sp)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
# Rendering the census tracts
plot(towntracts_only)

# Adding the coordinates of the traffic stops
plot(sp, col="red" , add=TRUE)

Points in a polygon

# Calculating points in a polygon

by_tract <- over(sp, towntracts_only)

# What just happened: Every point in the list now has a corresponding census tract 
kable(head(by_tract, 5))
STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10 MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 GEOID_AFF1 GEOID_AFF2
09 009 166001 09009166001 1660.01 Census Tract 1660.01 G5020 S 4071781 0 +41.3897855 -072.9079481 14000US09009166001 140000US09009166001
09 009 165500 09009165500 1655 Census Tract 1655 G5020 S 1558578 0 +41.3404053 -072.9338173 14000US09009165500 140000US09009165500
09 009 165600 09009165600 1656 Census Tract 1656 G5020 S 2347326 0 +41.3558996 -072.9323558 14000US09009165600 140000US09009165600
09 009 141900 09009141900 1419 Census Tract 1419 G5020 S 1301209 34505 +41.3223483 -072.9106575 14000US09009141900 140000US09009141900
09 009 165900 09009165900 1659 Census Tract 1659 G5020 S 27028863 42903 +41.4292195 -072.9366177 14000US09009165900 140000US09009165900
# Use dplyr to gather up and count how many instances of census tracts there are

require(dplyr)

by_tract <- by_tract %>%
  group_by(GEOID10) %>%
  summarise(total=n())

# Get rid of the census tracts with no data
by_tract <- by_tract[!is.na(by_tract$GEOID10),]

kable(head(by_tract,5))
GEOID10 total
09003400200 1
09009141300 7
09009141400 3
09009141500 96
09009141600 1
# Rename the columns of this datframe so it can be joined to future data
colnames(by_tract) <- c("id", "total")

# Changing the GEOID number to character so it can be joined to future data
by_tract$id <- as.character(by_tract$id)

# Bring in a dataframe that has matches census tract ID numbers to town names
tracts2towns <- read.csv("data/tracts_to_towns.csv", stringsAsFactors=FALSE)

kable(head(tracts2towns, 5))
tract town
9001010101 Greenwich
9001010102 Greenwich
9001010201 Greenwich
9001010202 Greenwich
9001010300 Greenwich
# Changing the column names so it can be joined to the by_tract dataframe
colnames(tracts2towns) <- c("id", "town_name")

# Changing the GEOID number to character so it can be joined to the by_tract dataframe
tracts2towns$id <- as.character(tracts2towns$id)

# Adding a 0 to the front of the GEOID string because it was originally left out when it was imported
tracts2towns$id <- paste0("0", tracts2towns$id)

# Bringing in a library to deal with strings
library(stringr)

# Eliminating leading and trailing white space just in case
tracts2towns$town_name <- str_trim(tracts2towns$town_name)

# Joining the by_tract dataframe to the tracts2towns dataframe

by_tract <- left_join(by_tract, tracts2towns)

kable(head(by_tract,5))
id total town_name
09003400200 1 Berlin
09009141300 7 New Haven
09009141400 3 New Haven
09009141500 96 New Haven
09009141600 1 New Haven
# Why did we do this? Because Hamden sometimes made traffic stops outside of its jurisdiction. Now we can tell for sure which towns police overextended themselves.

Making a choropleth

# Now we can finally map it

# Join the by_tract points to polygon dataframe to the original census tracts dataframe
total_map <- left_join(towntracts, by_tract)


require(ggmap)
require(scales)

tm_ct <- ggplot() +
  geom_polygon(data = total_map, aes(x=long, y=lat, group=group, fill=total), color = "black", size=0.2) +
  geom_polygon(data = total_map, aes(x=long, y=lat, group=group, fill=total), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Reds", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Where Hamden police conduct traffic stops", fill="")
print(tm_ct)

# Excellent. This is a great start but we have a lot of gray tracts. 
# Let's try again but without the gray.

# Filter out the tracts with NA in the total column

total_map <- subset(total_map, !is.na(total))

tm_ct <- ggplot() +
  geom_polygon(data = total_map, aes(x=long, y=lat, group=group, fill=total), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Reds", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Where Hamden police conduct traffic stops", fill="")
print(tm_ct)

# Ok, much better. But we're still unclear which part is Hamden and which are parts of other towns.

townborders <- readOGR(dsn="shapes", layer="ctgeo")
## OGR data source with driver: ESRI Shapefile 
## Source: "shapes", layer: "ctgeo"
## with 169 features
## It has 6 fields
townborders_only <- townborders
townborders<- fortify(townborders, region="NAME10")

# Subset the town borders to just Hamden since that's the department we're looking at
town_borders <- subset(townborders, id=="Hamden")

tm_ct <- ggplot() +
  geom_polygon(data = total_map, aes(x=long, y=lat, group=group, fill=total), color = "black", size=0.2) +
  geom_polygon(data = town_borders, aes(x=long, y=lat, group=group, fill=total), color = "black", fill=NA, size=0.5) +
  coord_map() +
  scale_fill_distiller(type="seq", trans="reverse", palette = "Reds", breaks=pretty_breaks(n=10)) +
  theme_nothing(legend=TRUE) +
  labs(title="Where Hamden police conduct traffic stops", fill="")
print(tm_ct)

# That's much clearer
# Before we move on, we need to also calculate the number of minority stops in Hamden census tracts

coords <- subset(stops, ethnicity=="Minority")
coords <- coords[c("InterventionLocationLongitude", "InterventionLocationLatitude")]
coords <- coords[complete.cases(coords),]
sp <- SpatialPoints(coords)
proj4string(sp) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
proj4string(sp)
## [1] "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
m_tract <- over(sp, towntracts_only)

m_tract <- m_tract %>%
  group_by(GEOID10) %>%
  summarise(minority=n())

m_tract <- m_tract[!is.na(m_tract$GEOID10),]
colnames(m_tract) <- c("id", "minority")
m_tract$id <- as.character(m_tract$id)

joined_tracts <- left_join(by_tract, m_tract)
## Joining by: "id"
kable(head(joined_tracts))
id total town_name minority
09003400200 1 Berlin NA
09009141300 7 New Haven 6
09009141400 3 New Haven 2
09009141500 96 New Haven 79
09009141600 1 New Haven 1
09009141800 6 New Haven 4
# Calculating percent of stops that were minorities

joined_tracts$minority_p <- round(joined_tracts$minority/joined_tracts$total*100,2)
kable(head(joined_tracts))
id total town_name minority minority_p
09003400200 1 Berlin NA NA
09009141300 7 New Haven 6 85.71
09009141400 3 New Haven 2 66.67
09009141500 96 New Haven 79 82.29
09009141600 1 New Haven 1 100.00
09009141800 6 New Haven 4 66.67

Importing census data

# We'll be using the censusapi package from Hanna Recht
# https://github.com/hrecht/censusapi
# Some documentation
# http://urbaninstitute.github.io/R-Trainings/accesing-census-apis/presentation/index.html

# If you do not yet have the censusapi package installed, uncomment the lines below and run them.

#install.packages("devtools")
#devtools::install_github("hrecht/censusapi")
library("censusapi")

# Loading my census key from an external script
source("keys.R")

# Replace census_key below with "your_own_key_whatever_it_is"
# Apply for one here http://api.census.gov/data/key_signup.html

race_tracts <- getCensus(name="acs5",
    vintage=2014,
    key=census_key,
    vars=c("NAME", "B02001_001E", "B02001_002E"),
    region="tract:*", regionin="state:09")

# What did we just do?

# I pulled the following population data for all census tracts in state 09, which is Connecticut

# B02001_001E - Total
# B02001_002E - White alone

kable(head(race_tracts))
NAME state county tract B02001_001E B02001_002E
Census Tract 101.01, Fairfield County, Connecticut 09 001 010101 4392 4050
Census Tract 101.02, Fairfield County, Connecticut 09 001 010102 4134 3800
Census Tract 102.01, Fairfield County, Connecticut 09 001 010201 3387 2928
Census Tract 102.02, Fairfield County, Connecticut 09 001 010202 5066 4447
Census Tract 103, Fairfield County, Connecticut 09 001 010300 4209 3846
Census Tract 104, Fairfield County, Connecticut 09 001 010400 5701 4968
# ok, let's clean this up
race_tracts$NAME <- NULL

# Creating a new column for the GEOID that can be joined with the dataframe we already have
race_tracts$id <- paste0(race_tracts$state, race_tracts$county, race_tracts$tract)

# Renaming the column names for clarity
colnames(race_tracts) <- c("state_code", "county_code", "tract_code", "total_pop", "white_pop", "id")

Calculating disparity

# Making some calculations

# Determining the minority population by subtracting the white population from the total
race_tracts$minority_pop <- race_tracts$total_pop - race_tracts$white_pop

# Now figuring out the percent makeup of each census tract
race_tracts$white_pop_p <- round(race_tracts$white_pop/race_tracts$total_pop*100,2)
race_tracts$minority_pop_p <- round(race_tracts$minority_pop/race_tracts$total_pop*100,2)

kable(head(race_tracts,5))
state_code county_code tract_code total_pop white_pop id minority_pop white_pop_p minority_pop_p
09 001 010101 4392 4050 09001010101 342 92.21 7.79
09 001 010102 4134 3800 09001010102 334 91.92 8.08
09 001 010201 3387 2928 09001010201 459 86.45 13.55
09 001 010202 5066 4447 09001010202 619 87.78 12.22
09 001 010300 4209 3846 09001010300 363 91.38 8.62
# Joining the two datframes
joined_tracts <- left_join(joined_tracts, race_tracts)

kable(head(joined_tracts,5))
id total town_name minority minority_p state_code county_code tract_code total_pop white_pop minority_pop white_pop_p minority_pop_p
09003400200 1 Berlin NA NA 09 003 400200 5893 5592 301 94.89 5.11
09009141300 7 New Haven 6 85.71 09 009 141300 5951 2984 2967 50.14 49.86
09009141400 3 New Haven 2 66.67 09 009 141400 4968 1051 3917 21.16 78.84
09009141500 96 New Haven 79 82.29 09 009 141500 6166 353 5813 5.72 94.28
09009141600 1 New Haven 1 100.00 09 009 141600 4884 1084 3800 22.19 77.81
# Calculating disparity between minority traffic stops and population

joined_tracts$min_disp <- joined_tracts$minority_p - joined_tracts$minority_pop_p

kable(head(joined_tracts[c("tract_code", "min_disp")]))
tract_code min_disp
400200 NA
141300 35.85
141400 -12.17
141500 -11.99
141600 22.19
141800 11.23

Visualizing geographic disparity

mapping_disparity <- left_join(towntracts, joined_tracts)
mapping_disparity <- subset(mapping_disparity, !is.na(min_disp))

# A library for color scales

pm_ct <- ggplot() 
pm_ct <- pm_ct + geom_polygon(data = mapping_disparity, aes(x=long, y=lat, group=group, fill=min_disp/100), color="white", size=.25)
pm_ct <- pm_ct + geom_polygon(data = town_borders, aes(x=long, y=lat, group=group), fill=NA, color = "black", size=0.5)
pm_ct <- pm_ct + coord_map() 
pm_ct <- pm_ct + scale_fill_distiller(type="seq", trans="reverse", palette = "PuOr", label=percent, breaks=pretty_breaks(n=10), name="Gap") 
pm_ct <- pm_ct + theme_nothing(legend=TRUE) 
pm_ct <- pm_ct + labs(x=NULL, y=NULL, title="Hamden: Minority traffic stops versus population")
pm_ct <- pm_ct + theme(text = element_text(size=15))
pm_ct <- pm_ct + theme(plot.title=element_text(face="bold", hjust=.4))
pm_ct <- pm_ct + theme(plot.subtitle=element_text(face="italic", size=9, margin=margin(l=20)))
pm_ct <- pm_ct + theme(plot.caption=element_text(size=12, margin=margin(t=12), color="#7a7d7e", hjust=0))
pm_ct <- pm_ct + theme(legend.key.size = unit(1, "cm"))
print(pm_ct)

Visualizing geographic disparity

With annotations

pm_ct <- ggplot() 
pm_ct <- pm_ct + geom_polygon(data = mapping_disparity, aes(x=long, y=lat, group=group, fill=min_disp/100), color="white", size=.25)
pm_ct <- pm_ct + geom_polygon(data = town_borders, aes(x=long, y=lat, group=group), fill=NA, color = "black", size=0.5)
pm_ct <- pm_ct + coord_map() 
pm_ct <- pm_ct + scale_fill_distiller(type="seq", trans="reverse", palette = "PuOr", label=percent, breaks=pretty_breaks(n=10), name="Gap") 
pm_ct <- pm_ct + theme_nothing(legend=TRUE) 
pm_ct <- pm_ct + labs(x=NULL, y=NULL, title="Hamden: Minority traffic stops versus population")
pm_ct <- pm_ct + theme(text = element_text(size=15))
pm_ct <- pm_ct + theme(plot.title=element_text(face="bold", hjust=.4))
pm_ct <- pm_ct + theme(plot.subtitle=element_text(face="italic", size=9, margin=margin(l=20)))
pm_ct <- pm_ct + theme(plot.caption=element_text(size=12, margin=margin(t=12), color="#7a7d7e", hjust=0))
pm_ct <- pm_ct + theme(legend.key.size = unit(1, "cm"))

# Annotations 

pm_ct <- pm_ct + annotate("segment", x = -72.93, xend = -72.87, y = 41.325, yend = 41.325, colour = "lightblue", size=.5) 
pm_ct <- pm_ct + annotate("point", x = -72.93, y = 41.325, colour = "lightblue", size = 2) 
pm_ct <- pm_ct + annotate("text", x = -72.85, y = 41.325, label = "New Haven", size=5, colour="gray30") 
pm_ct <- pm_ct + annotate("segment", x = -72.89, xend = -72.86, y = 41.375, yend = 41.375, colour = "lightblue", size=.5) 
pm_ct <- pm_ct + annotate("point", x = -72.89, y = 41.375, colour = "lightblue", size = 2) 
pm_ct <- pm_ct + annotate("text", x = -72.845, y = 41.375, label = "Hamden", size=5, colour="gray30") 
pm_ct <- pm_ct + annotate("point", x = -72.83, y = 41.375, colour="white", size=.2) 

print(pm_ct)