This analysis accompanies the TrendCT.org story: Dunkin’ Donuts owns name rights to Hartford ball park, already owns New England

Here’s the raw data we pulled and cleaned from DunkinDonuts.com.
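
A quick way to sanity-check the cleaned file before mapping; the city, state, lat and lng columns referenced here are the ones the rest of the code relies on:

dunk <- read.csv("dunkindonuts.csv", stringsAsFactors=FALSE)
str(dunk)
head(dunk[c("city", "state", "lat", "lng")])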


Let’s start by mapping all of the Dunkin’ Donuts locations

library(leaflet)
dunk <- read.csv("dunkindonuts.csv", stringsAsFactors=FALSE)
# Plot every store as a small orange dot on CartoDB's dark basemap
leaflet(dunk) %>% addTiles('http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png', attribution='Map tiles by <a href="http://cartodb.com/attributions">CartoDB</a> &mdash; Map data &copy; <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors') %>% 
  setView(-98.557087, 39.810502, zoom = 4) %>% 
  addCircles(~lng, ~lat, weight = 1, radius=3, 
                 color="#ffa500", stroke = FALSE, fillOpacity = 0.8)


Let’s look specifically at Connecticut


library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ctnamecleaner)
library(stringr)
# Count stores by city in Connecticut
ct <- dunk %>% filter(state=="CT") %>% count(city)

ctnamecleaner(city, ct)
## Joining by: "name2"
## [1] "...All names matched. That's a rare thing."
## Congrats. The new dataframe is called CTDATA. 
## Don't forget to collapse the duplicate rows and sum/average the numeric columns.
# Collapse duplicate rows: sum the store counts by cleaned town name
ct <- data.frame(tapply(CTDATA$n, CTDATA$real.town.name, sum))
ct$town <- rownames(ct)
colnames(ct) <- c("Dunkins", "id")

ctpopulator(id, ct)
## [1] "Checking to see if names match..."
## Joining by: "name2"
## [1] "Congrats. The new dataframe is called CTPOPULATED."
ct <- CTPOPULATED
# Stores per 10,000 residents, using the 2013 population estimates
ct$percapita <- (ct$Dunkins/ct$pop2013)*10000
ct$percapita <- round(ct$percapita, digits=2)
ct$id <- str_to_title(ct$id)
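
A quick look at the top of the per-capita list before mapping it:

# Connecticut towns with the most stores per 10,000 residents
head(ct[order(-ct$percapita), c("id", "Dunkins", "percapita")])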

require(gtools)
## Loading required package: gtools
require(ggplot2)
## Loading required package: ggplot2
require(rgdal)
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 0.9-3, (SVN revision 530)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
##  Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
##  Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
##  Linking to sp version: 1.1-0
require(scales)
## Loading required package: scales
require(ggmap)
## Loading required package: ggmap
require(Cairo)
## Loading required package: Cairo
require(gpclib)
## Loading required package: gpclib
## General Polygon Clipper Library for R (version 1.5-5)
##  Type 'class ? gpc.poly' for help
require(maptools)
## Loading required package: maptools
## Checking rgeos availability: TRUE
require(reshape)
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:dplyr':
## 
##     rename
gpclibPermit()
## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
towntracts <- readOGR(dsn="townsmap", layer="towns")
## OGR data source with driver: ESRI Shapefile 
## Source: "townsmap", layer: "towns"
## with 169 features
## It has 20 fields
# Flatten the shapefile into a data frame ggplot can draw, keyed by town name
towntracts <- fortify(towntracts, region="NAME10")

Dunk_Data <- left_join(towntracts, ct)
## Joining by: "id"
Dunk_Data$Dunkins <- as.numeric(Dunk_Data$Dunkins)
Dunk_Data$percapita <- as.numeric(Dunk_Data$percapita)

dd <- ggplot() +
  geom_polygon(data = Dunk_Data, aes(x=long, y=lat, group=group, 
                                         fill=percapita), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", palette = "Reds", breaks=pretty_breaks(n=5)) +
  theme_nothing(legend=TRUE) +
  labs(title="Dunkin' Donuts stores per 10,000 residents", fill="")
dd

dd_all <- ggplot() +
  geom_polygon(data = Dunk_Data, aes(x=long, y=lat, group=group, 
                                         fill=Dunkins), color = "black", size=0.2) +
  coord_map() +
  scale_fill_distiller(type="seq", palette = "Reds", breaks=pretty_breaks(n=5)) +
  theme_nothing(legend=TRUE) +
  labs(title="Total Dunkin' Donuts by town", fill="")
dd_all


Analysis for Connecticut

library(DT)
ct <- ct[c("id", "pop2013", "Dunkins", "percapita")]
colnames(ct) <- c("Town", "Population", "Dunkins", "Per Capita")
datatable(ct)


Expanding scope to New England

I had to do this part of the analysis outside of R. I used QGIS to run a spatial join counting the number of stores within each town’s boundaries, joined that with population counts, and brought the resulting spreadsheet into R below.
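
For reference, roughly the same spatial join can be done in R with the sp and rgdal packages already loaded above. This is only a sketch: the netownsmap directory, netowns layer and NAME10 field are stand-ins for whatever a combined New England towns shapefile actually uses.

# Hypothetical New England towns shapefile
netowns <- readOGR(dsn = "netownsmap", layer = "netowns")
# Turn the store list into spatial points and match the shapefile's projection
pts <- dunk
coordinates(pts) <- ~lng + lat
proj4string(pts) <- CRS("+proj=longlat +datum=WGS84")
pts <- spTransform(pts, CRS(proj4string(netowns)))
# For each store, pull the attributes of the town polygon it falls inside
hits <- over(pts, netowns)
# Count stores per town
store_counts <- as.data.frame(table(hits$NAME10))
colnames(store_counts) <- c("Towns", "Dunkin.Donuts")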

dc <- read.csv("DunkinCount.csv", stringsAsFactors=FALSE)
dc_table <- dc[c("Towns", "State", "Population", "Dunkin.Donuts","Per.1.000.residents")]
colnames(dc_table) <- c("Town", "State","Population","Dunkins", "Per 1,000 residents")
datatable(dc_table)


Is there a correlation between a town’s population and its number of Dunkin’ Donuts?

ddne <- ggplot(data=dc, aes(x=Dunkin.Donuts, y=Population)) +
  geom_point(aes(text=paste(Towns, State, sep=", ")))+ 
               geom_smooth(method=lm, formula = y ~ poly(x, 2), size=1) +
               theme_minimal() +
               ggtitle("Dunkin' Donuts versus town population in New England") +
               labs(y="Population", x="Dunkin' Donuts")
ddne

That outlier in the top right is Boston.
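
A quick way to confirm, using the columns from DunkinCount.csv:

# The most populous town in the data should be Boston
dc[which.max(dc$Population), c("Towns", "State", "Population", "Dunkin.Donuts")]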

We’ll readjust by taking out the outlier and refitting the polynomial model.

dcnotboston <- filter(dc, Towns!="Boston")

p <- ggplot(data=dcnotboston, aes(x=Dunkin.Donuts, y=Population)) +
  geom_point(aes(text=paste(Towns, State, sep=", ")))+ 
  geom_smooth(method=lm, formula = y ~ poly(x, 2), size=1) +
  theme_minimal() +
  theme(text=element_text(family="Lato", size=14)) +
  ggtitle("Dunkin' Donuts versus town population in New England") +
  labs(y="Population", x="Dunkin' Donuts")
p


Let’s apply this model to predict how many Dunkin’ Donuts a town should have.

# Determining which polynomial degree fits best
dcm_lm1 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 1))
summary(dcm_lm1)
## 
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 
##     1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6154 -0.7133 -0.1622  0.5110  7.6112 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.37781    0.06184   54.62   <2e-16 ***
## poly(dcnotboston$Population, 1) 82.53208    1.54224   53.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.542 on 620 degrees of freedom
## Multiple R-squared:  0.822,  Adjusted R-squared:  0.8217 
## F-statistic:  2864 on 1 and 620 DF,  p-value: < 2.2e-16
dcm_lm2 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 2))
summary(dcm_lm2)
## 
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 
##     2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6397 -0.6953 -0.0753  0.5055  7.4212 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       3.37781    0.06141  55.002  < 2e-16 ***
## poly(dcnotboston$Population, 2)1 82.53208    1.53162  53.885  < 2e-16 ***
## poly(dcnotboston$Population, 2)2 -4.75205    1.53162  -3.103  0.00201 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.532 on 619 degrees of freedom
## Multiple R-squared:  0.8248, Adjusted R-squared:  0.8242 
## F-statistic:  1457 on 2 and 619 DF,  p-value: < 2.2e-16
dcm_lm3 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 3))
summary(dcm_lm3)
## 
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 
##     3))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7070 -0.6837 -0.0981  0.5209  7.3161 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       3.37781    0.06143  54.986  < 2e-16 ***
## poly(dcnotboston$Population, 3)1 82.53208    1.53208  53.869  < 2e-16 ***
## poly(dcnotboston$Population, 3)2 -4.75205    1.53208  -3.102  0.00201 ** 
## poly(dcnotboston$Population, 3)3 -1.21104    1.53208  -0.790  0.42957    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.532 on 618 degrees of freedom
## Multiple R-squared:  0.8249, Adjusted R-squared:  0.8241 
## F-statistic: 970.7 on 3 and 618 DF,  p-value: < 2.2e-16
# The cubic term isn't significant (p = 0.43), so degree 2 is the best fit
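
A more formal check is a nested-model F-test across the three fits; the quadratic term should earn its keep while the cubic term does not:

# Compare degree 1 vs. 2 vs. 3 with nested-model F-tests
anova(dcm_lm1, dcm_lm2, dcm_lm3)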

# Fitted values and confidence intervals for each town. The model formula
# references dcnotboston$... columns directly, so predict() works off the
# original data; a newdata frame passed here would be silently ignored.
ddpredict <- predict(dcm_lm2, interval="confidence")

predicted <- cbind(dcnotboston, ddpredict)
predicted <- predicted %>% filter(State=="CT")
# Compare the rounded upper bound of the interval with the actual count:
# a positive gap suggests the town could support more stores
predicted$rounded <- round(predicted$upr, digits=0)
predicted$prediction <- predicted$rounded - predicted$Dunkin.Donuts
predicted <- predicted[order(predicted$prediction),]

total <- predicted[c("Towns", "Dunkin.Donuts", "Population", "fit", "lwr", "upr", "rounded", "prediction")]

total$fit <- round(total$fit, digits=2)
total$lwr <- round(total$lwr, digits=2)
total$upr <- round(total$upr, digits=2)

datatable(total)
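
To predict for a town that isn’t already in the data, refit the model with a data argument so predict() will accept a newdata frame; the 50,000-resident town here is just an illustrative figure:

# Refit the quadratic model with data= so predict() honors newdata
dcm_fit <- lm(Dunkin.Donuts ~ poly(Population, 2), data = dcnotboston)
# Expected store count, with confidence interval, for a hypothetical
# town of 50,000 residents
predict(dcm_fit, newdata = data.frame(Population = 50000), interval = "confidence")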