This analysis accompanies the TrendCT.org story: Dunkin’ Donuts owns name rights to Hartford ball park, already owns New England
Here’s the raw data we pulled and cleaned from DunkinDonuts.com.
library(leaflet)
dunk <- read.csv("dunkindonuts.csv", stringsAsFactors=FALSE)
leaflet(dunk) %>% addTiles('http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png', attribution='Map tiles by <a href="http://stamen.com">Stamen Design</a>, <a href="http://creativecommons.org/licenses/by/3.0">CC BY 3.0</a> — Map data © <a href="http://www.openstreetmap.org/copyright">OpenStreetMap</a>') %>%
setView(-98.557087, 39.810502, zoom = 4) %>%
addCircles(~lng, ~lat, weight = 1, radius=3,
color="#ffa500", stroke = FALSE, fillOpacity = 0.8)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ctnamecleaner)
library(stringr)
ct <- dunk %>% filter(state=="CT") %>% count(city)
ctnamecleaner(city, ct)
## Joining by: "name2"
## [1] "...All names matched. That's a rare thing."
## Congrats. The new dataframe is called CTDATA.
## Don't forget to collapse the duplicate rows and sum/average the numeric columns.
ct <- data.frame(tapply(CTDATA$n, CTDATA$real.town.name, sum))
ct$town <- rownames(ct)
colnames(ct) <- c("Dunkins", "id")
ctpopulator(id, ct)
## [1] "Checking to see if names match..."
## Joining by: "name2"
## [1] "Congrats. The new dataframe is called CTPOPULATED."
ct <- CTPOPULATED
ct$percapita <- (ct$Dunkins/ct$pop2013)*10000
ct$percapita <- round(ct$percapita, digits=2)
ct$id <- str_to_title(ct$id)
require(gtools)
## Loading required package: gtools
require(ggplot2)
## Loading required package: ggplot2
require(rgdal)
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 0.9-3, (SVN revision 530)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
## Linking to sp version: 1.1-0
require(scales)
## Loading required package: scales
require(ggmap)
## Loading required package: ggmap
require(Cairo)
## Loading required package: Cairo
require(gpclib)
## Loading required package: gpclib
## General Polygon Clipper Library for R (version 1.5-5)
## Type 'class ? gpc.poly' for help
require(maptools)
## Loading required package: maptools
## Checking rgeos availability: TRUE
require(reshape)
## Loading required package: reshape
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:dplyr':
##
## rename
gpclibPermit()
## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
towntracts <- readOGR(dsn="townsmap", layer="towns")
## OGR data source with driver: ESRI Shapefile
## Source: "townsmap", layer: "towns"
## with 169 features
## It has 20 fields
towntracts <- fortify(towntracts, region="NAME10")
Dunk_Data <- left_join(towntracts, ct)
## Joining by: "id"
Dunk_Data$Dunkins <- as.numeric(Dunk_Data$Dunkins)
Dunk_Data$percapita <- as.numeric(Dunk_Data$percapita)
dd <- ggplot() +
geom_polygon(data = Dunk_Data, aes(x=long, y=lat, group=group,
fill=percapita), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", palette = "Reds", breaks=pretty_breaks(n=5)) +
theme_nothing(legend=TRUE) +
labs(title="Dunkin' Donuts stores per 10,000 residents", fill="")
dd
dd_all <- ggplot() +
geom_polygon(data = Dunk_Data, aes(x=long, y=lat, group=group,
fill=Dunkins), color = "black", size=0.2) +
coord_map() +
scale_fill_distiller(type="seq", palette = "Reds", breaks=pretty_breaks(n=5)) +
theme_nothing(legend=TRUE) +
labs(title="Total Dunkin' Donuts by town", fill="")
dd_all
library(DT)
ct <- ct[c("id", "pop2013", "Dunkins", "percapita")]
colnames(ct) <- c("Town", "Population", "Dunkins", "Per Capita")
datatable(ct)
I had to do this part of the analysis outside of R. I used QGIS and did a spatial join to count the number of stores within town boundaries. I joined that with population counts and brought the spreadsheet into R below.
dc <- read.csv("DunkinCount.csv", stringsAsFactors=FALSE)
dc_table <- dc[c("Towns", "State", "Population", "Dunkin.Donuts","Per.1.000.residents")]
colnames(dc_table) <- c("Town", "State","Population","Dunkins", "Per Capita")
datatable(dc_table)
ddne <- ggplot(data=dc, aes(x=Dunkin.Donuts, y=Population)) +
geom_point(aes(text=paste(Towns, State, sep=", ")))+
geom_smooth(method=lm, formula = y ~ poly(x, 2), size=1) +
theme_minimal() +
ggtitle("Dunkin' Donuts versus town population in New England") +
labs(y="Population", x="Dunkin' Donuts")
ddne
That outlier in the top right is Boston.
Will have to readjust by taking out the outlier and trying a polynomial model.
dcnotboston <- filter(dc, Towns!="Boston")
p <- ggplot(data=dcnotboston, aes(x=Dunkin.Donuts, y=Population)) +
geom_point(aes(text=paste(Towns, State, sep=", ")))+
geom_smooth(method=lm, formula = y ~ poly(x, 2), size=1) +
theme_minimal() +
theme(text=element_text(family="Lato", size=14)) +
ggtitle("Dunkin' Donuts versus town population in New England") +
labs(y="Population", x="Dunkin' Donuts")
p
#Determining which factor of polynomial is most ideal
dcm_lm1 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 1))
summary(dcm_lm1)
##
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population,
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6154 -0.7133 -0.1622 0.5110 7.6112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.37781 0.06184 54.62 <2e-16 ***
## poly(dcnotboston$Population, 1) 82.53208 1.54224 53.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.542 on 620 degrees of freedom
## Multiple R-squared: 0.822, Adjusted R-squared: 0.8217
## F-statistic: 2864 on 1 and 620 DF, p-value: < 2.2e-16
dcm_lm2 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 2))
summary(dcm_lm2)
##
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population,
## 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6397 -0.6953 -0.0753 0.5055 7.4212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.37781 0.06141 55.002 < 2e-16 ***
## poly(dcnotboston$Population, 2)1 82.53208 1.53162 53.885 < 2e-16 ***
## poly(dcnotboston$Population, 2)2 -4.75205 1.53162 -3.103 0.00201 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.532 on 619 degrees of freedom
## Multiple R-squared: 0.8248, Adjusted R-squared: 0.8242
## F-statistic: 1457 on 2 and 619 DF, p-value: < 2.2e-16
dcm_lm3 <- lm(dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population, 3))
summary(dcm_lm3)
##
## Call:
## lm(formula = dcnotboston$Dunkin.Donuts ~ poly(dcnotboston$Population,
## 3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7070 -0.6837 -0.0981 0.5209 7.3161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.37781 0.06143 54.986 < 2e-16 ***
## poly(dcnotboston$Population, 3)1 82.53208 1.53208 53.869 < 2e-16 ***
## poly(dcnotboston$Population, 3)2 -4.75205 1.53208 -3.102 0.00201 **
## poly(dcnotboston$Population, 3)3 -1.21104 1.53208 -0.790 0.42957
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.532 on 618 degrees of freedom
## Multiple R-squared: 0.8249, Adjusted R-squared: 0.8241
## F-statistic: 970.7 on 3 and 618 DF, p-value: < 2.2e-16
#OK, looks like the ideal is 2
new <- data.frame(Population = dcnotboston$Population)
ddpredict <- predict(dcm_lm2, newdata = new, interval="confidence")
predicted <- cbind(dcnotboston, ddpredict)
predicted <- predicted %>% filter(State=="CT")
predicted$rounded <- round(predicted$upr, digits=0)
predicted$prediction <-predicted$rounded - predicted$Dunkin.Donuts
predicted <- predicted[order(predicted$prediction),]
total <- predicted[c("Towns", "Dunkin.Donuts", "Population", "fit", "lwr", "upr", "rounded", "prediction")]
total$fit <- round(total$fit, digits=2)
total$lwr <- round(total$lwr, digits=2)
total$upr <- round(total$upr, digits=2)
datatable(total)