library(stringr)
library(dplyr)
library(lubridate)
library(tidyr)
library(ggplot2)
library(DT)
This is the analysis to go with the following story:
The data is from the U.S. Environmental Protection Agency’s Safe Drinking Water Information System (SDWIS) as of April 2016.
violations <- read.csv("data/violation_report.csv", stringsAsFactors=FALSE)
cities <- read.csv("data/cities.csv", stringsAsFactors=FALSE)
actions <- read.csv("data/actions_count.csv", stringsAsFactors=FALSE)
all <- actions
all$PWS.ID <- all$water.system.id
all <- all[!duplicated(all$PWS.ID),]
violations <- left_join(violations, all, by="PWS.ID")
violations <- left_join(violations, cities, by="city.s.served")
violations_list <- violations %>%
select(PWS.Name, Contaminant.Name, Rule.Name,
Violation.Type, Is.Health.Based,
PWS.Type, Population.Served.Count,
Compliance.Period.Begin.Date,
Compliance.Period.End.Date, Compliance.Status,
RTC.Date, Is.Major.Violation, address,
link, towns.served)
v_count <- violations_list %>%
group_by(Contaminant.Name, Is.Major.Violation) %>%
summarise(Count=n())
compliance_count <- violations_list %>%
group_by(Compliance.Status) %>%
summarise(Count=n())
violations_list$date <- dmy(violations_list$RTC.Date)
violations_list$year <- year(violations_list$date)
v_year <- violations_list %>%
group_by(year) %>%
summarise(Returned.to.compliance=n())
colnames(v_year) <- c("year", "Returned.to.compliance")
violations_list$begin.date <- dmy(violations_list$Compliance.Period.Begin.Date)
violations_list$begin.year <- year(violations_list$begin.date)
begin_year <- violations_list %>%
group_by(begin.year) %>%
summarise(Compliance.began=n())
colnames(begin_year) <- c("year", "Compliance.begin")
violations_list$end.date <- dmy(violations_list$Compliance.Period.End.Date)
violations_list$end.year <- year(violations_list$end.date)
end_year <- violations_list %>%
group_by(end.year) %>%
summarise(Compliance.end=n())
colnames(end_year) <- c("year", "Compliance.end")
violations_by_year <- left_join(begin_year, end_year)
violations_by_year <- left_join(violations_by_year, v_year)
violations_by_year <- gather(violations_by_year, "type", "violations", 2:4)
violations_by_year$type <- as.factor(violations_by_year$type)
ggplot(violations_by_year, aes(x=year, y=violations, group=type, colour=type)) +
geom_line() +
geom_point()
major_year <- violations_list %>%
group_by(end.year, Is.Major.Violation) %>%
summarise(major=n())
ggplot(major_year, aes(x=end.year, y=major, group=Is.Major.Violation, colour=Is.Major.Violation)) +
geom_line() +
geom_point()
all_links_only <- all[c("name", "link")]
most_system <- violations_list %>%
group_by(PWS.Name) %>%
summarise(violations=n()) %>%
arrange(-violations)
# most_system <- most_system[1:10,]
most_system_y <- violations_list %>%
filter(Is.Health.Based=="Y") %>%
group_by(PWS.Name) %>%
summarise(health.based.violations=n()) %>%
arrange(-health.based.violations)
most_system_n <- violations_list %>%
filter(Is.Health.Based=="N") %>%
group_by(PWS.Name) %>%
summarise(health.based.violations_n=n()) %>%
arrange(-health.based.violations_n)
systems_most <- left_join(most_system, most_system_y)
## Joining by: "PWS.Name"
systems_most <- left_join(systems_most, most_system_n)
## Joining by: "PWS.Name"
compliance_system_r <- violations_list %>%
filter(Compliance.Status=="Returned to Compliance") %>%
group_by(PWS.Name) %>%
summarise(complied=n()) %>%
arrange(-complied)
compliance_system_k <- violations_list %>%
filter(Compliance.Status=="Known") %>%
group_by(PWS.Name) %>%
summarise(known=n()) %>%
arrange(-known)
systems_most <- left_join(systems_most, compliance_system_r)
## Joining by: "PWS.Name"
systems_most <- left_join(systems_most, compliance_system_k)
## Joining by: "PWS.Name"
systems_most$percent.health.based <- round(systems_most$health.based.violations/systems_most$violations*100,2)
systems_most$percent.complied <- round(systems_most$complied/systems_most$violations*100,2)
systems_most <- systems_most[c("PWS.Name", "violations", "percent.health.based", "percent.complied")]
colnames(systems_most) <- c("name", "violations", "percent.health.based", "percent.complied")
systems_most <- left_join(systems_most, all_links_only)
## Joining by: "name"
systems_most$system <- paste0("<a href='", systems_most$link, "' target='_blank'>", systems_most$name, "</a>")
systems_most <- systems_most[c("name", "violations", "percent.health.based", "percent.complied")]
systems_most <- data.frame(systems_most)
datatable(systems_most)
violations_rule <- violations %>%
group_by(Rule.Name) %>%
summarise(Count=n()) %>%
arrange(-Count)
violations_rule <- violations_rule[1:10,]
violations_type <- violations %>%
group_by(Violation.Type) %>%
summarise(Count=n()) %>%
arrange(-Count)
violations_type <- violations_type[1:10,]
violations_rule <- data.frame(violations_rule)
c <- ggplot(violations_rule, aes(x=Rule.Name, y=Count))
c + geom_bar(stat="identity") + coord_flip()
violations_type <- data.frame(violations_type)
c <- ggplot(violations_type, aes(x=Violation.Type, y=Count))
c + geom_bar(stat="identity") + coord_flip()