library(stringr)
library(dplyr)
library(lubridate)
library(tidyr)
library(ggplot2)
library(DT)

This is the analysis to go with the following story:

The data is from the U.S. Environmental Protection Agency’s Safe Drinking Water Information System (SDWIS) as of April 2016.

violations <- read.csv("data/violation_report.csv", stringsAsFactors=FALSE)
cities <- read.csv("data/cities.csv", stringsAsFactors=FALSE)
actions <- read.csv("data/actions_count.csv", stringsAsFactors=FALSE)
all <- actions
all$PWS.ID <- all$water.system.id
all <- all[!duplicated(all$PWS.ID),]

violations <- left_join(violations, all, by="PWS.ID")
violations <- left_join(violations, cities, by="city.s.served")

violations_list <- violations %>%
  select(PWS.Name, Contaminant.Name, Rule.Name, 
         Violation.Type, Is.Health.Based, 
         PWS.Type, Population.Served.Count,
          Compliance.Period.Begin.Date,
         Compliance.Period.End.Date, Compliance.Status,
         RTC.Date, Is.Major.Violation, address,
         link, towns.served)

Violations over time by compliance status

v_count <- violations_list %>%
  group_by(Contaminant.Name, Is.Major.Violation) %>%
  summarise(Count=n())

compliance_count <- violations_list %>%
  group_by(Compliance.Status) %>%
  summarise(Count=n())

violations_list$date <- dmy(violations_list$RTC.Date)
violations_list$year <- year(violations_list$date)

v_year <- violations_list %>%
  group_by(year) %>%
  summarise(Returned.to.compliance=n())
colnames(v_year) <- c("year", "Returned.to.compliance")


violations_list$begin.date <- dmy(violations_list$Compliance.Period.Begin.Date)
violations_list$begin.year <- year(violations_list$begin.date)

begin_year <- violations_list %>%
  group_by(begin.year) %>%
  summarise(Compliance.began=n())
colnames(begin_year) <- c("year", "Compliance.begin")

violations_list$end.date <- dmy(violations_list$Compliance.Period.End.Date)
violations_list$end.year <- year(violations_list$end.date)

end_year <- violations_list %>%
  group_by(end.year) %>%
  summarise(Compliance.end=n())

colnames(end_year) <- c("year", "Compliance.end")

violations_by_year <- left_join(begin_year, end_year)
violations_by_year <- left_join(violations_by_year, v_year)

violations_by_year <- gather(violations_by_year, "type", "violations", 2:4)
violations_by_year$type <- as.factor(violations_by_year$type)

ggplot(violations_by_year, aes(x=year, y=violations, group=type, colour=type)) +
  geom_line() +
  geom_point()

Major violations over time

major_year <- violations_list %>%
  group_by(end.year, Is.Major.Violation) %>%
  summarise(major=n())

ggplot(major_year, aes(x=end.year, y=major, group=Is.Major.Violation, colour=Is.Major.Violation)) +
  geom_line() +
  geom_point()

all_links_only <- all[c("name", "link")]

most_system <- violations_list %>%
  group_by(PWS.Name) %>%
  summarise(violations=n()) %>%
  arrange(-violations)

# most_system <- most_system[1:10,]

most_system_y <- violations_list %>%
  filter(Is.Health.Based=="Y") %>%
  group_by(PWS.Name) %>%
  summarise(health.based.violations=n()) %>%
  arrange(-health.based.violations)


most_system_n <- violations_list %>%
  filter(Is.Health.Based=="N") %>%
  group_by(PWS.Name) %>%
  summarise(health.based.violations_n=n()) %>%
  arrange(-health.based.violations_n)

systems_most <- left_join(most_system, most_system_y)
## Joining by: "PWS.Name"
systems_most <- left_join(systems_most, most_system_n)
## Joining by: "PWS.Name"
compliance_system_r <- violations_list %>%
  filter(Compliance.Status=="Returned to Compliance") %>%
  group_by(PWS.Name) %>%
  summarise(complied=n()) %>%
  arrange(-complied)

compliance_system_k <- violations_list %>%
  filter(Compliance.Status=="Known") %>%
  group_by(PWS.Name) %>%
  summarise(known=n()) %>%
  arrange(-known)

systems_most <- left_join(systems_most, compliance_system_r)
## Joining by: "PWS.Name"
systems_most <- left_join(systems_most, compliance_system_k)
## Joining by: "PWS.Name"
systems_most$percent.health.based <- round(systems_most$health.based.violations/systems_most$violations*100,2)
systems_most$percent.complied <- round(systems_most$complied/systems_most$violations*100,2)

systems_most <- systems_most[c("PWS.Name", "violations", "percent.health.based", "percent.complied")]
colnames(systems_most) <- c("name", "violations", "percent.health.based", "percent.complied")

systems_most <- left_join(systems_most, all_links_only)
## Joining by: "name"
systems_most$system <- paste0("<a href='", systems_most$link, "' target='_blank'>", systems_most$name, "</a>")
systems_most <- systems_most[c("name", "violations", "percent.health.based", "percent.complied")]
systems_most <- data.frame(systems_most)

Rate of compliance and health-based violations

datatable(systems_most)


10 most-frequent drinking water violation by rule

violations_rule <- violations %>%
  group_by(Rule.Name) %>%
  summarise(Count=n()) %>%
  arrange(-Count)
violations_rule <- violations_rule[1:10,]

violations_type <- violations %>%
  group_by(Violation.Type) %>%
  summarise(Count=n()) %>%
  arrange(-Count)
violations_type <- violations_type[1:10,]

violations_rule <- data.frame(violations_rule)
c <- ggplot(violations_rule, aes(x=Rule.Name, y=Count))
 c + geom_bar(stat="identity") + coord_flip()


10 most-frequent drinking water violation by type

violations_type <- data.frame(violations_type)
c <- ggplot(violations_type, aes(x=Violation.Type, y=Count))
 c + geom_bar(stat="identity") + coord_flip()