Lab 02: Data Wrangling

Question 1

library(tidyverse)
library(zoo)

url = ('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
covid = read_csv(url)

library(readxl)

pop <- read_excel("../data/PopulationEstimates.xls",
                                  skip = 2)

covid_filter = covid %>%
    filter(state %in% state.name) %>%
    filter(date == max(date))

pop_filter = pop %>%
    select(fips = "FIPStxt", state = "State", "Area_Name", pop2019 = "POP_ESTIMATE_2019") %>%
    group_by(Area_Name) %>%
    slice_max(pop2019, n = 1) %>%
    ungroup()

info = covid


cumu = covid %>%
    filter(state == "California", date == max(date)) %>%
    group_by(county) %>%
    summarise(cases = sum(cases)) %>%
    ungroup() %>%
    mutate(newCases = cases - lag(cases)) %>%
    slice_max(newCases, n = 5) %>%
    select(county, cases)

knitr::kable(cumu, caption = "Most Cumulative Cases in CA Counties",
             col.names = c("County", "Cases"))

Most Cumulative Cases in CA Counties
County	Cases
Los Angeles	253985
Riverside	55073
Orange	52121
San Bernardino	50699
Kern	30735

library(tidyverse)
library(zoo)

new_most_cases = info %>%
    filter(state == "California", date == max(date)) %>%
    group_by(county) %>%
    summarise(cases = sum(cases)) %>%
    ungroup() %>%
    mutate(newCases = cases - lag(cases)) %>%
    slice_max(newCases, n = 5) %>%
    select(county, newCases)

knitr::kable(new_most_cases, caption = "Most New COVID-19 Cases from CA Counties",
             col.names = c("County", "New Cases"))

Most New COVID-19 Cases from CA Counties
County	New Cases
Los Angeles	253258
Riverside	55031
Orange	51639
San Bernardino	49461
Kern	30558

library(tidyverse)
library(zoo)

covid_pop = covid_filter %>%
    right_join(pop_filter, by = c('county' = 'Area_Name')) %>%
    select(state.y, county, pop2019, cases) %>%
    mutate(cases_per_capita = cases/pop2019) %>%
    slice_max(cases, n = 5) %>%
    select (county, cases_per_capita, cases)

knitr::kable(covid_pop, caption = "Counties with the Most Cumulative Cases Per Capita",
             col.names = c("County", "Cases", "Cases Per Capita"))

Counties with the Most Cumulative Cases Per Capita
County	Cases	Cases Per Capita
Baltimore city	0.0257325	15272
Oklahoma	0.0038358	15178
Utah	0.0040924	13120
Delaware	0.0111495	10857
Washington	0.0010784	8212

library(tidyverse)
library(zoo)

most_cpc = covid_filter %>%
    right_join(pop_filter, by = c('county' = 'Area_Name')) %>%
    select(state.y, county, pop2019, cases) %>%
    mutate(newCases = cases - lag(cases)) %>%
    mutate(new_cpc = newCases/pop2019) %>%
    slice_max(newCases, n =5) %>%
    select(county, newCases, new_cpc)

knitr::kable(most_cpc, caption = "Counties with the Most New COVID-19 Cases Per Capita",
             col.names = c("County", "New Cases", "Cases Per Capita"))

Counties with the Most New COVID-19 Cases Per Capita
County	New Cases	Cases Per Capita
Baltimore city	15257	0.0257073
Oklahoma	14545	0.0036758
Utah	12526	0.0039071
Washington	8035	0.0010552
Delaware	6750	0.0069319

library(tidyverse)
library(zoo)

two_weeks = covid %>%
    right_join(pop_filter, by = "fips") %>%
    select(state.y, county, pop2019, cases, date) %>%
    filter(date > max(date, na.rm = TRUE) - 14) %>%
    group_by(county) %>%
    mutate(newCases = cases - lag(cases)) %>%
    summarize(tot14 = sum(newCases, na.rm =TRUE) / (pop2019[1]/100000)) %>%
    slice_max(tot14, n = 5)

knitr::kable(two_weeks, caption = "Total Number of New Case per 100,000 people
             in the Last 14 Days",
             col.names = c("County", "Total Number of New Cases"))

Total Number of New Case per 100,000 people in the Last 14 Days
County	Total Number of New Cases
Cameron	312361.97
Franklin	117431.93
Union	75339.24
Richland	64362.39
Jackson	38084.35

Based on my findings, I found that Los Angeles had the most cumulative cases in CA counties with 223,031 cases while, also, being first for having the most new COVID-19 cases in CA counties with 223,328 cases daily. Finally, I have concluded the following counties to be safe based on their low Total Cases Per Capita: Kalawao, Loving, Kenedy, Arthur, and Petroleum.

Question 2

library(tidyverse)
library(zoo)

specific_sates = covid %>%
    filter(state %in% c('California', 'New York', 'Louisiana', 'Florida')) %>%
    group_by(date, state) %>%
    summarise(cases = sum(cases)) %>%
    group_by(state) %>%
    mutate(newCases = cases - lag(cases),
           roll7 = rollmean(newCases, 7, fill = NA, align="right"))

specific_sates %>%
    ggplot(aes(x = date, y = newCases, col = state)) +
    geom_col(aes(y = newCases), col = NA, fill = "green") +
    geom_line(aes(y = roll7), color = "darkred", size = 1) +
    facet_wrap(~state, scale = "free_y") +
    ggthemes::theme_gdocs() +
    theme(legend.position = "none") +
    labs(title = "Daily New COVID-19 Cases at the State Level",
         x = "Date",
         y = "New Cases",
         caption = "2020 Summer Session B for GEOG 176A",
         subtitle = "COVID-19 Data: NY-Times")

library(tidyverse)
library(zoo)

nc_tc = covid %>%
    right_join(pop_filter, by = "fips") %>%
    filter(state.x %in% c('California', 'New York', 'Louisiana', 'Florida')) %>% 
    select(date, state.x, cases, pop2019) %>%
    group_by(date,state.x, pop2019) %>% 
    summarise(cases = sum(cases)) %>%
    group_by(state.x) %>%
    mutate(newCases = cases - lag(cases)) %>%
    mutate(tot14 = sum(newCases, na.rm =TRUE) / pop2019) %>%
    mutate(roll7 = rollmean(tot14, 7, fill = NA, align="right")) 
    
              
nc_tc %>%
    ggplot(aes(x = date , y = tot14)) +
    geom_col(aes(y = tot14), fill = "yellow", col = NA) +
    geom_line(aes(y = roll7), col = "blue", size = .5) +
    facet_wrap(~state.x, scale = "free_y") +
    ggthemes::theme_gdocs() +
    theme(legend.position = "none") +
    labs(title = "COVID-19 Cases Per Capita at the State Level",
         x = "Date",
         y = "Cases Per Capita",
         caption = "2020 Summer Session B for GEOG 176A",
         subtitle = "COVID-19 Data: NY-Times")

Implementing the population as scale has a deep impact on the results because not every State has the same number of population; some have more, some have less. This makes a few States to standout. For example, Louisiana stands out because it is skewed to the left and and skewed to the right, as well, with a gap in the middle. This was due to the population being much less than the others, putting them side by side with a free scale.

Lab 02: Data Wrangling

GEOG 176A Summer Session B 2020

Jovany Cota

14 September, 2020

Question 1

Question 2