library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")
# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'
# SET your path here to the github covid folder
github_path <- ''
Santa Clara County and San Francisco zip code cases
scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
sf_place_cases <-
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
filter(county == 'San Francisco')
Get social distancing data
scc_blockgroups <-
block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
sf_blockgroups <-
block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
# zip code areas
zctas_94 <-
zctas(cb = F, starts_with = "94") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_95 <-
zctas(cb = F, starts_with = "95") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_94_95 <- rbind(zctas_94, zctas_95)
# join with block groups
scc_bg_zctas <- scc_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
sf_bg_zctas <- sf_blockgroups %>%
st_centroid() %>%
st_join(zctas_94) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>%
mutate(date = date_range_start %>% substr(1,10) %>% as.Date())
# obtaining weekends
weekends <- bay_sd %>%
filter(!duplicated(date)) %>%
arrange(date) %>%
mutate(weekend = ifelse((date %>% as.numeric()) %% 7 %in% c(2,3), T, F)) %>%
dplyr::select(date,weekend)
bay_sd <- bay_sd %>% left_join(weekends)
First just plot the cases over time by zip code
# obtain the saved census data
acs_vars = readRDS("/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/censusData2018_acs_acs5.rds")
# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
regionString <- paste0("state:06+county:", county)
censusData <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = regionString,
vars = variableToPull
) %>%
mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
return(censusData)
}
# get population data for San Francisco
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
rename(zipcode = place)
sf_cases_zip %>% ggplot(
aes(x = date)) +
geom_line(aes(y = cases_by_pop, color=zipcode))
Something weird is going on with case counts on May 7th and 11th, so I remove those dates manually. Also adjust the day in zip code 94117 that is off.
sf_cases_zip_edit <- sf_cases_zip %>% filter(date != "2020-05-07" & date != "2020-05-11") %>% mutate(confirmed_cases = ifelse(zipcode == "94117" & confirmed_cases == 1, confirmed_cases + 37, confirmed_cases), cases_by_pop = confirmed_cases / total_pop_zip)
sf_cases_zip_edit %>% ggplot(
aes(x = date)) +
geom_line(aes(y = cases_by_pop, color=zipcode))
sf_cases_zip_edit %>% ggplot(
aes(x = date)) +
geom_line(aes(y = confirmed_cases, color=zipcode))
Get social distancing data by zip code for the first few days before shelter-in-place
# relevant dates
pre_sd_date_cutoff <- as.Date("2020-03-01")
shelter_date <- as.Date("2020-03-17") # note that this is the day the order went into effect
# get daily social distancing data for SF by date
sf_sd_zip_by_date <- bay_sd %>%
filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
ungroup()
# get block group averages leaving home before shelter in place/COVID-19
sf_pre_sd_leaving_avg <- bay_sd %>%
filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
filter(date <= pre_sd_date_cutoff) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
mutate(
percent_at_home_pre = total_at_home_zip*100/total_devices,
percent_leaving_home_pre = (100 - percent_at_home_pre)
)
# get average percent leaving the home for the 3 days before shelter in place
sf_sd_zip_3_days_pre <- sf_sd_zip_by_date %>%
filter(date < shelter_date & date >= (shelter_date - 3)) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
sf_sd_zip_3_days_pre_buckets <- sf_sd_zip_3_days_pre %>%
mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 65, "65% or below", ifelse(percent_leaving_home < 70, "65% to 70%", "70% or more")), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -5, "-5% or above", ifelse(dif_percent_leaving > -10, "-5% to -10%", ifelse(dif_percent_leaving > -15, "-10% to -15%", "-15% or below"))))
# join with the case data
sf_cases_sd_3dayspre_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
group_by(date, bucketed_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_3dayspre_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) +
geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) +
ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")
Pretty much the opposite of what we expect. But note that the differences between the groups do not seem to be growing much in the time period we see here - maybe there were pre-existing differences between these groups in terms of their baseline case counts?
Now look at the difference in percent leaving relative to pre SIP.
# try with differences in percent leaving
sf_cases_sd_3dayspre_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
group_by(date, bucketed_dif_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_3dayspre_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")
This actually looks more like what we would expect, except for the -5% or greater bucket, which do note is only composed of 3 zip codes.
Try now with the 3 days after SIP.
# get average percent leaving the home for the 3 days afterter in place
sf_sd_zip_3_days_post <- sf_sd_zip_by_date %>%
filter(date >= shelter_date & date < (shelter_date + 3)) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()
sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()
sf_sd_zip_3_days_post_buckets <- sf_sd_zip_3_days_post %>%
mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 55, "55% or below", ifelse(percent_leaving_home < 60, "55% to 60%", ifelse(percent_leaving_home < 65, "60% to 65%","65% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -10, "-10% or above", ifelse(dif_percent_leaving > -15, "-10% to -15%", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below")))))
# join with the case data
sf_cases_sd_3dayspost_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
group_by(date, bucketed_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_3dayspost_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")
Again pretty much the opposite of expectations.
# try with differences in percent leaving
sf_cases_sd_3dayspost_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
group_by(date, bucketed_dif_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_3dayspost_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")
Slightly more as expected, but still weird.
Try the whole time from SIP through April 20th, the first day that we have case data
# get average percent leaving the home for the 3 days after shelter in place
sf_sd_zip_till420 <- sf_sd_zip_by_date %>%
filter(date >= shelter_date & date < ("2020-04-20")) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()
sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()
sf_sd_zip_till420_buckets <- sf_sd_zip_3_days_post %>%
mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 50, "50% or below", ifelse(percent_leaving_home < 55, "50% to 55%", ifelse(percent_leaving_home < 60, "55% to 60%", "60% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -15, "-15% or above", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below"))))
# join with the case data
sf_cases_sd_till420_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
group_by(date, bucketed_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")
Same surprising trend, but again the differences between the top two groups don’t seem to grow a ton (though they do relative to the bottom one).
Let’s look at this in scatter plot form. In the plot below each point represents a date, and again we’re looking at bucketed zip codes by their movement.
# scatter plot
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Cases by population through current date, grouped by movement, SF')
Again look at differences in percent leaving.
# try with differences in percent leaving
sf_cases_sd_till420_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
group_by(date, bucketed_dif_percent_leaving) %>%
summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
# plot
sf_cases_sd_till420_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")
Except for the -15% or above, more as we’d expect.
Looking back at the original case growth over time by zip codes, it looks like there are generally two groups–one set of zip codes that have >0.003 cases per capita, and one set that have <0.002 cases per capita by May 18th. The first set have rapid growths, while those in the second set generally have less if any growth. Let’s try breaking it up by cases.
sf_cases_zip_bucketed <- sf_cases_zip_edit %>%
filter(date == max(date)) %>%
mutate(bucketed_case_level = ifelse(cases_by_pop <=0.002, "< 0.002", "> 0.002"))
sf_sd_zip_cases_bucket <- left_join(sf_cases_zip_bucketed %>% dplyr::select(-date), sf_sd_zip_by_date)
sf_sd_zip_cases_bucket_movement <- sf_sd_zip_cases_bucket %>% filter(!is.na(bucketed_case_level)) %>%
group_by(bucketed_case_level, date) %>%
summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home))
sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>% ggplot(aes(x = date)) +
geom_line(aes(y = percent_leaving_home, color=bucketed_case_level)) + ggtitle('% leaving home over time grouped by cases per capita') + labs(y = "% leaving home", x = "Date", color = "cases per capita by May 18th")
Same figure but with plot_ly for more interactivity.
fig_sf_movt_case_bucket <- sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>%
plot_ly() %>%
add_lines(x = ~date, y = ~percent_leaving_home, color = ~bucketed_case_level) %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time grouped by cases per capita")
fig_sf_movt_case_bucket
Also pretty weird. Try looking just at after shelter in place, and the difference relative to before.
# try looking just after shelter in place
# get pre shelter in place averages
sf_sd_zip_cases_bucket_movement_pre_avg <- sf_sd_zip_cases_bucket_movement %>% filter(date < pre_sd_date_cutoff) %>% ungroup() %>%
group_by(bucketed_case_level) %>%
summarize(total_at_home_pre = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>%
mutate(percent_at_home= total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100-percent_at_home))
sf_sd_zip_cases_bucket_movement_post <- sf_sd_zip_cases_bucket_movement %>% filter(date >= shelter_date) %>%
left_join(sf_sd_zip_cases_bucket_movement_pre_avg %>% dplyr::select(bucketed_case_level, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>% ggplot(aes(x = date)) +
geom_line(aes(y = dif_percent_leaving, color=bucketed_case_level)) + ggtitle('% leaving home over time (relative to pre SIP) grouped by cases per capita') + labs(y = "difference in % leaving home", x = "Date", color = "cases per capita by May 18th")
fig_sf_movt_case_bucket_post <- sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>%
plot_ly() %>%
add_lines(x = ~date, y = ~dif_percent_leaving, color = ~bucketed_case_level) %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = 'difference in % leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time (relative to pre SIP) grouped by cases per capita")
fig_sf_movt_case_bucket_post
That does look more like what we would expect.
First just visualize what the case data looks like.
scc_cases_zip <- scc_map_cases %>%
dplyr::select(Zipcode, Cases, Population) %>%
st_drop_geometry() %>%
rename(zipcode = Zipcode, cases = Cases, pop = Population) %>%
mutate(cases = replace_na(cases, 0)) %>%
mutate(cases_by_pop = cases/pop) %>%
filter(is.numeric(cases_by_pop))
scc_cases_zip %>% ggplot(aes(x = zipcode, y = cases_by_pop)) + geom_point()
Visualize zip code level leaving home data for SCC.
# get leaving home data on zip code level
scc_sd_zip_by_date <- bay_sd %>%
filter(origin_census_block_group %in% scc_blockgroups$GEOID) %>%
left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
mutate(
percent_at_home = total_at_home_zip*100/total_devices,
percent_leaving_home = (100 - percent_at_home)
)
scc_sd_zip_by_date %>% ggplot(aes(x = date)) + geom_line(aes(y = percent_leaving_home, color = zipcode))
Try first looking at the overall % leaving home since SIP and cases as of pulled on May 20th.
# get zip code averages leaving home before SIP
scc_sd_zip_pre_avgs <- scc_sd_zip_by_date %>%
filter(date < pre_sd_date_cutoff) %>%
group_by(zipcode) %>%
summarize(total_at_home_pre = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home_pre = total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre))
scc_sd_zip_post <- scc_sd_zip_by_date %>%
filter(date >= shelter_date) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(scc_sd_zip_pre_avgs %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
left_join(scc_cases_zip)
# absolute percent leaving
scc_sd_zip_post %>% ggplot(
aes(x = percent_leaving_home, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)
scc_zip_cases_sd_cor_abs <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_zip_post , na.action = na.omit)
summary(scc_zip_cases_sd_cor_abs)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_zip_post,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0016790 -0.0005276 -0.0001715 0.0002346 0.0029655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.615e-04 8.709e-04 0.415 0.680
## percent_leaving_home 1.898e-05 1.725e-05 1.100 0.276
##
## Residual standard error: 0.0008858 on 53 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.02232, Adjusted R-squared: 0.003877
## F-statistic: 1.21 on 1 and 53 DF, p-value: 0.2763
# try with difference in percent leaving
scc_sd_zip_post %>% ggplot(
aes(x = dif_percent_leaving, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)
scc_zip_cases_sd_cor <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_zip_post , na.action = na.omit)
summary(scc_zip_cases_sd_cor)
##
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_zip_post,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.865e-03 -4.305e-04 -7.476e-05 2.154e-04 3.022e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.450e-03 4.086e-04 5.996 1.84e-07 ***
## dif_percent_leaving 4.302e-05 1.483e-05 2.900 0.00542 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008323 on 53 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.137, Adjusted R-squared: 0.1207
## F-statistic: 8.411 on 1 and 53 DF, p-value: 0.005417
Try bucketing by cases per capita.
# bucket case counts
scc_cases_bucketed <- scc_cases_zip %>%
mutate(bucketed_cases_by_pop = ifelse(cases_by_pop > 0.002, "0.002 or more", ifelse(cases_by_pop > 0.0015, "0.0015 to 0.002", ifelse(cases_by_pop > 0.001, "0.001 to 0.0015", ifelse(cases_by_pop > 0, "0.00001 to 0.001", "0")))))
scc_sd_zip_by_date_case_bucket <- left_join(scc_cases_bucketed, scc_sd_zip_by_date) %>%
filter(!is.na(bucketed_cases_by_pop) & !is.na(date)) %>%
group_by(date, bucketed_cases_by_pop) %>%
summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup()
fig_scc_movt_case_bucket <- plot_ly(scc_sd_zip_by_date_case_bucket) %>%
add_trace(x = ~date, y = ~percent_leaving_home, color = ~bucketed_cases_by_pop, type = 'scatter', mode = 'lines') %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita")), title = "% leaving home over time grouped by cases per capita, Santa Clara County")
fig_scc_movt_case_bucket
# try looking at this in scatter plot form
scc_sd_zip_by_date_case_bucket_pre <- scc_sd_zip_by_date_case_bucket %>%
filter(date < pre_sd_date_cutoff) %>%
group_by(bucketed_cases_by_pop) %>%
summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>%
mutate(percent_at_home_pre = total_at_home_bucket*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre)) %>% ungroup()
# join with pre data
scc_sd_zip_by_date_case_bucket_post <- scc_sd_zip_by_date_case_bucket %>%
filter(date >= shelter_date) %>%
group_by(bucketed_cases_by_pop) %>%
summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup() %>%
left_join(scc_sd_zip_by_date_case_bucket_pre %>% dplyr::select(bucketed_cases_by_pop, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
# plot - absolute metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = percent_leaving_home, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')
# plot - relative metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = dif_percent_leaving, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')
That generally looks as we’d expect it to–except for the zip codes with zero cases, but it seems like a fair hypothesis that something different might be going on in those counties anyway.
Now let’s look at bucketing movement into categories, like we did for SF, but here we can only look at the outcome of cases at a specific date not over time.
To start, just looking at bucketing overall movement since SIP.
ggplot(scc_sd_zip_post, aes(x = zipcode, y = percent_leaving_home)) + geom_point()
ggplot(scc_sd_zip_post, aes(x = zipcode, y = dif_percent_leaving)) + geom_point()
scc_sd_zip_post_bucketed <- scc_sd_zip_post %>%
mutate(bucketed_percent_leaving = ifelse(percent_leaving_home > 60, "60% or more", ifelse(percent_leaving_home > 55, "55% to 60%", ifelse(percent_leaving_home > 50, "50% to 55%", ifelse(percent_leaving_home > 45, "45% to 50%", ifelse(percent_leaving_home > 40, "40% to 45%", "40% or less"))))), bucketed_dif_leaving = ifelse(dif_percent_leaving > 0, "0% or above", ifelse(dif_percent_leaving > -20, "-0% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", ifelse(dif_percent_leaving > -30, "-25% to -30%", ifelse(dif_percent_leaving > -35, "-30% to -35%", ifelse(dif_percent_leaving > -40, "-35% to -40%", "-40% or lower"))))))) %>%
left_join(scc_cases_zip) %>%
filter(!is.na(cases))
scc_sd_cases_post_bucketed_abs <- scc_sd_zip_post_bucketed %>%
group_by(bucketed_percent_leaving) %>%
summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
mutate(cases_by_pop_grouped = total_cases/total_pop)
scc_sd_cases_post_bucketed_abs %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')
Wow! A trend that (generally) makes sense!
Look at the difference in percent leaving.
scc_sd_cases_post_bucketed_rel <- scc_sd_zip_post_bucketed %>%
group_by(bucketed_dif_leaving) %>%
summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
mutate(cases_by_pop_grouped = total_cases/total_pop)
scc_sd_cases_post_bucketed_rel %>% ggplot(aes(x = bucketed_dif_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')
Not quite as much as of the expected trend, but appears for -25 to -40 or below buckets.
Looking at the number of cases and movement through 1 week before the most recent date on the case data.
sf_case_date_max <- max(sf_cases_zip$date)
sf_sd_cases_curr <- sf_sd_zip_by_date %>%
filter(date >= shelter_date & date <= sf_case_date_max - 7) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
left_join(sf_cases_zip_edit %>% filter(date == sf_case_date_max)) %>%
filter(!is.na(confirmed_cases))
fig_sf_sd_cases_curr <- plot_ly(sf_sd_cases_curr) %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(sf_sd_cases_curr$cases_by_pop~ sf_sd_cases_curr$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases per capita and % leaving home through 1 week before case data")
model_sf_curr <- lm(cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr)
summary(model_sf_curr)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0019316 -0.0012394 -0.0007684 0.0011670 0.0031014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0063697 0.0060659 1.050 0.306
## percent_leaving_home -0.0000788 0.0001190 -0.662 0.515
##
## Residual standard error: 0.001741 on 21 degrees of freedom
## Multiple R-squared: 0.02045, Adjusted R-squared: -0.0262
## F-statistic: 0.4384 on 1 and 21 DF, p-value: 0.5151
fig_sf_sd_cases_curr
fig_sf_sd_cases_curr_dif <- plot_ly(sf_sd_cases_curr) %>%
add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~dif_percent_leaving, y = fitted(lm(sf_sd_cases_curr$cases_by_pop~ sf_sd_cases_curr$dif_percent_leaving )), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases per capita and difference in % leaving home through 1 week before case data")
fig_sf_sd_cases_curr_dif
model_sf_curr_dif <- lm(cases_by_pop ~ dif_percent_leaving, sf_sd_cases_curr)
summary(model_sf_curr_dif)
##
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = sf_sd_cases_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0019606 -0.0012696 -0.0008445 0.0013284 0.0031583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.599e-03 2.234e-03 2.058 0.0522 .
## dif_percent_leaving 9.479e-05 9.339e-05 1.015 0.3217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001718 on 21 degrees of freedom
## Multiple R-squared: 0.04676, Adjusted R-squared: 0.001368
## F-statistic: 1.03 on 1 and 21 DF, p-value: 0.3217
Note the need to manually include and update the date here.
scc_case_date_max <- as.Date("2020-05-26") # need to manually include and update this
scc_sd_cases_curr <- scc_sd_zip_by_date %>%
filter(date >= shelter_date & date <= scc_case_date_max - 7) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(scc_sd_zip_pre_avgs %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre) %>%
left_join(scc_cases_zip) %>%
filter(!is.na(cases))
fig_scc_sd_cases_curr <- plot_ly(scc_sd_cases_curr) %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(scc_sd_cases_curr$cases_by_pop~ scc_sd_cases_curr$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and % leaving home through 1 week before case data")
fig_scc_sd_cases_curr
model_scc_curr <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_cases_curr)
summary(model_scc_curr)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_cases_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0016871 -0.0005254 -0.0001648 0.0002365 0.0029581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.869e-04 8.484e-04 0.456 0.650
## percent_leaving_home 1.856e-05 1.688e-05 1.099 0.277
##
## Residual standard error: 0.0008859 on 53 degrees of freedom
## Multiple R-squared: 0.0223, Adjusted R-squared: 0.003853
## F-statistic: 1.209 on 1 and 53 DF, p-value: 0.2765
fig_scc_sd_cases_curr_dif <- plot_ly(scc_sd_cases_curr) %>%
add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~dif_percent_leaving, y = fitted(lm(scc_sd_cases_curr$cases_by_pop~ scc_sd_cases_curr$dif_percent_leaving)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and difference in % leaving home through 1 week before case data")
fig_scc_sd_cases_curr_dif
model_scc_curr_dif <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_cases_curr)
summary(model_scc_curr_dif)
##
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_cases_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.884e-03 -4.369e-04 -9.157e-05 2.132e-04 3.002e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.423e-03 4.050e-04 5.983 1.93e-07 ***
## dif_percent_leaving 4.164e-05 1.456e-05 2.859 0.00606 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008339 on 53 degrees of freedom
## Multiple R-squared: 0.1337, Adjusted R-squared: 0.1173
## F-statistic: 8.176 on 1 and 53 DF, p-value: 0.006057
Look at the outliers in this data. The zip codes with no cases are, with their populations:
# find zip codes with 0 cases
zero_case_scc <- scc_sd_cases_curr %>% filter(cases == 0)
print(zero_case_scc$zipcode)
## [1] "95053"
These zip codes have populations:
print(zero_case_scc$pop)
## [1] 3678
Some of these are very small but some are large. Let’s map these with their populations.
for_mapping_zero_case_scc <- left_join(zctas_94_95, zero_case_scc %>% dplyr::select(zipcode, pop), by = c("ZCTA5CE10" = "zipcode")) %>% filter(!is.na(pop))
mapview(for_mapping_zero_case_scc %>% dplyr::select(pop))
Try excluding these.
scc_sd_cases_curr_filtered <- scc_sd_cases_curr %>%
filter(cases != 0) %>%
mutate(log_cases = log(cases), log_cases_by_pop = log(cases_by_pop))
fig_scc_sd_cases_curr_filt <- plot_ly(scc_sd_cases_curr_filtered) %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(scc_sd_cases_curr_filtered$cases_by_pop~ scc_sd_cases_curr_filtered$percent_leaving_home)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and % leaving home through 1 week before case data")
fig_scc_sd_cases_curr_filt
model_scc_curr_filt <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_cases_curr_filtered)
summary(model_scc_curr_filt)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_cases_curr_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0016498 -0.0005012 -0.0001875 0.0001966 0.0028503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.193e-04 8.834e-04 -0.361 0.7193
## percent_leaving_home 3.349e-05 1.773e-05 1.889 0.0645 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008568 on 52 degrees of freedom
## Multiple R-squared: 0.06421, Adjusted R-squared: 0.04621
## F-statistic: 3.568 on 1 and 52 DF, p-value: 0.06449
fig_scc_sd_cases_curr_dif_filt <- plot_ly(scc_sd_cases_curr_filtered) %>%
add_trace(x = ~dif_percent_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~dif_percent_leaving, y = fitted(lm(scc_sd_cases_curr_filtered$cases_by_pop~ scc_sd_cases_curr_filtered$dif_percent_leaving)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Difference in % of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SCC, cases per capita and difference in % leaving home through 1 week before case data")
fig_scc_sd_cases_curr_dif_filt
model_scc_curr_filt_dif <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_cases_curr_filtered)
summary(model_scc_curr_filt_dif)
##
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_cases_curr_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0016915 -0.0004076 -0.0001050 0.0002747 0.0029563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.689e-03 4.018e-04 6.692 1.53e-08 ***
## dif_percent_leaving 5.020e-05 1.434e-05 3.500 0.000963 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0007968 on 52 degrees of freedom
## Multiple R-squared: 0.1907, Adjusted R-squared: 0.1751
## F-statistic: 12.25 on 1 and 52 DF, p-value: 0.0009631
Looking at the growth of cases from some baseline level over time. I will try using 0.0028 cases per capita as the baseline rate and seeing how much various zip codes exceed that in the week folloing when they reach that level, as dependent on their movement in the two weeks before the date they reached that level.
# set the cutoff level
cutoff_cases_by_pop <- 0.0028
# how many days after cutoff level to look
later_date_sep = 7
# how many days prior to look at movement averages
prior_movement_time = 14
# filter to just when cases are above the cutoff
sf_cases_cutoff <- sf_cases_zip_edit %>% filter(cases_by_pop >= cutoff_cases_by_pop)
zipcodes_in_cutoff <- unique(sf_cases_cutoff$zipcode)
later_cases_change <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
leaving_avg <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
for (i in 1:length(zipcodes_in_cutoff)) {
curr_zip <- zipcodes_in_cutoff[i]
curr_vals <- sf_cases_cutoff %>% filter(zipcode == curr_zip)
later_cases_curr = NA
leaving_avg_val = NA
if (curr_vals$date[1] != sf_cases_zip_edit$date[1]) { # need to make sure it didn't start above the cutoff
later_cases_curr = curr_vals$cases_by_pop[later_date_sep] - curr_vals$cases_by_pop[1] # change in cases an amount of time later
# get the percent of devices leaving home in an amount of time before the date that cases went above the threshold level
leaving_computed = sf_sd_zip_by_date %>%
filter(zipcode == curr_zip & date < curr_vals$date[1] & (date >= curr_vals$date[1] - prior_movement_time)) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home))
leaving_avg_val = leaving_computed$percent_leaving_home
}
later_cases_change[i] = later_cases_curr
leaving_avg[i] = leaving_avg_val
}
# combine
collected_cases_growth_baseline <- data.frame(zipcodes_in_cutoff, later_cases_change, leaving_avg) %>% filter(!is.na(later_cases_change))
# plot
fig_collected_cases_growth_baseline <- plot_ly(collected_cases_growth_baseline) %>%
add_trace(x = ~leaving_avg, y = ~later_cases_change, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~leaving_avg, y = fitted(lm(collected_cases_growth_baseline$later_cases_change~ collected_cases_growth_baseline$leaving_avg)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = '% of devices leaving home in 2 weeks before'), yaxis = list(title = 'Change in cases per capita 1 week later'), title = "SF, change in cases/person in week after >0.0028 cases/person vs % leaving in 2 weeks before")
fig_collected_cases_growth_baseline
model_collected <- lm(collected_cases_growth_baseline$later_cases_change~ collected_cases_growth_baseline$leaving_avg)
summary(model_collected)
##
## Call:
## lm(formula = collected_cases_growth_baseline$later_cases_change ~
## collected_cases_growth_baseline$leaving_avg)
##
## Residuals:
## 1 2 3 4 5 6 7
## -1.075e-04 8.919e-05 -3.617e-04 5.133e-04 5.867e-05 -1.235e-04 -6.844e-05
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2.396e-03 1.994e-03 -1.202
## collected_cases_growth_baseline$leaving_avg 5.882e-05 3.977e-05 1.479
## Pr(>|t|)
## (Intercept) 0.283
## collected_cases_growth_baseline$leaving_avg 0.199
##
## Residual standard error: 0.0002957 on 5 degrees of freedom
## Multiple R-squared: 0.3043, Adjusted R-squared: 0.1652
## F-statistic: 2.187 on 1 and 5 DF, p-value: 0.1992
Compare this to just the metric we were looking at before (number of cases and movement through 1 week before the most recent date on the case data) for just these zip codes.
# plot just these zip codes from the figure before
fig_sf_sd_cases_curr_limit <- plot_ly(sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff)) %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(cases_by_pop~ percent_leaving_home, sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff))), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Cases per capita'), title = "SFC, cases/person and % leaving through 1 week before case data, just zip codes in previous analysis")
fig_sf_sd_cases_curr_limit
model_cases_limit <- lm(cases_by_pop~ percent_leaving_home, sf_sd_cases_curr %>% filter(zipcode %in% zipcodes_in_cutoff))
summary(model_cases_limit)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr %>%
## filter(zipcode %in% zipcodes_in_cutoff))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0009547 -0.0006777 -0.0001398 0.0006776 0.0012809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0035723 0.0080033 -0.446 0.671
## percent_leaving_home 0.0001611 0.0001592 1.012 0.351
##
## Residual standard error: 0.0009088 on 6 degrees of freedom
## Multiple R-squared: 0.1458, Adjusted R-squared: 0.00338
## F-statistic: 1.024 on 1 and 6 DF, p-value: 0.3507
These two plots are not that similar, which is interesting but makes sense–more evidence that it’s likely important to take into account the original number of cases in a zip code.
For San Francisco, since that’s the data on cases over time that we have. Below I look at the change in cases from April 21st through present. Movement is averaged from 2 weeks before April 21st through 1 week before present.
# get original levels of cases
original_cases <- sf_cases_zip %>% filter(date == (min(date)+1)) %>%
mutate(log_orig_cases = log(confirmed_cases), log_orig_cases_by_pop = log(cases_by_pop)) %>%
dplyr::select(date, zipcode, confirmed_cases, cases_by_pop, log_orig_cases, log_orig_cases_by_pop) %>%
rename(orig_date = date, orig_cases = confirmed_cases, orig_cases_by_pop = cases_by_pop)
# join with current levels of cases and movement averaged from 2 weeks before start of case data through 1 week before most recent case data
sf_sd_cases_curr_orig <- left_join(original_cases, sf_sd_cases_curr %>% dplyr::select(date, confirmed_cases, cases_by_pop, zipcode)) %>%
mutate(log_cases = log(confirmed_cases),
log_cases_by_pop = log(cases_by_pop),
change_cases = confirmed_cases - orig_cases,
change_cases_by_pop = cases_by_pop - orig_cases_by_pop,
change_log_cases = log_cases - log_orig_cases,
change_log_cases_by_pop = log_cases_by_pop - log_orig_cases_by_pop,
perc_change_cases = change_cases*100 / orig_cases,
perc_change_cases_by_pop = change_cases_by_pop*100/orig_cases_by_pop,
perc_change_log_cases = change_log_cases*100/log_orig_cases,
perc_change_log_cases_by_pop = change_log_cases_by_pop*100/log_orig_cases_by_pop) %>%
left_join(sf_sd_zip_by_date %>%
filter((date >= original_cases$orig_date[1] - 14) & (date <= sf_case_date_max - 7)) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home))) %>%
filter(!is.na(confirmed_cases))
# plot
fig_sf_sd_cases_change <- plot_ly(sf_sd_cases_curr_orig) %>%
add_trace(x = ~percent_leaving_home, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(change_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases/person and % leaving home April 7th through 1 week before case data")
fig_sf_sd_cases_change
model_cases_change <- lm(change_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig)
summary(model_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ percent_leaving_home, data = sf_sd_cases_curr_orig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010124 -0.0006528 -0.0004590 0.0006735 0.0019122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.843e-03 3.216e-03 0.884 0.387
## percent_leaving_home -3.645e-05 6.437e-05 -0.566 0.577
##
## Residual standard error: 0.0009496 on 21 degrees of freedom
## Multiple R-squared: 0.01505, Adjusted R-squared: -0.03186
## F-statistic: 0.3208 on 1 and 21 DF, p-value: 0.5771
# plot change in log of cases
fig_sf_sd_cases_change_log <- plot_ly(sf_sd_cases_curr_orig) %>%
add_trace(x = ~percent_leaving_home, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(change_log_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Change in log of cases per capita from April 21st to present'), title = "SFC, change in cases/person and % leaving home April 7th through 1 week before case data")
fig_sf_sd_cases_change_log
model_cases_change_log <- lm(change_log_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig)
summary(model_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ percent_leaving_home,
## data = sf_sd_cases_curr_orig)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56519 -0.19932 -0.07670 0.06448 1.90013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.85980 1.59946 1.163 0.258
## percent_leaving_home -0.02573 0.03201 -0.804 0.431
##
## Residual standard error: 0.4723 on 21 degrees of freedom
## Multiple R-squared: 0.02984, Adjusted R-squared: -0.01636
## F-statistic: 0.6459 on 1 and 21 DF, p-value: 0.4306
# plot percent change in cases, filtering outliers of very very high percent changes
fig_sf_sd_cases_change_perc <- plot_ly(sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000)) %>%
add_trace(x = ~percent_leaving_home, y = ~perc_change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(perc_change_cases_by_pop~ percent_leaving_home, sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000))), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home'), yaxis = list(title = 'Percent change in cases per capita from April 21st to present'), title = "SFC, % change in cases/person and % leaving home April 7th through 1 week before most case data")
fig_sf_sd_cases_change_perc
model_cases_change_perc <- lm(perc_change_cases_by_pop ~ percent_leaving_home, sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop < 1000))
summary(model_cases_change_perc)
##
## Call:
## lm(formula = perc_change_cases_by_pop ~ percent_leaving_home,
## data = sf_sd_cases_curr_orig %>% filter(perc_change_cases_by_pop <
## 1000))
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.642 -21.207 -4.874 17.523 74.659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 239.530 120.123 1.994 0.060 .
## percent_leaving_home -3.458 2.403 -1.439 0.166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.45 on 20 degrees of freedom
## Multiple R-squared: 0.09382, Adjusted R-squared: 0.04851
## F-statistic: 2.071 on 1 and 20 DF, p-value: 0.1656
This still is weird.
Try just looking at a few correlations between demographic variables, movement, and change in cases per capita since April 21st. The movement and cases metrics are the same as in the previous section. Demographic information from ACS census data.
First occupants per room.
sf_occupants_zip <- pullCensus("group(B25014)", sf_fips) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
dplyr::select(-variable) %>%
separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>%
filter(!is.na(`occupants per room`)) %>%
group_by(blockgroup, `occupants per room`) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = `occupants per room`, value = estimate_tot) %>%
mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_nums = sum(total_nums), total_0.5less = sum(`0.50 or less occupants per room`), total_0.5to1 = sum(`0.51 to 1.00 occupants per room`), total_1to1.5 = sum(`1.01 to 1.50 occupants per room`), total_1.5to2 = sum(`1.51 to 2.00 occupants per room`), total_2more = sum(`2.01 or more occupants per room`)) %>%
mutate(`percent more than 1` = (total_1to1.5 + total_1.5to2 + total_2more) * 100/ total_nums, `percent less than 1` = (100-`percent more than 1`), `percent 0.5 or less` = total_0.5less*100/total_nums, `percent more than 0.5` = (100-`percent 0.5 or less`), `percent more than 2` = total_2more*100/total_nums)
sf_occupants_sd_cases_change <- left_join(sf_occupants_zip, sf_sd_cases_curr_orig) %>% filter(!is.na(confirmed_cases))
# plot
fig_sf_cases_change_occupants <- plot_ly(sf_occupants_sd_cases_change) %>%
add_trace(x = ~`percent more than 1`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent more than 1`, y = fitted(lm(change_cases_by_pop~ `percent more than 1`, sf_occupants_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of households with more than 1 occupants per room'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and occupants per room")
fig_sf_cases_change_occupants
model_cases_change_occupants <- lm(change_cases_by_pop ~ `percent more than 1`, sf_occupants_sd_cases_change)
summary(model_cases_change_occupants)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_occupants_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0013355 -0.0004266 -0.0002839 0.0005227 0.0015590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.944e-04 3.193e-04 0.922 0.367
## `percent more than 1` 1.161e-04 4.276e-05 2.714 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008233 on 21 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2245
## F-statistic: 7.367 on 1 and 21 DF, p-value: 0.01299
# try with more than 2 people
fig_sf_cases_change_occupants_2 <- plot_ly(sf_occupants_sd_cases_change) %>%
add_trace(x = ~`percent more than 2`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent more than 2`, y = fitted(lm(change_cases_by_pop~ `percent more than 2`, sf_occupants_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of households with more than 2 occupants per room'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and occupants per room")
fig_sf_cases_change_occupants_2
model_cases_change_occupants_2 <- lm(change_cases_by_pop ~ `percent more than 2`, sf_occupants_sd_cases_change)
summary(model_cases_change_occupants)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_occupants_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0013355 -0.0004266 -0.0002839 0.0005227 0.0015590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.944e-04 3.193e-04 0.922 0.367
## `percent more than 1` 1.161e-04 4.276e-05 2.714 0.013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008233 on 21 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2245
## F-statistic: 7.367 on 1 and 21 DF, p-value: 0.01299
# combine with social distancing
model_cases_change_occupants_sd <- lm(change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, sf_occupants_sd_cases_change)
summary(model_cases_change_occupants_sd)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1` + percent_leaving_home,
## data = sf_occupants_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0013552 -0.0004288 -0.0002279 0.0005456 0.0014971
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.502e-04 3.138e-03 -0.175 0.8626
## `percent more than 1` 1.203e-04 4.645e-05 2.590 0.0175 *
## percent_leaving_home 1.640e-05 6.061e-05 0.271 0.7895
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000842 on 20 degrees of freedom
## Multiple R-squared: 0.2624, Adjusted R-squared: 0.1886
## F-statistic: 3.558 on 2 and 20 DF, p-value: 0.04766
# try with absolute numbers of cases
model_cases_change_occupants_abs <- lm(change_cases ~ `percent more than 2`, sf_occupants_sd_cases_change)
summary(model_cases_change_occupants_abs)
##
## Call:
## lm(formula = change_cases ~ `percent more than 2`, data = sf_occupants_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.83 -28.29 -18.42 13.64 165.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.418 13.708 2.657 0.0148 *
## `percent more than 2` 7.791 10.001 0.779 0.4447
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.98 on 21 degrees of freedom
## Multiple R-squared: 0.02809, Adjusted R-squared: -0.01819
## F-statistic: 0.6069 on 1 and 21 DF, p-value: 0.4447
That doesn’t seem significant.
Next try population density, using the area of the zip code regions
sf_pop_density_zip <- sf_cases_zip %>% filter(date == max(date)) %>%
dplyr::select(zipcode, total_pop_zip) %>%
left_join(zctas_94, by = c("zipcode" = "ZCTA5CE10")) %>%
st_as_sf() %>%
mutate(zip_area = st_area(.), pop_density = total_pop_zip / zip_area) %>%
filter(!is.na(pop_density))
mapview(sf_pop_density_zip %>% dplyr::select(pop_density))
sf_pop_density_zip_sd_cases <- left_join(sf_sd_cases_curr_orig, sf_pop_density_zip)
# plot
fig_sf_cases_change_pop_density <- plot_ly(sf_pop_density_zip_sd_cases) %>%
add_trace(x = ~pop_density, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~pop_density, y = fitted(lm(change_cases_by_pop~ pop_density, sf_pop_density_zip_sd_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Population density'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and population density")
fig_sf_cases_change_pop_density
model_cases_change_pop_density <- lm(change_cases_by_pop ~ pop_density, sf_pop_density_zip_sd_cases)
summary(model_cases_change_pop_density)
##
## Call:
## lm(formula = change_cases_by_pop ~ pop_density, data = sf_pop_density_zip_sd_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0009348 -0.0006967 -0.0005629 0.0007396 0.0017506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0009020 0.0004133 2.182 0.0406 *
## pop_density 0.0135048 0.0397646 0.340 0.7375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0009542 on 21 degrees of freedom
## Multiple R-squared: 0.005462, Adjusted R-squared: -0.0419
## F-statistic: 0.1153 on 1 and 21 DF, p-value: 0.7375
model_cases_change_pop_density_sd <- lm(change_cases_by_pop ~ pop_density + percent_leaving_home, sf_pop_density_zip_sd_cases)
summary(model_cases_change_pop_density_sd)
##
## Call:
## lm(formula = change_cases_by_pop ~ pop_density + percent_leaving_home,
## data = sf_pop_density_zip_sd_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010235 -0.0006783 -0.0004324 0.0006923 0.0018714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.924e-03 3.285e-03 0.890 0.384
## pop_density 1.779e-02 4.095e-02 0.434 0.669
## percent_leaving_home -4.133e-05 6.660e-05 -0.621 0.542
##
## Residual standard error: 0.0009685 on 20 degrees of freedom
## Multiple R-squared: 0.02425, Adjusted R-squared: -0.07332
## F-statistic: 0.2485 on 2 and 20 DF, p-value: 0.7823
# absolute cases
model_cases_change_pop_density_abs <- lm(change_cases ~ pop_density, sf_pop_density_zip_sd_cases)
summary(model_cases_change_pop_density_abs)
##
## Call:
## lm(formula = change_cases ~ pop_density, data = sf_pop_density_zip_sd_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.64 -31.47 -23.70 14.30 162.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.19 23.10 1.35 0.191
## pop_density 1267.69 2222.52 0.57 0.574
##
## Residual standard error: 53.33 on 21 degrees of freedom
## Multiple R-squared: 0.01526, Adjusted R-squared: -0.03164
## F-statistic: 0.3253 on 1 and 21 DF, p-value: 0.5745
Not significant.
Try number of people per household.
sf_house_size_zip <- pullCensus("group(B11016)", sf_fips) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
dplyr::select(-variable) %>%
separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>%
filter(!is.na(type)) %>%
filter(!is.na(size)) %>%
dplyr::select(-type) %>%
group_by(blockgroup, size) %>%
summarize(`total of this size` = sum(estimate)) %>%
spread(key = size, value = `total of this size`) %>%
mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_nums = sum(total_nums), total_1 = sum(`1-person household`), total_2 = sum(`2-person household`), total_3 = sum(`3-person household`), total_4 = sum(`4-person household`), total_5 = sum(`5-person household`), total_6 = sum(`6-person household`), total_7more = sum(`7-or-more person household`)) %>%
mutate(`percent 5 or more` = (total_5 + total_6 + total_7more)* 100/ total_nums, `percent 1 or 2 only` = (total_1 + total_2)*100/total_nums)
sf_size_sd_cases_change <- left_join(sf_sd_cases_curr_orig, sf_house_size_zip)
# plot
fig_sf_cases_change_size <- plot_ly(sf_size_sd_cases_change) %>%
add_trace(x = ~`percent 5 or more`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent 5 or more`, y = fitted(lm(change_cases_by_pop~ `percent 5 or more`, sf_size_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of households with 5 or more members'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and household size")
fig_sf_cases_change_size
model_cases_change_size <- lm(change_cases_by_pop ~ `percent 5 or more`, sf_size_sd_cases_change)
summary(model_cases_change_size)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more`, data = sf_size_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0009727 -0.0007008 -0.0003047 0.0004362 0.0018114
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.336e-04 2.852e-04 2.572 0.0178 *
## `percent 5 or more` 4.583e-05 3.329e-05 1.376 0.1832
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0009164 on 21 degrees of freedom
## Multiple R-squared: 0.08275, Adjusted R-squared: 0.03908
## F-statistic: 1.895 on 1 and 21 DF, p-value: 0.1832
model_cases_change_size_sd <- lm(change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home, sf_size_sd_cases_change)
summary(model_cases_change_size_sd)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home,
## data = sf_size_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010071 -0.0007005 -0.0003095 0.0004346 0.0018160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.339e-03 3.405e-03 0.393 0.698
## `percent 5 or more` 4.392e-05 3.573e-05 1.229 0.233
## percent_leaving_home -1.190e-05 6.666e-05 -0.179 0.860
##
## Residual standard error: 0.0009383 on 20 degrees of freedom
## Multiple R-squared: 0.08421, Adjusted R-squared: -0.007365
## F-statistic: 0.9196 on 2 and 20 DF, p-value: 0.4149
# testing with number of cases
fig_sf_cases_change_size <- plot_ly(sf_size_sd_cases_change) %>%
add_trace(x = ~`percent 5 or more`, y = ~change_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent 5 or more`, y = fitted(lm(change_cases~ `percent 5 or more`, sf_size_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of households with 5 or more members'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and household size")
fig_sf_cases_change_size
model_cases_change_size_abs <- lm(change_cases ~ `percent 5 or more`, sf_size_sd_cases_change)
summary(model_cases_change_size_abs)
##
## Call:
## lm(formula = change_cases ~ `percent 5 or more`, data = sf_size_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.21 -25.08 -13.87 18.41 156.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.342 14.661 1.046 0.307
## `percent 5 or more` 4.308 1.711 2.517 0.020 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.1 on 21 degrees of freedom
## Multiple R-squared: 0.2318, Adjusted R-squared: 0.1952
## F-statistic: 6.337 on 1 and 21 DF, p-value: 0.02002
Also not very significant.
Try units in structure.
sf_units_zip <- pullCensus("group(B25024)", sf_fips) %>% dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
dplyr::select(-variable) %>%
separate(label, into = c(NA, NA, "units"), sep = "!!") %>%
filter(!is.na(units)) %>%
spread(key = units, value = estimate) %>%
mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, total_20more = `20 to 49`+`50 or more`, total_10more = `10 to 19` + `20 to 49`+`50 or more`, total_1 = `1, attached` + `1, detached`) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_nums = sum(total_nums), total_20more = sum(total_20more), total_10more = sum(total_20more), total_1_only = sum(total_1)) %>%
mutate(`percent 10 or more` = total_10more*100/total_nums, `percent 20 or more` = total_20more*100/total_nums, `percent 1 only` = total_1_only*100/total_nums, `percent more than 1` = (100 - `percent 1 only`))
sf_units_sd_cases_change <- left_join(sf_sd_cases_curr_orig, sf_units_zip)
# plot with more than 20 first
fig_sf_cases_change_units <- plot_ly(sf_units_sd_cases_change) %>%
add_trace(x = ~`percent 20 or more`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent 20 or more`, y = fitted(lm(change_cases_by_pop~ `percent 20 or more`, sf_units_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of structures with 20 or more units'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and units per structure")
fig_sf_cases_change_units
model_cases_change_units <- lm(change_cases_by_pop ~ `percent 20 or more`, sf_units_sd_cases_change)
summary(model_cases_change_units)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent 20 or more`, data = sf_units_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0012250 -0.0006349 -0.0004288 0.0007245 0.0018698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.888e-04 2.796e-04 3.179 0.00452 **
## `percent 20 or more` 4.574e-06 6.651e-06 0.688 0.49920
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0009462 on 21 degrees of freedom
## Multiple R-squared: 0.02202, Adjusted R-squared: -0.02455
## F-statistic: 0.4729 on 1 and 21 DF, p-value: 0.4992
model_cases_change_units_sd <- lm(change_cases_by_pop ~ `percent 20 or more` + percent_leaving_home, sf_units_sd_cases_change)
summary(model_cases_change_size_sd)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent 5 or more` + percent_leaving_home,
## data = sf_size_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010071 -0.0007005 -0.0003095 0.0004346 0.0018160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.339e-03 3.405e-03 0.393 0.698
## `percent 5 or more` 4.392e-05 3.573e-05 1.229 0.233
## percent_leaving_home -1.190e-05 6.666e-05 -0.179 0.860
##
## Residual standard error: 0.0009383 on 20 degrees of freedom
## Multiple R-squared: 0.08421, Adjusted R-squared: -0.007365
## F-statistic: 0.9196 on 2 and 20 DF, p-value: 0.4149
# try with more than 1 unit per structure
fig_sf_cases_change_units_1 <- plot_ly(sf_units_sd_cases_change) %>%
add_trace(x = ~`percent more than 1`, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~`percent more than 1`, y = fitted(lm(change_cases_by_pop~ `percent more than 1`, sf_units_sd_cases_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of structures with more than 1 unit'), yaxis = list(title = 'Change in cases per capita from April 21st to present'), title = "SFC, change in cases per capita and units per structure")
fig_sf_cases_change_units_1
model_cases_change_units_1 <- lm(change_cases_by_pop ~ `percent more than 1`, sf_units_sd_cases_change)
summary(model_cases_change_units_1)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1`, data = sf_units_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0009561 -0.0007078 -0.0004629 0.0007276 0.0017919
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.785e-04 4.971e-04 1.968 0.0624 .
## `percent more than 1` 6.932e-07 6.784e-06 0.102 0.9196
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0009566 on 21 degrees of freedom
## Multiple R-squared: 0.0004969, Adjusted R-squared: -0.0471
## F-statistic: 0.01044 on 1 and 21 DF, p-value: 0.9196
model_cases_change_units_sd_1 <- lm(change_cases_by_pop ~ `percent more than 1` + percent_leaving_home, sf_units_sd_cases_change)
summary(model_cases_change_units_sd_1)
##
## Call:
## lm(formula = change_cases_by_pop ~ `percent more than 1` + percent_leaving_home,
## data = sf_units_sd_cases_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010645 -0.0006709 -0.0004462 0.0006910 0.0019181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.886e-03 3.298e-03 0.875 0.392
## `percent more than 1` 1.518e-06 7.035e-06 0.216 0.831
## percent_leaving_home -3.936e-05 6.724e-05 -0.585 0.565
##
## Residual standard error: 0.0009719 on 20 degrees of freedom
## Multiple R-squared: 0.01733, Adjusted R-squared: -0.08093
## F-statistic: 0.1764 on 2 and 20 DF, p-value: 0.8396
That was unfortunately not very fruitful so far.
Starting to look at visits to locations, beginning with SFC.
poi_ca <- readRDS(paste0(sg_path,'ca_poi.rds'))
Start with the week before shelter-in-place started.
# week <- "2020-03-09"
# filter_loc <- "San Francisco"
# filter_loc_abr <- "sfc"
#
# patterns <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
#
# hps <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
#
# sfc_patterns <-
# patterns %>%
# left_join(
# poi_ca %>%
# dplyr::select(
# safegraph_place_id,
# longitude,
# latitude,
# naics_code
# ),
# by = "safegraph_place_id"
# ) %>%
# filter(!is.na(longitude)) %>%
# mutate(
# long = longitude,
# lat = latitude
# ) %>%
# st_as_sf(coords = c("long","lat")) %>%
# st_set_crs(4326) %>%
# st_join(
# counties("CA", cb=F, progress_bar=F) %>%
# filter(NAME == filter_loc) %>%
# st_transform(4326) %>%
# dplyr::select(NAME),
# left=F
# )
#
# sfc_patterns_origins_daily <-
# process_patterns_origins_daily(sfc_patterns,hps)
#
# sfc_patterns_origins_daily <-
# sfc_patterns_origins_daily %>%
# left_join(
# sfc_patterns %>%
# as.data.frame() %>%
# dplyr::select(
# safegraph_place_id,
# location_name,
# street_address,
# naics_code
# )
# )
#
# saveRDS(sfc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))
sfc_patterns_origins_daily <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-09_origins_daily.rds"))
Try looking at the absolute number of visits in each zip code in the week before SIP and the absolute number of cases to date in each county in SF.
sf_visits_week_pre <- sfc_patterns_origins_daily %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip))
# combine with case data
sf_visits_week_pre_cases <- left_join(sf_visits_week_pre, sf_cases_zip %>% filter(date == max(date))) %>% filter(!is.na(confirmed_cases))
# plot
fig_sf_cases_visits_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week before SIP")
fig_sf_cases_visits_week_pre
model_cases_visits_week_pre <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_week_pre_cases)
summary(model_cases_visits_week_pre)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.53 -31.25 -13.72 45.15 170.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.472e+01 3.053e+01 -0.482 0.6348
## visit_counts_high_zip 5.558e-04 1.378e-04 4.033 0.0006 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71.79 on 21 degrees of freedom
## Multiple R-squared: 0.4365, Adjusted R-squared: 0.4096
## F-statistic: 16.27 on 1 and 21 DF, p-value: 0.0006004
# try with low just for comparison
fig_sf_cases_visits_week_pre_low <- plot_ly(sf_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_low_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_low_zip, y = fitted(lm(confirmed_cases~ visit_counts_low_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (low)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week before SIP")
fig_sf_cases_visits_week_pre_low
model_cases_visits_week_pre_low <- lm(confirmed_cases ~ visit_counts_low_zip, sf_visits_week_pre_cases)
summary(model_cases_visits_week_pre_low)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_low_zip, data = sf_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.43 -31.74 -11.01 45.60 169.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.732e+01 3.099e+01 -0.559 0.582116
## visit_counts_low_zip 9.250e-04 2.285e-04 4.049 0.000578 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71.66 on 21 degrees of freedom
## Multiple R-squared: 0.4384, Adjusted R-squared: 0.4117
## F-statistic: 16.39 on 1 and 21 DF, p-value: 0.0005778
That is very different from the results when we looked at percent leaving home! Interesting. Results are similar with high and low, so I’m just going to keep using the high counts for now.
But could this just be a population thing? I.e. higher population correlates with higher cases and more visits?
fig_sf_cases_pop_visits_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases_by_pop~ visit_counts_high_zip, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP")
fig_sf_cases_pop_visits_week_pre
model_cases_pop_visits_week_pre <- lm(cases_by_pop ~ visit_counts_high_zip, sf_visits_week_pre_cases)
summary(model_cases_pop_visits_week_pre)
##
## Call:
## lm(formula = cases_by_pop ~ visit_counts_high_zip, data = sf_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0017862 -0.0012636 -0.0006584 0.0012414 0.0032200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.857e-03 7.375e-04 2.518 0.020 *
## visit_counts_high_zip 2.608e-09 3.329e-09 0.784 0.442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001734 on 21 degrees of freedom
## Multiple R-squared: 0.0284, Adjusted R-squared: -0.01786
## F-statistic: 0.6139 on 1 and 21 DF, p-value: 0.4421
The relationship is definitely not as strong if you look at cases per capita. What if you pair with visits per capita?
sf_visits_week_pre_cases <- sf_visits_week_pre_cases %>%
mutate(visits_per_capita_high = visit_counts_high_zip / total_pop_zip, visits_per_capita_low = visit_counts_low_zip / total_pop_zip)
# plot
fig_sf_cases_pop_visits_pop_week_pre <- plot_ly(sf_visits_week_pre_cases) %>%
add_trace(x = ~visits_per_capita_high, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita_high, y = fitted(lm(cases_by_pop~ visits_per_capita_high, sf_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP")
fig_sf_cases_pop_visits_pop_week_pre
model_cases_pop_visits_pop_week_pre <- lm(cases_by_pop ~ visits_per_capita_high, sf_visits_week_pre_cases)
summary(model_cases_pop_visits_pop_week_pre)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita_high, data = sf_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0017718 -0.0013751 -0.0007039 0.0011794 0.0032135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0013798 0.0020578 0.670 0.510
## visits_per_capita_high 0.0001942 0.0004010 0.484 0.633
##
## Residual standard error: 0.001749 on 21 degrees of freedom
## Multiple R-squared: 0.01104, Adjusted R-squared: -0.03605
## F-statistic: 0.2345 on 1 and 21 DF, p-value: 0.6332
Again not significant results, but at least the trend is in the direction we would expect.
Also do multiple regression with absolute cases, absolute visits, and population
cases_pop_visits_sf <- lm(confirmed_cases ~ visit_counts_high_zip + total_pop_zip, sf_visits_week_pre_cases)
summary(cases_pop_visits_sf)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip + total_pop_zip,
## data = sf_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.87 -48.78 -12.61 40.92 160.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.364e+01 3.510e+01 -0.959 0.349
## visit_counts_high_zip 4.132e-05 4.957e-04 0.083 0.934
## total_pop_zip 3.190e-03 2.953e-03 1.080 0.293
##
## Residual standard error: 71.5 on 20 degrees of freedom
## Multiple R-squared: 0.4675, Adjusted R-squared: 0.4143
## F-statistic: 8.781 on 2 and 20 DF, p-value: 0.001832
Hmm…
Let’s do exactly the same thing but with SCC just for comparison.
# week <- "2020-03-09"
# filter_loc <- "Santa Clara"
# filter_loc_abr <- "scc"
#
# patterns <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
#
# hps <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
#
# scc_patterns <-
# patterns %>%
# left_join(
# poi_ca %>%
# dplyr::select(
# safegraph_place_id,
# longitude,
# latitude,
# naics_code
# ),
# by = "safegraph_place_id"
# ) %>%
# filter(!is.na(longitude)) %>%
# mutate(
# long = longitude,
# lat = latitude
# ) %>%
# st_as_sf(coords = c("long","lat")) %>%
# st_set_crs(4326) %>%
# st_join(
# counties("CA", cb=F, progress_bar=F) %>%
# filter(NAME == filter_loc) %>%
# st_transform(4326) %>%
# dplyr::select(NAME),
# left=F
# )
#
# scc_patterns_origins_daily <-
# process_patterns_origins_daily(scc_patterns,hps)
#
# scc_patterns_origins_daily <-
# scc_patterns_origins_daily %>%
# left_join(
# scc_patterns %>%
# as.data.frame() %>%
# dplyr::select(
# safegraph_place_id,
# location_name,
# street_address,
# naics_code
# )
# )
#
# saveRDS(scc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))
scc_patterns_origins_daily <- readRDS(paste0(sg_path,"weekly-patterns/v2/scc_patterns_2020-03-09_origins_daily.rds"))
Try looking at the absolute number of visits in each zip code in the week before SIP and the absolute number of cases to date in each county in SCC.
scc_visits_week_pre <- scc_patterns_origins_daily %>%
left_join(scc_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip))
# combine with case data
scc_visits_week_pre_cases <- left_join(scc_visits_week_pre, scc_cases_zip) %>%
filter(!is.na(cases))
# plot
fig_scc_cases_visits_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases~ visit_counts_high_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases'), title = "SCC, cases and visits in week before SIP")
fig_scc_cases_visits_week_pre
scc_model_cases_visits_week_pre <- lm(cases ~ visit_counts_high_zip, scc_visits_week_pre_cases)
summary(scc_model_cases_visits_week_pre)
##
## Call:
## lm(formula = cases ~ visit_counts_high_zip, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.097 -21.226 -4.106 9.912 155.987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.5710741 9.6466801 -0.888 0.378
## visit_counts_high_zip 0.0001807 0.0000276 6.545 2.43e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36.06 on 53 degrees of freedom
## Multiple R-squared: 0.447, Adjusted R-squared: 0.4366
## F-statistic: 42.84 on 1 and 53 DF, p-value: 2.429e-08
# try with low just for comparison
fig_scc_cases_visits_week_pre_low <- plot_ly(scc_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_low_zip, y = ~cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_low_zip, y = fitted(lm(cases~ visit_counts_low_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (low)'), yaxis = list(title = 'Current cases'), title = "SCC, cases and visits in week before SIP")
fig_scc_cases_visits_week_pre_low
scc_model_cases_visits_week_pre_low <- lm(cases ~ visit_counts_low_zip, scc_visits_week_pre_cases)
summary(scc_model_cases_visits_week_pre_low)
##
## Call:
## lm(formula = cases ~ visit_counts_low_zip, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.728 -20.929 -5.711 8.191 160.517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.831e+00 9.902e+00 -0.589 0.558
## visit_counts_low_zip 2.578e-04 4.248e-05 6.069 1.4e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.24 on 53 degrees of freedom
## Multiple R-squared: 0.41, Adjusted R-squared: 0.3989
## F-statistic: 36.84 on 1 and 53 DF, p-value: 1.404e-07
Cases per capita:
fig_scc_cases_pop_visits_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(cases_by_pop~ visit_counts_high_zip, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SCC, cases and visits in week before SIP")
fig_scc_cases_pop_visits_week_pre
scc_model_cases_pop_visits_week_pre <- lm(cases_by_pop ~ visit_counts_high_zip, scc_visits_week_pre_cases)
summary(scc_model_cases_pop_visits_week_pre)
##
## Call:
## lm(formula = cases_by_pop ~ visit_counts_high_zip, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0012683 -0.0005813 -0.0002036 0.0002581 0.0030920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.262e-03 2.396e-04 5.267 2.58e-06 ***
## visit_counts_high_zip 1.613e-10 6.854e-10 0.235 0.815
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008954 on 53 degrees of freedom
## Multiple R-squared: 0.001044, Adjusted R-squared: -0.0178
## F-statistic: 0.0554 on 1 and 53 DF, p-value: 0.8148
Visits per capita
scc_visits_week_pre_cases <- scc_visits_week_pre_cases %>%
mutate(visits_per_capita_high = visit_counts_high_zip / pop, visits_per_capita_low = visit_counts_low_zip / pop)
# plot
fig_scc_cases_pop_visits_pop_week_pre <- plot_ly(scc_visits_week_pre_cases) %>%
add_trace(x = ~visits_per_capita_high, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita_high, y = fitted(lm(cases_by_pop~ visits_per_capita_high, scc_visits_week_pre_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = 'Current cases per capita'), title = "SCC, cases and visits in week before SIP")
fig_scc_cases_pop_visits_pop_week_pre
scc_model_cases_pop_visits_pop_week_pre <- lm(cases_by_pop ~ visits_per_capita_high, scc_visits_week_pre_cases)
summary(scc_model_cases_pop_visits_pop_week_pre)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita_high, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0011584 -0.0005801 -0.0001982 0.0002486 0.0027879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.919e-03 6.428e-04 2.985 0.00429 **
## visits_per_capita_high -6.928e-05 7.197e-05 -0.963 0.34005
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008882 on 53 degrees of freedom
## Multiple R-squared: 0.01719, Adjusted R-squared: -0.001356
## F-statistic: 0.9269 on 1 and 53 DF, p-value: 0.3401
That’s very weird…
Also do multiple regression with absolute cases, absolute visits, and population
cases_pop_visits_scc <- lm(cases ~ visit_counts_high_zip + pop, scc_visits_week_pre_cases)
summary(cases_pop_visits_scc)
##
## Call:
## lm(formula = cases ~ visit_counts_high_zip + pop, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.716 -17.865 -3.391 10.695 149.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.300e+01 9.770e+00 -1.331 0.1890
## visit_counts_high_zip -4.365e-06 1.068e-04 -0.041 0.9675
## pop 1.746e-03 9.748e-04 1.791 0.0791 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.33 on 52 degrees of freedom
## Multiple R-squared: 0.4791, Adjusted R-squared: 0.4591
## F-statistic: 23.92 on 2 and 52 DF, p-value: 4.313e-08
Looking at the differences in visits and % leaving home for the week before SIP.
# save previous results as another name since I edit that name below
sfc_patterns_origins_daily_week_pre <- sfc_patterns_origins_daily
# get % leaving in week before SIP
sf_sd_zip_week_pre <- sf_sd_zip_by_date %>%
filter(date < shelter_date & date >= (shelter_date - 7)) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
sf_sd_visits_week_pre <- left_join(sf_sd_zip_week_pre, sf_visits_week_pre_cases) %>% filter(!is.na(percent_leaving_home) & !is.na(visits_per_capita_high))
# plot
fig_sfc_visits_pop_leaving_week_pre <- plot_ly(sf_sd_visits_week_pre) %>%
add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, sf_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP")
fig_sfc_visits_pop_leaving_week_pre
That’s odd, but makes sense why the two metrics give us different results.
What about total devices leaving and total visits?
sf_sd_visits_week_pre <- sf_sd_visits_week_pre %>% mutate(total_leaving_home_zip = total_devices - total_at_home_zip)
fig_sfc_visits_leaving_week_pre <- plot_ly(sf_sd_visits_week_pre) %>%
add_trace(x = ~visit_counts_high_zip, y = ~total_leaving_home_zip, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(total_leaving_home_zip~ visit_counts_high_zip, sf_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Number of devices leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP")
fig_sfc_visits_leaving_week_pre
Well that’s a relationship we would expect. Interesting.
Try SCC.
# get % leaving in week before SIP
scc_sd_zip_week_pre <- scc_sd_zip_by_date %>%
filter(date < shelter_date & date >= (shelter_date - 7)) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
scc_sd_visits_week_pre <- left_join(scc_sd_zip_week_pre, scc_visits_week_pre_cases) %>% filter(!is.na(percent_leaving_home) & !is.na(visits_per_capita_high))
# plot
fig_scc_visits_pop_leaving_week_pre <- plot_ly(scc_sd_visits_week_pre) %>%
add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, scc_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")
fig_scc_visits_pop_leaving_week_pre
Remove outlier at bottom right that is barely leaving home and plot again
# plot without outliers
fig_scc_visits_pop_leaving_week_pre_edit <- plot_ly(scc_sd_visits_week_pre %>% filter(percent_leaving_home > 50)) %>%
add_trace(x = ~visits_per_capita_high, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita_high, y = fitted(lm(percent_leaving_home~ visits_per_capita_high, scc_sd_visits_week_pre %>% filter(percent_leaving_home > 50))), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP (high)'), yaxis = list(title = '% leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")
fig_scc_visits_pop_leaving_week_pre_edit
Still not totally what we would expect (a linear relationship).
Total devices leaving and total visits.
scc_sd_visits_week_pre <- scc_sd_visits_week_pre %>% mutate(total_leaving_home_zip = total_devices - total_at_home_zip)
fig_scc_visits_leaving_week_pre <- plot_ly(scc_sd_visits_week_pre) %>%
add_trace(x = ~visit_counts_high_zip, y = ~total_leaving_home_zip, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(total_leaving_home_zip~ visit_counts_high_zip, scc_sd_visits_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP (high)'), yaxis = list(title = 'Number of devices leaving home in week before SIP'), title = "SCC, leaving home and visits in week before SIP")
fig_scc_visits_leaving_week_pre
Also expected. Interesting that these metrics diverge.
Would it be more appropriate to normalize number of visits by number of devices rather than population, then?
## San Francisco
# for (i in 1:9) {
# week_init <- as.Date("2020-03-09")
# week <- week_init + 7*i
# filter_loc <- "San Francisco"
# filter_loc_abr <- "sfc"
#
# patterns <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
#
# hps <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
#
# sfc_patterns <-
# patterns %>%
# left_join(
# poi_ca %>%
# dplyr::select(
# safegraph_place_id,
# longitude,
# latitude,
# naics_code
# ),
# by = "safegraph_place_id"
# ) %>%
# filter(!is.na(longitude)) %>%
# mutate(
# long = longitude,
# lat = latitude
# ) %>%
# st_as_sf(coords = c("long","lat")) %>%
# st_set_crs(4326) %>%
# st_join(
# counties("CA", cb=F, progress_bar=F) %>%
# filter(NAME == filter_loc) %>%
# st_transform(4326) %>%
# dplyr::select(NAME),
# left=F
# )
#
# sfc_patterns_origins_daily <-
# process_patterns_origins_daily(sfc_patterns,hps)
#
# sfc_patterns_origins_daily <-
# sfc_patterns_origins_daily %>%
# left_join(
# sfc_patterns %>%
# as.data.frame() %>%
# dplyr::select(
# safegraph_place_id,
# location_name,
# street_address,
# naics_code
# )
# )
#
# saveRDS(sfc_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))
# }
Let’s look at total visits since SIP.
sfc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-16_origins_daily.rds"))
sfc_patterns_origins_daily_week2 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-23_origins_daily.rds"))
sfc_patterns_origins_daily_week3 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-03-30_origins_daily.rds"))
sfc_patterns_origins_daily_week4 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-06_origins_daily.rds"))
sfc_patterns_origins_daily_week5 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-13_origins_daily.rds"))
sfc_patterns_origins_daily_week6 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-20_origins_daily.rds"))
sfc_patterns_origins_daily_week7 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-04-27_origins_daily.rds"))
sfc_patterns_origins_daily_week8 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-05-04_origins_daily.rds"))
sfc_patterns_origins_daily_week9 <- readRDS(paste0(sg_path,"weekly-patterns/v2/sfc_patterns_2020-05-11_origins_daily.rds"))
# combine
sfc_patterns_origins_daily_since_SIP <- rbind(sfc_patterns_origins_daily_week1, sfc_patterns_origins_daily_week2,sfc_patterns_origins_daily_week3,sfc_patterns_origins_daily_week4,sfc_patterns_origins_daily_week5,sfc_patterns_origins_daily_week6,sfc_patterns_origins_daily_week7,sfc_patterns_origins_daily_week8,sfc_patterns_origins_daily_week9)
# summarize
sf_visits_since_SIP <- sfc_patterns_origins_daily_since_SIP %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip))
sf_visits_since_SIP_cases <- left_join(sf_visits_since_SIP, sf_cases_zip %>% filter(date == max(date))) %>%
mutate(visits_per_capita_high = visit_counts_high_zip / total_pop_zip, visits_per_capita_low = visit_counts_low_zip / total_pop_zip) %>% filter(!is.na(confirmed_cases))
# plot
fig_sf_cases_visits_since_SIP <- plot_ly(sf_visits_since_SIP_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_since_SIP_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits since SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits since SIP")
fig_sf_cases_visits_since_SIP
model_cases_visits_since_SIP <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_since_SIP_cases)
summary(model_cases_visits_since_SIP)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_since_SIP_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.210 -30.544 -6.309 40.793 160.271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.815e+01 2.958e+01 -0.614 0.546018
## visit_counts_high_zip 9.972e-05 2.319e-05 4.300 0.000317 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.74 on 21 degrees of freedom
## Multiple R-squared: 0.4682, Adjusted R-squared: 0.4429
## F-statistic: 18.49 on 1 and 21 DF, p-value: 0.0003174
# try with low just for comparison
fig_sf_cases_visits_since_SIP_low <- plot_ly(sf_visits_since_SIP_cases) %>%
add_trace(x = ~visit_counts_low_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_low_zip, y = fitted(lm(confirmed_cases~ visit_counts_low_zip, sf_visits_since_SIP_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits since SIP (low)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits since SIP")
fig_sf_cases_visits_since_SIP_low
model_cases_visits_since_SIP_low <- lm(confirmed_cases ~ visit_counts_low_zip, sf_visits_since_SIP_cases)
summary(model_cases_visits_since_SIP_low)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_low_zip, data = sf_visits_since_SIP_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.827 -29.393 -4.704 39.462 159.397
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.066e+01 2.990e+01 -0.691 0.497285
## visit_counts_low_zip 1.550e-04 3.579e-05 4.330 0.000295 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.51 on 21 degrees of freedom
## Multiple R-squared: 0.4717, Adjusted R-squared: 0.4466
## F-statistic: 18.75 on 1 and 21 DF, p-value: 0.0002949
Looks pretty similar to the week before SIP figure.
What about just the week after/around SIP? (3/16-3/22)
sf_visits_week1 <- sfc_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip))
sf_visits_week1_cases <- left_join(sf_visits_week1, sf_cases_zip %>% filter(date == max(date))) %>% filter(!is.na(confirmed_cases))
# plot
fig_sf_cases_visits_week1 <- plot_ly(sf_visits_week1_cases) %>%
add_trace(x = ~visit_counts_high_zip, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visit_counts_high_zip, y = fitted(lm(confirmed_cases~ visit_counts_high_zip, sf_visits_week1_cases)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits since SIP (high)'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week after SIP ( 3/16-3/22)")
fig_sf_cases_visits_week1
model_cases_visits_week1 <- lm(confirmed_cases ~ visit_counts_high_zip, sf_visits_week1_cases)
summary(model_cases_visits_week1)
##
## Call:
## lm(formula = confirmed_cases ~ visit_counts_high_zip, data = sf_visits_week1_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -121.581 -38.174 1.435 41.284 163.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.932e+01 2.989e+01 -0.646 0.524980
## visit_counts_high_zip 8.561e-04 1.996e-04 4.288 0.000326 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.82 on 21 degrees of freedom
## Multiple R-squared: 0.4669, Adjusted R-squared: 0.4415
## F-statistic: 18.39 on 1 and 21 DF, p-value: 0.0003261
Note that my “1 week post” actually starts 3/16 instead of 3/17, but that could be adjusted, I just kept it like this now since it was easier since that’s already how the data was bucketed. Could easily adjust this later though.
Thinking about whether it’s possible that we’re missing visits by just looking at origins/how much that is relevent, I map the zip codes and their movement.
# add average visits per capita
sf_sd_visits_week_pre <- sf_sd_visits_week_pre %>% mutate(avg_visits_per_capita = (visits_per_capita_high + visits_per_capita_low)/2)
sf_visits_since_SIP_cases <- sf_visits_since_SIP_cases %>% mutate(avg_visits_per_capita = (visits_per_capita_high + visits_per_capita_low)/2)
scc_visits_week_pre_cases <- scc_visits_week_pre_cases %>% mutate(avg_visits_per_capita = (visits_per_capita_high + visits_per_capita_low)/2)
pal_1 <-
colorNumeric(
palette = "Blues",
domain =
c(0,sf_sd_visits_week_pre %>%
pull(avg_visits_per_capita) %>%
unique())
)
pal_2 <-
colorNumeric(
palette = "Reds",
domain =
c(0,sf_sd_visits_week_pre %>%
pull(percent_leaving_home) %>%
unique())
)
pal_3 <-
colorNumeric(
palette = "Blues",
domain =
c(0,sf_visits_since_SIP_cases %>%
pull(avg_visits_per_capita) %>%
unique())
)
pal_4 <-
colorNumeric(
palette = "Reds",
domain =
c(0,sf_sd_cases_curr %>%
pull(percent_leaving_home) %>%
unique())
)
pal_5 <-
colorNumeric(
palette = "Blues",
domain =
c(0,sf_visits_since_SIP_cases %>%
pull(cases_by_pop) %>%
unique())
)
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sf_sd_visits_week_pre %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_1(avg_visits_per_capita),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Average visits per capita, 1 week pre SIP",
label = sf_sd_visits_week_pre$avg_visits_per_capita
) %>%
addPolygons(
data = sf_sd_visits_week_pre %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_2(percent_leaving_home),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Percent leaving home, 1 week pre SIP",
label = sf_sd_visits_week_pre$percent_leaving_home
) %>%
addPolygons(
data = sf_visits_since_SIP_cases %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_3(avg_visits_per_capita),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Average visits per capita since SIP",
label = sf_visits_since_SIP_cases$avg_visits_per_capita
) %>%
addPolygons(
data = sf_sd_cases_curr %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_4(percent_leaving_home),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Percent leaving home, SIP through 1 week ago",
label = sf_sd_cases_curr$percent_leaving_home
) %>%
addPolygons(
data = sf_visits_since_SIP_cases %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_5(cases_by_pop),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Cases per capita",
label = sf_visits_since_SIP_cases$cases_by_pop
) %>%
addLayersControl(
overlayGroups = c("Average visits per capita, 1 week pre SIP", "Percent leaving home, 1 week pre SIP", "Average visits per capita since SIP", "Percent leaving home, SIP through 1 week ago","Cases per capita")
)
saveCountyPatternsOrigin <- function(week, filter_loc, filter_loc_abr) {
patterns <-
readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
hps <-
readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
county_patterns <-
patterns %>%
left_join(
poi_ca %>%
dplyr::select(
safegraph_place_id,
longitude,
latitude,
naics_code
),
by = "safegraph_place_id"
) %>%
filter(!is.na(longitude)) %>%
mutate(
long = longitude,
lat = latitude
) %>%
st_as_sf(coords = c("long","lat")) %>%
st_set_crs(4326) %>%
st_join(
counties("CA", cb=F, progress_bar=F) %>%
filter(NAME == filter_loc) %>%
st_transform(4326) %>%
dplyr::select(NAME),
left=F
)
county_patterns_origins_daily <-
process_patterns_origins_daily(county_patterns,hps)
county_patterns_origins_daily <-
county_patterns_origins_daily %>%
left_join(
county_patterns %>%
as.data.frame() %>%
dplyr::select(
safegraph_place_id,
location_name,
street_address,
naics_code
)
)
saveRDS(county_patterns_origins_daily,paste0(sg_path,"weekly-patterns/v2/",filter_loc_abr,"_patterns_",week,"_origins_daily.rds"))
}
# saveCountyPatternsOrigin("2020-03-09", "Marin", "mac")
# saveCountyPatternsOrigin("2020-03-09", "San Mateo", "smc")
# saveCountyPatternsOrigin("2020-03-09", "Contra Costa", "ccc")
# saveCountyPatternsOrigin("2020-03-09", "Alameda", "alc")
# saveCountyPatternsOrigin("2020-03-16", "Santa Clara", "scc")
# saveCountyPatternsOrigin("2020-03-16", "Marin", "mac")
# saveCountyPatternsOrigin("2020-03-16", "San Mateo", "smc")
# saveCountyPatternsOrigin("2020-03-16", "Contra Costa", "ccc")
# saveCountyPatternsOrigin("2020-03-16", "Alameda", "alc")
Obtain any SF block group movement from the Marin, SMC, SCC, Contra Costa, and Alameda data, for the week before SIP.
marin_patterns_origins_daily_week_pre <- readRDS(paste0(sg_path,"weekly-patterns/v2/mac_patterns_2020-03-09_origins_daily.rds"))
smc_patterns_origins_daily_week_pre <- readRDS(paste0(sg_path,"weekly-patterns/v2/smc_patterns_2020-03-09_origins_daily.rds"))
scc_patterns_origins_daily_week_pre <- readRDS(paste0(sg_path,"weekly-patterns/v2/scc_patterns_2020-03-09_origins_daily.rds"))
ccc_patterns_origins_daily_week_pre <- readRDS(paste0(sg_path,"weekly-patterns/v2/ccc_patterns_2020-03-09_origins_daily.rds"))
alc_patterns_origins_daily_week_pre <- readRDS(paste0(sg_path,"weekly-patterns/v2/alc_patterns_2020-03-09_origins_daily.rds"))
# see how many of these results are in SF blockgroups
marin_origins_week_pre_in_sf <- marin_patterns_origins_daily_week_pre %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
smc_origins_week_pre_in_sf <- smc_patterns_origins_daily_week_pre %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
scc_origins_week_pre_in_sf <- scc_patterns_origins_daily_week_pre %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
ccc_origins_week_pre_in_sf <- ccc_patterns_origins_daily_week_pre %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
alc_origins_week_pre_in_sf <- alc_patterns_origins_daily_week_pre %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
kable(marin_origins_week_pre_in_sf %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 475.97292 |
94103 | 789.51439 |
94105 | 258.97058 |
94107 | 315.39244 |
94108 | 1604.53485 |
94109 | 5002.97164 |
94110 | 5710.96985 |
94111 | 67.43151 |
94112 | 3533.98184 |
94114 | 1526.43155 |
94115 | 2959.83404 |
94116 | 2679.65911 |
94117 | 1940.38967 |
94118 | 8369.04381 |
94121 | 3049.50423 |
94122 | 5178.30592 |
94123 | 3741.92898 |
94124 | 287.55985 |
94127 | 1323.31935 |
94131 | 1867.63030 |
94132 | 1086.23054 |
94133 | 1510.32035 |
94134 | 766.19675 |
94158 | 98.10206 |
kable(smc_origins_week_pre_in_sf %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 8822.9942 |
94103 | 9445.9243 |
94105 | 4743.6358 |
94107 | 10827.6865 |
94108 | 3274.3621 |
94109 | 13889.0510 |
94110 | 51052.2628 |
94111 | 767.8747 |
94112 | 147166.6894 |
94114 | 11950.1894 |
94115 | 13134.4186 |
94116 | 62232.3329 |
94117 | 17343.8670 |
94118 | 22733.6120 |
94121 | 27290.3088 |
94122 | 55861.6978 |
94123 | 7899.8366 |
94124 | 44383.8291 |
94127 | 16828.2167 |
94131 | 25830.8609 |
94132 | 50896.3940 |
94133 | 8440.6738 |
94134 | 74335.7403 |
94158 | 8250.9984 |
kable(scc_origins_week_pre_in_sf %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 681.6278 |
94103 | 1593.4132 |
94105 | 1399.6867 |
94107 | 2417.7694 |
94108 | 902.2337 |
94109 | 3250.7088 |
94110 | 5295.8956 |
94112 | 6500.4987 |
94114 | 1494.4945 |
94115 | 3631.1729 |
94116 | 5285.5545 |
94117 | 5251.1729 |
94118 | 4615.9181 |
94121 | 3848.4594 |
94122 | 6241.3229 |
94123 | 1245.2389 |
94124 | 4479.2548 |
94127 | 999.4394 |
94131 | 2819.7143 |
94132 | 4129.0364 |
94133 | 1792.9758 |
94134 | 3180.9474 |
94158 | 1779.9106 |
kable(ccc_origins_week_pre_in_sf %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 427.88525 |
94103 | 1839.44877 |
94104 | 15.27273 |
94105 | 496.92492 |
94107 | 1342.25672 |
94108 | 1244.02409 |
94109 | 2186.76344 |
94110 | 4305.36111 |
94111 | 76.44088 |
94112 | 5427.29499 |
94114 | 718.77727 |
94115 | 3125.59312 |
94116 | 1903.14038 |
94117 | 2048.10992 |
94118 | 2151.35352 |
94121 | 2057.74163 |
94122 | 2739.95306 |
94123 | 778.72194 |
94124 | 2900.84678 |
94127 | 746.78034 |
94131 | 1941.41929 |
94132 | 1413.69885 |
94133 | 1563.17602 |
94134 | 2959.91844 |
94158 | 683.09438 |
kable(alc_origins_week_pre_in_sf %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 3557.1429 |
94103 | 6661.3442 |
94104 | 264.6690 |
94105 | 3991.7308 |
94107 | 4663.0451 |
94108 | 3079.6958 |
94109 | 8576.2139 |
94110 | 13280.1975 |
94111 | 471.0046 |
94112 | 16708.3392 |
94114 | 2920.4335 |
94115 | 3034.3397 |
94116 | 5947.1702 |
94117 | 7028.2598 |
94118 | 7522.3348 |
94121 | 8761.4736 |
94122 | 10245.1295 |
94123 | 2888.1686 |
94124 | 7845.0807 |
94127 | 2320.1625 |
94131 | 4460.3709 |
94132 | 7488.0836 |
94133 | 3665.8908 |
94134 | 6392.1383 |
94158 | 2128.0052 |
Compare to the numbers we had before in SF.
sf_visits_week_pre <- sf_visits_week_pre %>% mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
kable(sf_visits_week_pre %>% dplyr::select(zipcode, visit_counts_avg))
zipcode | visit_counts_avg |
---|---|
94102 | 110456.1680 |
94103 | 97654.8866 |
94104 | 669.2848 |
94105 | 30200.8654 |
94107 | 87742.3934 |
94108 | 58180.4614 |
94109 | 182261.6564 |
94110 | 325142.3979 |
94111 | 6392.1666 |
94112 | 384496.7076 |
94114 | 105694.0045 |
94115 | 136504.0604 |
94116 | 214065.0066 |
94117 | 146914.1251 |
94118 | 222036.5605 |
94121 | 205688.9278 |
94122 | 280201.0580 |
94123 | 72071.9397 |
94124 | 197262.4305 |
94127 | 83415.7547 |
94131 | 137383.4522 |
94132 | 152203.8776 |
94133 | 109806.4387 |
94134 | 194210.6322 |
94158 | 53867.1903 |
Definitely smaller numbers, but not insignificant.
Combine with the SF data.
combined_sf_week_pre <- rbind(sf_visits_week_pre, marin_origins_week_pre_in_sf, smc_origins_week_pre_in_sf, scc_origins_week_pre_in_sf, ccc_origins_week_pre_in_sf, alc_origins_week_pre_in_sf) %>%
group_by(zipcode) %>%
summarize(total_visit_counts_high = sum(visit_counts_high_zip), total_visit_counts_low = sum(visit_counts_low_zip)) %>%
mutate(total_visit_counts_avg = (total_visit_counts_high + total_visit_counts_low)/2) %>%
left_join(sf_cases_zip_edit %>% filter(date == max(date)) %>% dplyr::select(zipcode, confirmed_cases, total_pop_zip, cases_by_pop)) %>%
mutate(total_visits_per_capita_avg = total_visit_counts_avg / total_pop_zip) %>% filter(!is.na(confirmed_cases))
Look at the differences in visits per capita when using the combined data and the original just SF one, again for the week before SIP.
dif_visits_week_pre <- left_join(combined_sf_week_pre, sf_visits_week_pre %>% dplyr::select(zipcode, visit_counts_avg)) %>%
mutate(visits_per_capita_avg = visit_counts_avg / total_pop_zip, dif_visits_per_capita = total_visits_per_capita_avg - visits_per_capita_avg) %>%
left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326)
mapview(dif_visits_week_pre %>% dplyr::select(dif_visits_per_capita))
Let’s compare these patterns to the week after SIP.
marin_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/mac_patterns_2020-03-16_origins_daily.rds"))
smc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/smc_patterns_2020-03-16_origins_daily.rds"))
scc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/scc_patterns_2020-03-16_origins_daily.rds"))
ccc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/ccc_patterns_2020-03-16_origins_daily.rds"))
alc_patterns_origins_daily_week1 <- readRDS(paste0(sg_path,"weekly-patterns/v2/alc_patterns_2020-03-16_origins_daily.rds"))
# see how many of these results are in SF blockgroups
marin_origins_week1_in_sf <- marin_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
smc_origins_week1_in_sf <- smc_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
scc_origins_week1_in_sf <- scc_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
ccc_origins_week1_in_sf <- ccc_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
alc_origins_week1_in_sf <- alc_patterns_origins_daily_week1 %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2)
# combine with SF
combined_sf_week1 <- rbind(sf_visits_week1 %>% mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2), marin_origins_week1_in_sf, smc_origins_week1_in_sf, scc_origins_week1_in_sf, ccc_origins_week1_in_sf, alc_origins_week1_in_sf) %>%
group_by(zipcode) %>%
summarize(total_visit_counts_high = sum(visit_counts_high_zip), total_visit_counts_low = sum(visit_counts_low_zip)) %>%
mutate(total_visit_counts_avg = (total_visit_counts_high + total_visit_counts_low)/2) %>%
left_join(sf_cases_zip_edit %>% filter(date == max(date)) %>% dplyr::select(zipcode, confirmed_cases, total_pop_zip, cases_by_pop)) %>%
mutate(total_visits_per_capita_avg = total_visit_counts_avg / total_pop_zip) %>% filter(!is.na(confirmed_cases))
# differences
dif_visits_week1 <- left_join(combined_sf_week1, sf_visits_week1 %>% mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2) %>% dplyr::select(zipcode, visit_counts_avg)) %>%
mutate(visits_per_capita_avg = visit_counts_avg / total_pop_zip, dif_visits_per_capita = total_visits_per_capita_avg - visits_per_capita_avg) %>%
left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326)
mapview(dif_visits_week1 %>% dplyr::select(dif_visits_per_capita))
It looks like the differences between block groups are similar (relatively) to the week before SIP.
Let’s try with these more accurate numbers.
Total visits and total cases
# plot
fig_sf_cases_visits_week_pre_combined <- plot_ly(combined_sf_week_pre) %>%
add_trace(x = ~total_visit_counts_avg, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~total_visit_counts_avg, y = fitted(lm(confirmed_cases~ total_visit_counts_avg, combined_sf_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits in week before SIP, combined data")
fig_sf_cases_visits_week_pre_combined
model_cases_visits_week_pre_combined <- lm(confirmed_cases ~ total_visit_counts_avg, combined_sf_week_pre)
summary(model_cases_visits_week_pre_combined)
##
## Call:
## lm(formula = confirmed_cases ~ total_visit_counts_avg, data = combined_sf_week_pre)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.28 -27.91 -10.03 37.92 186.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.4077470 28.9449346 -0.187 0.853588
## total_visit_counts_avg 0.0004902 0.0001236 3.967 0.000703 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 72.3 on 21 degrees of freedom
## Multiple R-squared: 0.4284, Adjusted R-squared: 0.4011
## F-statistic: 15.74 on 1 and 21 DF, p-value: 0.0007032
Cases per capita
fig_sfc_cases_pop_visits_week_pre_combined <- plot_ly(combined_sf_week_pre) %>%
add_trace(x = ~total_visit_counts_avg, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~total_visit_counts_avg, y = fitted(lm(cases_by_pop~ total_visit_counts_avg, combined_sf_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP, combined data")
fig_sfc_cases_pop_visits_week_pre_combined
sfc_model_cases_pop_visits_week_pre_combined <- lm(cases_by_pop ~ total_visit_counts_avg, combined_sf_week_pre)
summary(sfc_model_cases_pop_visits_week_pre_combined)
##
## Call:
## lm(formula = cases_by_pop ~ total_visit_counts_avg, data = combined_sf_week_pre)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0018096 -0.0012676 -0.0006227 0.0012212 0.0032183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.902e-03 6.944e-04 2.739 0.0123 *
## total_visit_counts_avg 2.293e-09 2.965e-09 0.773 0.4479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001735 on 21 degrees of freedom
## Multiple R-squared: 0.0277, Adjusted R-squared: -0.01861
## F-statistic: 0.5982 on 1 and 21 DF, p-value: 0.4479
Visits per capita
fig_sfc_cases_pop_visits_pop_week_pre_combined <- plot_ly(combined_sf_week_pre) %>%
add_trace(x = ~total_visits_per_capita_avg, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~total_visits_per_capita_avg, y = fitted(lm(cases_by_pop~ total_visits_per_capita_avg, combined_sf_week_pre)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits in week before SIP, combined data")
fig_sfc_cases_pop_visits_pop_week_pre_combined
sfc_model_cases_pop_visits_pop_week_pre <- lm(cases_by_pop ~ total_visits_per_capita_avg, combined_sf_week_pre)
summary(scc_model_cases_pop_visits_pop_week_pre)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita_high, data = scc_visits_week_pre_cases)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0011584 -0.0005801 -0.0001982 0.0002486 0.0027879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.919e-03 6.428e-04 2.985 0.00429 **
## visits_per_capita_high -6.928e-05 7.197e-05 -0.963 0.34005
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008882 on 53 degrees of freedom
## Multiple R-squared: 0.01719, Adjusted R-squared: -0.001356
## F-statistic: 0.9269 on 1 and 53 DF, p-value: 0.3401
Also do multiple regression with absolute cases, absolute visits, and population
cases_pop_visits_sfc_combined <- lm(confirmed_cases ~ total_visit_counts_avg + total_pop_zip, combined_sf_week_pre)
summary(cases_pop_visits_sfc_combined)
##
## Call:
## lm(formula = confirmed_cases ~ total_visit_counts_avg + total_pop_zip,
## data = combined_sf_week_pre)
##
## Residuals:
## Min 1Q Median 3Q Max
## -111.70 -48.29 -12.48 39.98 162.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.229e+01 3.609e+01 -0.895 0.381
## total_visit_counts_avg 6.244e-05 3.706e-04 0.168 0.868
## total_pop_zip 3.032e-03 2.480e-03 1.223 0.236
##
## Residual standard error: 71.47 on 20 degrees of freedom
## Multiple R-squared: 0.4681, Adjusted R-squared: 0.4149
## F-statistic: 8.801 on 2 and 20 DF, p-value: 0.001812
First just look at % leaving and visits again with the updates to visits to see if this changed anything.
# get % leaving in week before SIP
sf_sd_visits_week_pre_update <- left_join(sf_sd_zip_week_pre, combined_sf_week_pre) %>% filter(!is.na(percent_leaving_home) & !is.na(total_visits_per_capita_avg))
# plot
fig_sfc_visits_pop_leaving_week_pre_update <- plot_ly(sf_sd_visits_week_pre_update) %>%
add_trace(x = ~total_visits_per_capita_avg, y = ~percent_leaving_home, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~total_visits_per_capita_avg, y = fitted(lm(percent_leaving_home~ total_visits_per_capita_avg, sf_sd_visits_week_pre_update)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP'), yaxis = list(title = '% leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP, updated")
fig_sfc_visits_pop_leaving_week_pre_update
Didn’t alter the trend all that much.
Total devices leaving and total visits.
sf_sd_visits_week_pre_update <- sf_sd_visits_week_pre_update %>% mutate(total_leaving_home_zip = total_devices - total_at_home_zip)
fig_sfc_visits_leaving_week_pre_update <- plot_ly(sf_sd_visits_week_pre_update) %>%
add_trace(x = ~total_visit_counts_avg, y = ~total_leaving_home_zip, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~total_visit_counts_avg, y = fitted(lm(total_leaving_home_zip~ total_visit_counts_avg, sf_sd_visits_week_pre_update)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits in week before SIP'), yaxis = list(title = 'Number of devices leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP")
fig_sfc_visits_leaving_week_pre_update
Adding the cases to the first plot.
fig_sfc_visits_pop_leaving_week_pre_update_cases <- plot_ly(sf_sd_visits_week_pre_update) %>%
add_trace(x = ~total_visits_per_capita_avg, y = ~percent_leaving_home, type = 'scatter', mode = 'markers', color = ~cases_by_pop) %>%
add_trace(x = ~total_visits_per_capita_avg, y = fitted(lm(percent_leaving_home~ total_visits_per_capita_avg, sf_sd_visits_week_pre_update)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per capita in week before SIP'), yaxis = list(title = '% leaving home in week before SIP'), title = "SFC, leaving home and visits in week before SIP, updated")
fig_sfc_visits_pop_leaving_week_pre_update_cases
Same analysis done previously for % at home, now with visits.
# # set the cutoff level
# cutoff_cases_by_pop <- 0.0028
# # how many days after cutoff level to look
# later_date_sep = 7
# # how many days prior to look at movement averages
# prior_movement_time = 14
# # filter to just when cases are above the cutoff
# sf_cases_cutoff <- sf_cases_zip_edit %>% filter(cases_by_pop >= cutoff_cases_by_pop)
#
# zipcodes_in_cutoff <- unique(sf_cases_cutoff$zipcode)
#
later_cases_change2 <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
visits_tot <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
visits_by_pop <- vector(mode = "numeric", length = length(zipcodes_in_cutoff))
# get the visits data in a form where it is averaged by day for each zipcode
sf_visits_since_SIP_by_day <- sfc_patterns_origins_daily_since_SIP %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2) %>%
left_join(sf_visits_since_SIP_cases %>% dplyr::select(zipcode, total_pop_zip)) %>%
mutate(visits_per_capita_avg = visit_counts_avg / total_pop_zip)
for (i in 1:length(zipcodes_in_cutoff)) {
curr_zip <- zipcodes_in_cutoff[i]
curr_vals <- sf_cases_cutoff %>% filter(zipcode == curr_zip)
later_cases_curr = NA
visits_count_sum = NA
visits_per_cap = NA
if (curr_vals$date[1] != sf_cases_zip_edit$date[1]) { # need to make sure it didn't start above the cutoff
later_cases_curr = curr_vals$cases_by_pop[later_date_sep] - curr_vals$cases_by_pop[1] # change in cases an amount of time later
# get the percent of devices leaving home in an amount of time before the date that cases went above the threshold level
visits_computed = sf_visits_since_SIP_by_day %>%
filter(zipcode == curr_zip & date < curr_vals$date[1] & (date >= curr_vals$date[1] - prior_movement_time)) %>%
summarize(total_visits = sum(visit_counts_avg)) %>%
left_join(sf_visits_since_SIP_cases %>% dplyr::select(zipcode, total_pop_zip)) %>%
mutate(visits_per_capita = total_visits/total_pop_zip)
visits_count_sum = visits_computed$total_visits
visits_per_cap = visits_computed$visits_per_capita
}
later_cases_change2[i] = later_cases_curr
visits_tot[i] = visits_count_sum
visits_by_pop[i] = visits_per_cap
}
# combine
cases_growth_baseline_visits <- data.frame(zipcodes_in_cutoff, later_cases_change2, visits_tot, visits_by_pop) %>% filter(!is.na(later_cases_change))
# plot
fig_growth_visits <- plot_ly(cases_growth_baseline_visits) %>%
add_trace(x = ~visits_tot, y = ~later_cases_change2, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_tot, y = fitted(lm(cases_growth_baseline_visits$later_cases_change2~ cases_growth_baseline_visits$visits_tot)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'number of visits in 2 weeks before'), yaxis = list(title = 'Change in cases per capita 1 week later'), title = "SF, change in cases/person in week after >0.0028 cases/person vs visits in 2 weeks before")
fig_growth_visits
model_growth_visits <- lm(cases_growth_baseline_visits$later_cases_change2~ cases_growth_baseline_visits$visits_tot)
summary(model_growth_visits)
##
## Call:
## lm(formula = cases_growth_baseline_visits$later_cases_change2 ~
## cases_growth_baseline_visits$visits_tot)
##
## Residuals:
## 1 2 3 4 5 6 7
## 7.667e-05 -1.224e-04 -5.557e-05 4.982e-04 2.298e-04 -2.442e-04 -3.825e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.819e-04 3.044e-04 0.926 0.397
## cases_growth_baseline_visits$visits_tot 9.717e-10 1.014e-09 0.958 0.382
##
## Residual standard error: 0.0003259 on 5 degrees of freedom
## Multiple R-squared: 0.1551, Adjusted R-squared: -0.01391
## F-statistic: 0.9177 on 1 and 5 DF, p-value: 0.3821
Also plot with visits per capita.
fig_growth_visits_pop <- plot_ly(cases_growth_baseline_visits) %>%
add_trace(x = ~visits_by_pop, y = ~later_cases_change2, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_by_pop, y = fitted(lm(cases_growth_baseline_visits$later_cases_change2~ cases_growth_baseline_visits$visits_by_pop)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'number of visits per capita in 2 weeks before'), yaxis = list(title = 'Change in cases per capita 1 week later'), title = "SF, change in cases/person in week after >0.0028 cases/person vs visits/person in 2 weeks before")
fig_growth_visits_pop
model_growth_visits_pop <- lm(cases_growth_baseline_visits$later_cases_change2~ cases_growth_baseline_visits$visits_by_pop)
summary(model_growth_visits_pop)
##
## Call:
## lm(formula = cases_growth_baseline_visits$later_cases_change2 ~
## cases_growth_baseline_visits$visits_by_pop)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.0000301 -0.0002855 -0.0001264 0.0006412 0.0001137 -0.0002476 -0.0001255
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -0.0002843 0.0017013 -0.167
## cases_growth_baseline_visits$visits_by_pop 0.0001424 0.0002900 0.491
## Pr(>|t|)
## (Intercept) 0.874
## cases_growth_baseline_visits$visits_by_pop 0.644
##
## Residual standard error: 0.0003463 on 5 degrees of freedom
## Multiple R-squared: 0.046, Adjusted R-squared: -0.1448
## F-statistic: 0.2411 on 1 and 5 DF, p-value: 0.6442
Looking at the trends in visits over time in each zip code in SFC (noting the limitation that this only includes visits to other locations in SF with the current data)
fig_sf_visits_by_day <- sf_visits_since_SIP_by_day %>% plot_ly() %>% add_lines(x = ~date, y = ~visits_per_capita_avg, color = ~zipcode) %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = 'visits per capita'), title = "SFC visits per day per capita to SFC locations")
fig_sf_visits_by_day
Compare to case growth over time, like I have done previously for % leaving home.
# overall cases in SFC by date
sf_cases_by_date <- sf_cases_zip %>% group_by(date) %>%
summarize(total_cases = sum(confirmed_cases))
# also from LA times separately
ca_county_cases <-
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-county-totals.csv")
sf_county_cases <-
ca_county_cases %>%
filter(county == "San Francisco") %>%
filter(!is.na(confirmed_cases)) %>% left_join(sf_cases_by_date) %>%
mutate(dif_cases = confirmed_cases - total_cases)
# the zip code data has some holes and doesn't go back as far in time so will use the LA Times overall data
sf_county_cases <- sf_county_cases %>%
mutate(dcases = c(NA,diff(confirmed_cases)),
past_cases = c(NA, sf_county_cases$confirmed_cases[1:(nrow(sf_county_cases)-1)]),
cases_growth_daily = (dcases / past_cases)*100)
sf_visits_since_SIP_by_day_total_cases <- sf_visits_since_SIP_by_day %>%
group_by(date) %>%
summarize(total_visits = sum(visit_counts_avg)) %>%
left_join(sf_county_cases)
ay <- list(
tickfont = list(color = "red"),
overlaying = "y",
side = "right",
title = "case growth rate"
)
fig_sf_visits_by_day <- plot_ly(sf_visits_since_SIP_by_day_total_cases) %>% add_lines(x = ~date, y = ~total_visits, name = "total visits") %>% add_lines(x = ~date, y = ~cases_growth_daily, name = "case growth rate", yaxis = "y2") %>% layout(title = "Visits and Case Growth Rate for SFC", yaxis2 = ay, xaxis = list(title = "Date"))
fig_sf_visits_by_day
Visits in the week before and after SIP. This does include visits from SF block groups to other counties.
# get only visits relevant to SFC
sfc_patterns_origins_daily_week_pre_in_sf <- sfc_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
sfc_patterns_origins_daily_week1_in_sf <- sfc_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
marin_patterns_origins_daily_week1_in_sf <- marin_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
marin_patterns_origins_daily_week_pre_in_sf <- marin_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
smc_patterns_origins_daily_week1_in_sf <- smc_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
smc_patterns_origins_daily_week_pre_in_sf <- smc_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
scc_patterns_origins_daily_week1_in_sf <- scc_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
scc_patterns_origins_daily_week_pre_in_sf <- scc_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
ccc_patterns_origins_daily_week1_in_sf <- ccc_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
ccc_patterns_origins_daily_week_pre_in_sf <- ccc_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
alc_patterns_origins_daily_week1_in_sf <- alc_patterns_origins_daily_week1 %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
alc_patterns_origins_daily_week_pre_in_sf <- alc_patterns_origins_daily_week_pre %>%
filter(origin_census_block_group %in% sf_bg_zctas$blockgroup)
combined_sf_week_pre_1_by_date <- rbind(sfc_patterns_origins_daily_week_pre_in_sf, sfc_patterns_origins_daily_week1_in_sf, marin_patterns_origins_daily_week1_in_sf, marin_patterns_origins_daily_week_pre_in_sf, smc_patterns_origins_daily_week1_in_sf, smc_patterns_origins_daily_week_pre_in_sf, scc_patterns_origins_daily_week1_in_sf, scc_patterns_origins_daily_week_pre_in_sf, ccc_patterns_origins_daily_week1_in_sf, ccc_patterns_origins_daily_week_pre_in_sf, alc_patterns_origins_daily_week1_in_sf, alc_patterns_origins_daily_week_pre_in_sf) %>%
left_join(sf_bg_zctas, by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(visit_counts_high_zip = sum(visit_counts_high), visit_counts_low_zip = sum(visit_counts_low)) %>%
filter(!is.na(visit_counts_high_zip)) %>%
mutate(visit_counts_avg = (visit_counts_high_zip + visit_counts_low_zip)/2) %>%
left_join(sf_cases_zip_edit %>% filter(date == max(date)) %>% dplyr::select(zipcode, confirmed_cases, total_pop_zip, cases_by_pop)) %>%
mutate(total_visits_per_capita_avg = visit_counts_avg / total_pop_zip) %>% filter(!is.na(confirmed_cases))
fig_sf_visits_by_day_early <- combined_sf_week_pre_1_by_date %>% plot_ly() %>% add_lines(x = ~date, y = ~total_visits_per_capita_avg, color = ~zipcode) %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = 'visits per capita'), title = "SFC visits per day per capita")
fig_sf_visits_by_day_early
That’s funny. Though not too surprising.
Absolute number of visits:
fig_sf_visits_by_day_early_abs <- combined_sf_week_pre_1_by_date %>% plot_ly() %>% add_lines(x = ~date, y = ~visit_counts_avg, color = ~zipcode) %>%
layout(xaxis = list(title = 'Date'), yaxis = list(title = 'total visits'), title = "SFC visits per day")
fig_sf_visits_by_day_early_abs
Visits per person leaving home in this time frame. Combining the social distancing data with the visits data, looking over the whole time frame of the two weeks of visit data.
combined_sf_week_pre_1_total <- combined_sf_week_pre_1_by_date %>%
group_by(zipcode) %>%
summarize(total_visits = sum(visit_counts_avg), total_visits_per_capita = sum(total_visits_per_capita_avg))
sf_sd_week_pre_1 <- bay_sd %>%
filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
filter(date >= min(combined_sf_week_pre_1_by_date$date) & date <= max(combined_sf_week_pre_1_by_date$date)) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
mutate(
percent_at_home = total_at_home_zip*100/total_devices,
percent_leaving_home = (100 - percent_at_home)
)
sf_sd_visits_week_pre_1 <- left_join(sf_sd_week_pre_1, combined_sf_week_pre_1_total) %>%
left_join(sf_cases_zip %>% filter(date == max(date)) %>% dplyr::select(zipcode, total_pop_zip, confirmed_cases, cases_by_pop)) %>%
mutate(num_leaving_home = percent_leaving_home*total_pop_zip/100, visits_per_person_leaving = total_visits / num_leaving_home) %>%
filter(!is.na(confirmed_cases))
Graph cases and visits per person leaving.
fig_cases_visits_leaving_abs <- plot_ly(sf_sd_visits_week_pre_1 ) %>%
add_trace(x = ~visits_per_person_leaving, y = ~confirmed_cases, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_person_leaving, y = fitted(lm(confirmed_cases~ visits_per_person_leaving, sf_sd_visits_week_pre_1 )), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per person leaving home 3/9-3/22'), yaxis = list(title = 'Current cases'), title = "SFC, cases and visits per person leaving home")
fig_cases_visits_leaving_abs
sfc_model_fig_cases_visits_leaving_abs <- lm(confirmed_cases ~ visits_per_person_leaving, sf_sd_visits_week_pre_1 )
summary(sfc_model_fig_cases_visits_leaving_abs)
##
## Call:
## lm(formula = confirmed_cases ~ visits_per_person_leaving, data = sf_sd_visits_week_pre_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.79 -60.08 -13.12 39.64 283.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -44.340 86.116 -0.515 0.612
## visits_per_person_leaving 9.981 6.125 1.630 0.118
##
## Residual standard error: 90.1 on 21 degrees of freedom
## Multiple R-squared: 0.1123, Adjusted R-squared: 0.06998
## F-statistic: 2.655 on 1 and 21 DF, p-value: 0.1181
fig_cases_visits_leaving <- plot_ly(sf_sd_visits_week_pre_1 ) %>%
add_trace(x = ~visits_per_person_leaving, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_person_leaving, y = fitted(lm(cases_by_pop~ visits_per_person_leaving, sf_sd_visits_week_pre_1 )), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Number of visits per person leaving home 3/9-3/22'), yaxis = list(title = 'Current cases per capita'), title = "SFC, cases and visits per person leaving home")
fig_cases_visits_leaving
sfc_model_fig_cases_visits_leaving <- lm(cases_by_pop ~ visits_per_person_leaving, sf_sd_visits_week_pre_1 )
summary(sfc_model_fig_cases_visits_leaving)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_person_leaving, data = sf_sd_visits_week_pre_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0020099 -0.0011592 -0.0005651 0.0009938 0.0032518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0006089 0.0016351 0.372 0.713
## visits_per_person_leaving 0.0001277 0.0001163 1.098 0.285
##
## Residual standard error: 0.001711 on 21 degrees of freedom
## Multiple R-squared: 0.05427, Adjusted R-squared: 0.009233
## F-statistic: 1.205 on 1 and 21 DF, p-value: 0.2847
# model with % completely at home and visits per person leaving home
sfc_model_cases_visits_leaving_perc <- lm(cases_by_pop ~ visits_per_person_leaving + percent_at_home, sf_sd_visits_week_pre_1 )
summary(sfc_model_cases_visits_leaving_perc)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_person_leaving + percent_at_home,
## data = sf_sd_visits_week_pre_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0022632 -0.0010210 -0.0004345 0.0008065 0.0034636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.018e-03 4.788e-03 -1.884 0.0742 .
## visits_per_person_leaving 2.445e-07 1.233e-04 0.002 0.9984
## percent_at_home 3.149e-04 1.486e-04 2.119 0.0468 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001584 on 20 degrees of freedom
## Multiple R-squared: 0.2277, Adjusted R-squared: 0.1505
## F-statistic: 2.949 on 2 and 20 DF, p-value: 0.07546
I’m not sure if that is quite the right metric that I want yet, but there is something I’m trying to get at here that might be useful.