library(rmapshaper)
library(censusapi)
library(shapefiles)
library(base)
library(tidyverse)
library(tigris)
library(sf)
library(mapview)
library(raster)
library(rlang)
library(rgeos)
library(data.table)
library(measurements)
library(smoothr)
library(lwgeom)
library(units)
library(geosphere)
library(kableExtra)
library(leaflet)
library(htmltools)
library(plotly)
library(osrm)
library(tsviz)
library(sp)
mapviewOptions(
basemaps = "OpenStreetMap"
)
options(
tigris_class = "sf",
tigris_use_cache = TRUE,
osrm.server = "http://127.0.0.1:5000/",
osrm.profile = "walk"
)
Sys.setenv(CENSUS_KEY="b88495f8315f45b2d100dee9ba8f4c489a7371c2")
projection <- "+proj=utm +zone=10 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=ft +no_defs"
`%notin%` <- Negate(`%in%`)
Does the location of where one lives impact the ability to enjoy outdoor green space? Does the size of the yard at your house play a factor in whether you regularly visit parks? Are communities under-visiting parks that are physically closer to them? The goal of this analysis is to provide insight on park accessibility within communities in the City of San Jose. We will consider factors such as backyard size, park visit frequency, and distance from a park to determine a community’s park accessibility.
The first few sections describe our methodological approaches for preparing the factors for analysis, followed by sections detailing the comparisons between factors and the resulting conclusions.
The following describes how the Safegraph data is processed:
Safegraph collects device visit information to “places of interest” (POI) like parks or grocery stores and includes some information about a device’s origin census block group. The visit counts in the Safegraph dataset fall into either one of these three cases:
Note: We obtained all of the data for the park visits portion of the analysis from Safegraph Patterns Data.
Our goal for this section is to obtain the residential yard size per capita for each census block group in San Jose. We start with parcel data from the Santa Clara County Assessor’s Office, and filter down to only include parcels with residential land use codes. The following list details the land use codes we used for this analysis:
After filtering to parcels with these residential use codes, we spatially match the building footprint shape associated with that parcel. To get from parcel area to yard area, we subtract the building footprint shape out of the parcel shape.
#load("P:/SFBI/Data Library/OSM/osm_bldg.Rdata")
#
#osm_bldg <-
# osm_bldg %>% st_transform(projection)
sj_boundary <-
places("CA", cb=FALSE) %>%
filter(NAME == "San Jose") %>%
st_transform(projection)
#
sj_blockgroup <-
readRDS("sj_blockgroups.rds") %>%
st_transform(projection)
#mapview(sj_blockgroup)
#load("P:/SFBI/Data Library/California Blocks/ca_blocks_lite.Rdata")
#ca_blocks_lite <- ca_blocks_lite %>% st_transform(projection)
#sj_blocks <- ca_blocks_lite[sj_boundary,]
#save(sj_blocks, file = "sj_blocks.Rdata")
#load("sj_blocks.Rdata")
#mapview(sj_blocks[100,])
# sj_bldg <-
# osm_bldg[sj_boundary,] %>%
# transmute(
# index = row_number()
# )
#save(sj_bldg, file = "sj_bldg.Rdata")
load("sj_bldg.Rdata")
# scc_parcels <- st_read("P:/SFBI/Data Library/Parcels/santa clara/scc.shp")
#
# scc_parcels <-
# scc_parcels %>%
# st_transform(projection)
#
# This includes parcels parcels at the periphery of San Jose. This is important for the "block method" used to find street edges. otherwise this sf object isn't used.
# sj_parcels_plus <-
# scc_parcels[sj_blocks,] %>%
# dplyr::select(APN, USE_CODE, SITUS_CITY)
# #
# sj_parcels <-
# sj_parcels_plus %>%
# filter(SITUS_CITY == "SAN JOSE")
# #
# save(sj_parcels_plus, sj_parcels, file = "sj_parcels.Rdata")
load("sj_parcels.Rdata")
# apn_bg_lookup <- data.frame(APN = NA, origin_census_block_group = NA)
#
# for(i in 1:nrow(sj_blockgroup)){
# if(i %% 10 == 0){print(i)}
# temp <- sj_blockgroup[i,]
# temp2 <- sj_parcels[temp,] %>%
# dplyr::select(APN) %>%
# st_set_geometry(NULL) %>%
# mutate(
# origin_census_block_group = temp$origin_census_block_group
# )
# apn_bg_lookup <-
# apn_bg_lookup %>%
# rbind(temp2)
# }
#
# apn_bg_lookup <-
# apn_bg_lookup %>%
# na.omit()
#
# saveRDS(apn_bg_lookup,"sj_apn_bg_lookup.rds")
apn_bg_lookup <- readRDS("sj_apn_bg_lookup.rds")
sj_parcels_residential <-
sj_parcels %>%
mutate(
USE_CODE = as.numeric(USE_CODE)
) %>%
filter(USE_CODE %in% c(1,2,3,4,6,7))
#mapview(head(sj_parcels_residential,100))
matched_bldg_footprint <-
sj_bldg %>%
st_centroid() %>%
st_intersection(sj_parcels_residential) %>%
st_set_geometry(NULL) %>%
left_join(sj_bldg, by = "index") %>%
st_as_sf()
#
#
residential_parcels <-
sj_parcels_residential %>%
mutate(
parcel_area = st_area(.) %>% as.numeric()
) %>%
as.data.frame() %>%
st_as_sf() %>%
rename(parcel_geometry = geometry)
#
#
residential_bldg <-
residential_parcels %>%
as.data.frame() %>%
right_join(
matched_bldg_footprint %>% dplyr::select(APN),
by = "APN"
) %>%
filter(!is.na(parcel_area)) %>%
rename(building_geometry = geometry) %>%
st_as_sf() %>%
st_set_geometry("building_geometry")
#
#
residential_parcels <-
residential_bldg %>%
st_set_geometry("parcel_geometry")
#
residential_parcels <- residential_parcels[!duplicated(residential_parcels), ]
##############################################################
# available_yard <-
# st_difference(
# residential_parcels[1:10000,],
# st_union(
# residential_parcels$building_geometry[1:10000,])
# )
#
# available_yard2 <-
# st_difference(
# residential_parcels[10001:20000,],
# st_union(
# residential_parcels$building_geometry[10001:20000,])
# )
#
# available_yard3 <-
# st_difference(
# residential_parcels[20001:30000,],
# st_union(
# residential_parcels$building_geometry[20001:30000,])
# )
# #
# available_yard4 <-
# st_difference(
# residential_parcels[30001:40000,],
# st_union(
# residential_parcels$building_geometry[30001:40000,])
# )
#
# available_yard5 <-
# st_difference(
# residential_parcels[40001:50000,],
# st_union(
# residential_parcels$building_geometry[40001:50000,])
# )
#
# available_yard6 <-
# st_difference(
# residential_parcels[50001:60000,],
# st_union(
# residential_parcels$building_geometry[50001:60000,])
# )
#
# available_yard7 <-
# st_difference(
# residential_parcels[60001:70000,],
# st_union(
# residential_parcels$building_geometry[60001:70000,])
# )
#
# available_yard8 <-
# st_difference(
# residential_parcels[70001:80000,],
# st_union(
# residential_parcels$building_geometry[70001:80000,])
# )
#
# available_yard9 <-
# st_difference(
# residential_parcels[80001:90000,],
# st_union(
# residential_parcels$building_geometry[80001:90000,])
# )
#
# available_yard10 <-
# st_difference(
# residential_parcels[90001:100000,],
# st_union(
# residential_parcels$building_geometry[90001:100000,])
# )
#
# available_yard10 <-
# st_difference(
# residential_parcels[100001:nrow(residential_parcels),],
# st_union(
# residential_parcels$building_geometry[100001:nrow(residential_parcels),])
# )
# all_available_yard <-
# available_yard %>%
# rbind(available_yard2) %>%
# rbind(available_yard3) %>%
# rbind(available_yard4) %>%
# rbind(available_yard5) %>%
# rbind(available_yard6) %>%
# rbind(available_yard7) %>%
# rbind(available_yard8) %>%
# rbind(available_yard9) %>%
# rbind(available_yard10)
#save(all_available_yard, file = "sj_all_yard_available.Rdata")
load("sj_all_yard_available.Rdata")
all_available_yard_other <-
all_available_yard %>%
filter(USE_CODE == 2 | USE_CODE == 3)
all_available_yard_apartment <-
all_available_yard %>%
filter(USE_CODE == 4)
all_available_yard_other2 <-
all_available_yard %>%
filter(USE_CODE %in% c(6,7))
available_area_fun <- function(df, residential_parcels){
df %>%
as.data.frame() %>%
rename(geometry = parcel_geometry) %>%
left_join(residential_parcels %>% dplyr::select(APN), by = "APN") %>%
st_as_sf() %>%
st_set_geometry("building_geometry") %>%
as_Spatial() %>%
gBuffer(byid = T, width = 0) %>%
sp::disaggregate() %>%
st_as_sf() %>%
rename(available_area_geometry = geometry) %>%
st_set_geometry("available_area_geometry") %>%
mutate(
available_area = round(st_area(.) %>% as.numeric())
) %>%
st_set_crs(projection)
}
# save(available_area, file = "sj_available_backyard_area.Rdata")
#!!!!This .Rdata file constians all of the yard areas for parcels zoned as single-family residential!!!!
load("sj_available_backyard_area.Rdata")
# PUC 2,3
available_area1_other <- available_area_fun(all_available_yard_other, residential_parcels)
available_area1_other <-
available_area1_other %>%
dplyr::select(APN, available_area, available_area_geometry)
#PUC 6,7
available_area2_other <- available_area_fun(all_available_yard_other2, residential_parcels)
available_area2_other <-
available_area2_other %>%
dplyr::select(APN, available_area, available_area_geometry)
available_area_apartment <-
all_available_yard_apartment %>%
mutate(
bldg_area = st_area(building_geometry)
) %>%
group_by(APN, parcel_area) %>%
summarize(bldg_area_sum = sum(as.numeric(bldg_area))) %>%
mutate(available_area = parcel_area - bldg_area_sum)
available_area_apartment <-
available_area_apartment %>%
rename(available_area_geometry = building_geometry) %>%
dplyr::select(APN,available_area,available_area_geometry) %>%
mutate(
available_area = ifelse(available_area < 0,0,available_area)
)
#combining all PUC dataframes
available_area_combo <-
available_area %>%
rbind(available_area1_other) %>%
rbind(available_area2_other) %>%
rbind(available_area_apartment)
nix_list <- c("69403028","41425018","67013004","67013003","67029024","74207011")
nix_df <-
available_area %>%
filter(APN %in% nix_list) %>%
left_join(
all_available_yard %>%
st_set_geometry(NULL) %>%
dplyr::select(APN, USE_CODE) %>%
filter(APN %in% nix_list),
by = "APN"
)
available_area_bg <-
available_area_combo %>%
left_join(
apn_bg_lookup %>%
filter(APN %in% available_area$APN),
by = "APN"
) %>%
filter(APN %notin% nix_list)
#
bg_area_avg <-
available_area_bg %>%
st_set_geometry(NULL) %>%
group_by(origin_census_block_group) %>%
summarise(yard_area_avg = sum(available_area))
#
extra_bg <-
sj_blockgroup %>%
filter(origin_census_block_group %notin% bg_area_avg$origin_census_block_group) %>%
st_set_geometry(NULL) %>%
dplyr::select(origin_census_block_group) %>%
mutate(yard_area_avg = 0)
#
bg_area_avg_all <-
bg_area_avg %>%
rbind(extra_bg) %>%
left_join(
sj_blockgroup %>%
dplyr::select(-DISTRICTS),
by = "origin_census_block_group"
) %>%
na.omit() %>%
st_as_sf() %>%
st_transform(4326)
#save(bg_area_avg_all, file = "sj_bg_avg_yard_area.Rdata")
load("sj_bg_avg_yard_area.Rdata")
sj_population <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B01003)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
rename(
total_pop = "B01003_001E"
) %>%
dplyr::select(total_pop, origin_census_block_group)
sj_yard_pop_weight <-
bg_area_avg_all %>%
left_join(
sj_population %>%
filter(origin_census_block_group %in% bg_area_avg_all$origin_census_block_group),
by = "origin_census_block_group"
) %>%
mutate(
yard_per_capita = yard_area_avg / total_pop
)
The following map shows the void building footprint within the parcel shape for one block group. The resulting shape represents the yard area for each parcel.
example_bg_yard <-
available_area_bg %>%
filter(origin_census_block_group == "060855006002")
mapview(example_bg_yard)
In order to align our geographic granularity with the other factors of our analysis, we take the sum of all of the yard sizes within a block group. This value is then divided by the block group population to obtain the yard area per capita. The following map shows the yard area per capita sizes for all of the block groups in San Jose. Notice that the largest values occur near the left border of the city, near the hills.
bg_area_avg_minus <-
sj_yard_pop_weight
blue_pal2 <- colorNumeric(
palette = colorRamp(c("#FFFFFF", "#00288c"), interpolate="spline"),
domain =
bg_area_avg_minus %>%
pull(yard_per_capita) %>%
unique()
)
yard_bg_map_minus <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>% st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = bg_area_avg_minus,
fillColor = ~blue_pal2(yard_per_capita),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
#group = "Average Daily Park Visits",
label = ~paste0("Block Group Average Yard Size per Capita: ",round(yard_per_capita), " square feet"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)) %>%
addLegend(
position = "topright",
pal = blue_pal2,
values = bg_area_avg_minus$yard_per_capita,
title = "Avg. Yard Area per Capita (sqft.)",
opacity = 1
)
yard_bg_map_minus
Note: The data from the Santa Clara County assessor’s office likely includes multiple zoning misclassifications. For example, the following parcel has a Property Use Code of 1 (single-family residential), but it is actually a water treatment plant. This “yard” (that has an area of ~one million square feet) was inflating the yard size for the block group in which it is located.
mapview(nix_df[7,])
We did a visual inspection of the single-family residential parcels in the block groups with large average yard sizes (over 30,000 sqft.), and removed the parcels that appeared to be misclassified. Keep in mind that there still may be other misclassified parcels incorporated into the block group average yard size.
Some overall San Jose residential yard size statistics:
Mean San Jose Yard Area per Capita: 386 sq. ft.
Standard Deviation: 660 sq. ft.
Minimum San Jose Yard Size per Capita: 0 sq. ft.
Maximum San Jose Yard Size per Capita: 6300 sq. ft.
For now, I will put a hold on this analysis and move to the average number of people who visit San Jose parks that reside in these block groups. At the end, we will come back to these yard areas to compare against other accessibility factors.
When looking at park visitors who live in specific census block groups (CBGs), it is not sufficient to consider the CBG visit numbers at face value. The population of each block group plays an important factor in weighing a community’s visits to parks. For example, if a CBG has 3 residents, and they have an average of 3 daily park visits, then we can infer that this community has very active park visitors (even though the 3 park visits may seem low).
The following map displays all of the census block groups we considered in this analysis (see the Safegraph Data Collection Explanation section above for why some CBGs are missing). The “Park Visits” option illustrates the average daily number of visits that each CBG had to a SJ park since January 1, 2020. The “Average Monthly Park Visits per Person” option provides more insight into the monthly frequency that community members visits parks.
####periodically update from sanjose_park_process file
park_origins <- readRDS("sj_park_origins.rds") %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group) %>%
mutate(month = month(date %>% as.Date()))
# park_origins_monthly <-
# park_origins %>%
# group_by(origin_census_block_group,month) %>%
# summarise(month_total = sum(mean_origin_visits)) %>%
# filter(month != 8)
park_geometry <-
readRDS('park_output_geometry.rds') %>%
st_as_sf() %>%
st_transform(projection)
park_origin_mean <-
park_origins %>%
group_by(origin_census_block_group) %>%
summarise(mean_visits = mean(mean_origin_visits))
visit_pop_combo <-
sj_population %>%
filter(origin_census_block_group %in% park_origin_mean$origin_census_block_group) %>%
left_join(
park_origin_mean,
by = "origin_census_block_group"
) %>%
na.omit() %>%
mutate(
visits_per_pop = round((mean_visits/total_pop) * 30,2)
) %>%
left_join(
sj_blockgroup %>%
dplyr::select(-DISTRICTS),
by = "origin_census_block_group"
) %>%
st_as_sf() %>%
st_transform(projection)
orange_pal <- colorNumeric(
palette = colorRamp(c("#fcd786", "#a67100"), interpolate="spline"),
domain =
visit_pop_combo %>%
pull(mean_visits) %>%
unique()
)
teal_pal <- colorNumeric(
palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
domain =
visit_pop_combo %>%
pull(visits_per_pop) %>%
unique()
)
all_visitors_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>% st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = visit_pop_combo %>%
st_transform(4326),
fillColor = ~orange_pal(mean_visits),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Average Daily San Jose Park Visits",
label = ~paste0(round(mean_visits)," average daily park visits"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)) %>%
addPolygons(
data = visit_pop_combo %>%
st_transform(4326),
fillColor = ~teal_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Average Monthly Park Visits per Person",
label = ~paste0(visits_per_pop," average monthly park visits per person"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLayersControl(
baseGroups = c("Average Daily San Jose Park Visits", "Average Monthly Park Visits per Person"),
options = layersControlOptions(collapsed = FALSE)
)
all_visitors_map
Some interesting results to highlight:
The block group that has both the highest average daily park visits and highest average monthly park visits per person is shown in the following view. Interestingly, this CBG contains the Santa Clara County jail and police office. Approximately 60% of the “residents” of this block group visit a San Jose park everyday. But who are these visitors? Inmates are unlikely to have a cellular device. After discussing this point with Stanford’s RegLab, we developed the hypothesis that there may be certain devices (tablets or phones) that police and correction officers use during the workday, but leave at the office during the night (which becomes the device’s home).
all_visitors_map %>%
setView(-121.906408, 37.350420, zoom = 15)
An additional block group that stands out is one located near Emma Prusch Farm Park. This CBG contains mainly an on-ramp, a portion of Bayshore Freeway, and a storage center. In the “Average Daily Park Visits” view, this CBG does not seem to have any significance. But when you switch to the “Average Monthly Park Visits per Person” view, the CBG shows to have the second highest visits/population ratio in all of San Jose (although the daily park visit average is only 7, each person visits a park on average ~15 times per month). This may be an instance of a homeless encampment underneath or near the freeway on-ramp, whose residents frequent the nearby Emma Prusch Farm Park.
homeless <-
all_visitors_map %>%
showGroup("Average Park Visits per 1000") %>%
setView(-121.847157, 37.335857, zoom = 15)
homeless
What communities visit parks more or less frequently than others? We will now directly compare the CBGs with the lowest and highest monthly park visits per person since January 1, 2020.
We considered a low monthly park visit per person value to be less than 1, and a high value to be more than 3 (one SD less/more than the mean, respectively).
lowest_visitors <-
visit_pop_combo %>%
filter(visits_per_pop < mean(visit_pop_combo$visits_per_pop)-sd(visit_pop_combo$visits_per_pop)) %>%
st_as_sf() %>%
st_transform(projection)
purple_pal <- colorNumeric(
palette = colorRamp(c("#b27ef2", "#4900a3"), interpolate="spline"),
domain =
lowest_visitors %>%
pull(visits_per_pop) %>%
unique()
)
highest_visitors <-
visit_pop_combo %>%
filter(visits_per_pop > mean(visit_pop_combo$visits_per_pop)+sd(visit_pop_combo$visits_per_pop)) %>%
st_as_sf() %>%
st_transform(projection)
red_pal <- colorNumeric(
palette = colorRamp(c("#ff8585", "#9c0000"), interpolate="spline"),
domain =
highest_visitors %>%
pull(visits_per_pop) %>%
unique()
)
extremes_visitors_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = lowest_visitors %>%
st_transform(4326),
fillColor = ~purple_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "CBGs with Lowest Average Monthly Park Visits per Person",
label = ~paste0(visits_per_pop," average monthly park visits per person"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addPolygons(
data = highest_visitors %>%
st_transform(4326),
fillColor = ~red_pal(visits_per_pop),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "CBGs with Highest Average Monthly Park Visits per Person",
label = ~paste0(visits_per_pop," average monthly park visits per person"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLayersControl(
overlayGroups = c("CBGs with Lowest Average Monthly Park Visits per Person", "CBGs with Highest Average Monthly Park Visits per Person"),
options = layersControlOptions(collapsed = FALSE)
)
extremes_visitors_map
#mapview(lowest_visitors, zcol = "visits_per_pop")
In the geographic sense, there appears to be no specific CBG pattern that would allow for predictive classification of a low or high visit/population ratio. Later, we will introduce some key demographic variables into the analysis to determine if there are other predictors for monthly park visits per person.
Our main goal for this section is to find the distance and walking duration of the nearest park from each San Jose census block group (CBG). We first start by finding the nearest point on a park boundary to the centroid of each CBG. The following map provides a visual representation of this process. The red shape is the census block group, the green shape is a park, and the purple line represents the shortest distance from the center of the block group to the edge of the park.
sj_bg_coords <-
sj_blockgroup %>%
st_centroid()
temp_nearest <- st_nearest_points(sj_bg_coords$geometry[3],park_geometry$SHAPE[1])
mapview(temp_nearest) + mapview(sj_blockgroup[3,], col.regions = "red") + mapview(park_geometry[1,], col.regions = "green")
This distance is “as the crow flies”, or straight-line distance, which is not an accurate representation of the actual route one would take in a urban environment. Taking into account the complex nature of streets, we use OSRM routing to calculate walking distances and durations between census block groups and the nearest point on a park boundary. The following map illustrates the more refined route that results from OSRM:
#
# nearest_points <- data.frame(origin_census_block_group = NA, geometry = NA, park_name = NA, nearest_point = NA)
#
# for(i in 1:nrow(sj_bg_coords)){
# for(j in 1:nrow(park_geometry)){
# temp_df <- data.frame(
# origin_census_block_group = sj_bg_coords$origin_census_block_group[i],
# geometry = sj_bg_coords$geometry[i],
# park_name = park_geometry$location_name[j]
# )
#
# temp_df <-
# temp_df %>%
# mutate(
# nearest_point = ifelse(length(st_intersection(st_nearest_points(sj_bg_coords$geometry[i],park_geometry$SHAPE[j]),park_geometry$SHAPE[j])) == 0, NA, st_intersection(st_nearest_points(sj_bg_coords$geometry[i],park_geometry$SHAPE[j]),park_geometry$SHAPE[j]))
# )
#
# nearest_points <-
# nearest_points %>%
# rbind(temp_df)
# }
#
# print(i)
#
# }
#
# nearest_points <-
# nearest_points %>%
# na.omit()
#
#
# save(nearest_points, file = "sj_park_bg_nearest_points.Rdata")
load("sj_park_bg_nearest_points.Rdata")
#
#
#
# cbg_names <-
# nearest_points %>%
# dplyr::select(origin_census_block_group)
#
# park_names <-
# nearest_points %>%
# dplyr::select(park_name)
#
# nearest_points_cbg <-
# nearest_points %>%
# st_as_sf() %>%
# st_set_crs(projection) %>%
# st_transform(4326) %>%
# rename(
# block_group_centroid = geometry
# ) %>%
# dplyr::select(origin_census_block_group,block_group_centroid) %>%
# st_coordinates() %>%
# cbind(cbg_names) %>%
# rename(
# "latcbg" = Y,
# "loncbg" = X
# )
#
# nearest_points_park <-
# nearest_points %>%
# st_set_geometry("nearest_point") %>%
# st_as_sf() %>%
# st_set_crs(projection) %>%
# st_transform(4326) %>%
# st_centroid() %>%
# dplyr::select(park_name,nearest_point) %>%
# st_coordinates() %>%
# cbind(park_names) %>%
# rename(
# "latpark" = Y,
# "lonpark" = X
# )
#
#
# nearest_points_all <-
# nearest_points_cbg %>%
# cbind(nearest_points_park) %>%
# na.omit()
#
#
# dist_park_osrm <-
# 1:nrow(nearest_points_all) %>%
# map_dfr(function(i){
# if(i%%50 == 0){print(i)}
# osrmRoute(
# src = nearest_points_all[i, c("origin_census_block_group", "loncbg","latcbg")],
# dst = nearest_points_all[i, c("park_name", "lonpark","latpark")],
# returnclass="sf"
# ) %>%
# as.data.frame()
# }) %>%
# st_as_sf() %>%
# st_set_crs(4326)
#
# save(dist_park_osrm, file = "sj_osrm_park_dist.Rdata")
load("sj_osrm_park_dist.Rdata")
mapview(dist_park_osrm[1,]) + mapview(sj_blockgroup[3,], col.regions = "red") + mapview(park_geometry[1,], col.regions = "green")
The OSRM method assumes a fixed walking speed of 5 km/hr, or about 20 minutes per mile. We used this method to find the nearest park to each census block group. The following map displays all of the CBGs in San Jose, color coded by the distance to the nearest park.
min_dist_park_osrm <-
dist_park_osrm %>%
as.data.frame() %>%
group_by(src) %>%
summarize(
rank = which.min(distance),
distance = distance[rank],
duration = duration[rank],
park_name = dst[rank]) %>%
rename(
origin_census_block_group = src
) %>%
dplyr::select(-rank) %>%
left_join(
sj_blockgroup,
by = "origin_census_block_group"
) %>%
st_as_sf() %>%
mutate(distance = distance / 1.60934)
green_pal <- colorNumeric(
palette = colorRamp(c("#95f599", "#007804"), interpolate="spline"),
domain =
min_dist_park_osrm %>%
pull(distance) %>%
unique()
)
min_dist_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = min_dist_park_osrm %>%
st_transform(4326),
fillColor = ~green_pal(distance),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
#group = "CBGs with Lowest Park Visit Ratio",
label = ~paste0("The closest park is ",park_name,", which is ",round(distance,2), " miles away."),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLegend(
position = "topright",
pal = green_pal,
values = min_dist_park_osrm$distance,
title = "Distance to Nearest Park (mi)",
opacity = 1
)
min_dist_map
#all_isochrones <- data.frame(geometry=NA,location_name=NA)[numeric(0),]
#test <- park_geometry %>% filter(!location_name %in% all_isochrones$location_name)
#I had to change the osrmIsochrone res to 15 for Lake Cunningham Park and Municipal Rifle Range
# for(i in 1:nrow(test)){ #140:nrow(park_geometry)
# print(i)
#
# boundary_coords <- test[i,] %>% st_coordinates()
#
# isochrone_hold <- data.frame(id=NA,min=NA,max=NA,geometry=NA,center=5)[numeric(0),]
#
# for(j in 1:nrow(boundary_coords)){
# print(nrow(boundary_coords))
# print(j)
# pt <- boundary_coords[j,1:2] %>%
# st_point %>%
# st_sfc() %>%
# st_sf() %>%
# st_set_crs(projection) %>%
# st_transform(4326)
#
# pt_isochrone <-
# pt %>%
# osrmIsochrone(
# returnclass="sf",
# breaks = 10,
# res = 15
# )
#
# isochrone_hold <-
# isochrone_hold %>%
# rbind(pt_isochrone)
#
# }
#
# isochrone_union <-
# isochrone_hold %>%
# st_union() %>%
# st_as_sf() %>%
# rename("geometry" = "x") %>%
# mutate(location_name = test$location_name[i])
#
# all_isochrones <-
# all_isochrones %>%
# rbind(isochrone_union)
#
# }
#save(all_isochrones, file = "sj_10min_park_isochrone.Rdata")
load("sj_10min_park_isochrone.Rdata")
A goal in parks and recreation departments across the nation is having a park within a 10-minute walk of all communities. By using OSRM Routing, we created isochrones that represent a 10 minute walk from any point on the boundary of San Jose Parks. In the following map, the “10 Minute Walking Distance from SJ Park Boundaries” option displays San Jose parks along with their 10 minute walking isochrones. The “CBGs within a 10 min. walk of a park” shows as the title describes.
walk_10min <-
dist_park_osrm %>%
as.data.frame() %>%
st_as_sf() %>%
st_drop_geometry() %>%
filter(duration <= 10) %>%
rename(
origin_census_block_group = src,
park_name = dst
) %>%
group_by(origin_census_block_group) %>%
summarise(duration = mean(duration)) %>%
left_join(
sj_blockgroup,
by = "origin_census_block_group"
) %>%
st_as_sf()
green_pal2 <- colorNumeric(
palette = colorRamp(c("#95f599", "#007804"), interpolate="spline"),
domain =
walk_10min %>%
pull(duration) %>%
unique()
)
min_10duration_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = walk_10min %>%
st_transform(4326),
fillColor = ~green_pal2(duration),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "CBGs within a 10 min. walk of a park",
label = ~paste0("Walking time to the nearest park: ",round(duration)," mintues"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addPolygons(
data = all_isochrones %>%
st_transform(4326),
fillColor = "#93c400",
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "10 Minute Walking Distance from SJ Park Boundaries",
label = "10 Minute Walk Isochrone",
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
))%>%
addPolygons(
data = park_geometry %>%
st_transform(4326),
fillColor = "rgb(112,112,112)",
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "10 Minute Walking Distance from SJ Park Boundaries",
label = ~paste0(location_name),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
))%>%
addLayersControl(
baseGroups = c("10 Minute Walking Distance from SJ Park Boundaries", "CBGs within a 10 min. walk of a park"),
options = layersControlOptions(collapsed = FALSE)
)
min_10duration_map
mass_isochrone <-
all_isochrones %>%
st_union() %>%
st_as_sf() %>%
st_transform(4326)
original_area <- st_area(sj_boundary %>% st_transform(4326))
difference <- st_difference(sj_boundary %>% st_transform(4326), mass_isochrone)
new_area <- st_area(difference)
park_range_coverage <- 1-(as.numeric(new_area) / as.numeric(original_area))
In San Jose, 38% of the land is within a 10 minute walking distance of a park, and 62% of the land is outside of this range. The next map illustrates which areas of San Jose fall inside/outside of the 10 minute walk zone.
inside_outside <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = mass_isochrone %>%
st_transform(4326),
fillColor = "green",
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Inside the 10 Minute Walking Zone",
label = ~paste0("Within a 10 minute walk to a park."),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addPolygons(
data = difference %>%
st_transform(4326),
fillColor = "blue",
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
group = "Outside the 10 Minute Walking Zone",
label = ~paste0("Not Within a 10 minute walk to a park."),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLayersControl(
baseGroups = c("Inside the 10 Minute Walking Zone", "Outside the 10 Minute Walking Zone"),
options = layersControlOptions(collapsed = FALSE)
)
inside_outside
How does the OSRM Routing distance calculation method compare to the “as the crow flies” straight-line method? To illustrate the difference, we will use the distances between San Jose CBGs and their nearest park (displayed in the maps in the previous section). In almost every case, the OSRM distance is larger than the straight line distance, so the the numbers in the map legend result from: OSRM - as the crow flies. Note that the delta is greater in the CBGs on the periphery of San Jose, which frequently have a longer distance to the nearest park.
#Comparison: As the crow flies and OSRM
# matched_points <- data.frame(origin_census_block_group = NA, geometry = NA, park_name = NA, nearest_point = NA)
#
# nearest_points <-
# nearest_points %>%
# as.data.frame()
#
# for(i in 1:nrow(nearest_points)){
# if(i%%500 == 0){print(i)}
# for(j in 1:nrow(min_dist_park_osrm)){
# if(nearest_points$origin_census_block_group[i] == min_dist_park_osrm$origin_census_block_group[j] & nearest_points$park_name[i] == min_dist_park_osrm$park_name[j]){
# matched_points <-
# matched_points %>%
# rbind(nearest_points[i,])
# }
# }
#
# }
#
#
# nearest_points_dist <-
# matched_points %>%
# na.omit() %>%
# st_set_geometry("geometry") %>%
# st_as_sf() %>%
# st_set_crs(projection) %>%
# st_transform(4326) %>%
# st_set_geometry("nearest_point") %>%
# st_as_sf() %>%
# st_set_crs(projection) %>%
# st_transform(4326) %>%
# mutate(
# crow_dist = 0
# )
#
# for(i in 1:nrow(nearest_points_dist)){
# nearest_points_dist$crow_dist[i] = as.numeric(st_length(st_nearest_points(nearest_points_dist$geometry[i],nearest_points_dist$nearest_point[i]))) / 1609.34
# }
#
# nearest_points_dist_mi <-
# nearest_points_dist %>%
# mutate(
# crow_dist = as.numeric(crow_dist / 1609.34) #convert to miles
# )
#
# save(nearest_points_dist, file = "sj_parks_crow_dist.Rdata")
load("sj_parks_crow_dist.Rdata")
osrm_crow <-
nearest_points_dist %>%
left_join(
min_dist_park_osrm %>%
st_set_geometry(NULL) %>%
dplyr::select(origin_census_block_group,distance),
by = "origin_census_block_group"
) %>%
mutate(dist_diff = distance - crow_dist) %>%
st_set_geometry(NULL) %>%
st_set_geometry("geometry") %>%
st_set_geometry(NULL) %>%
left_join(
sj_blockgroup,
by = "origin_census_block_group"
)
teal_pal2 <- colorNumeric(
palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
domain =
osrm_crow %>%
pull(dist_diff) %>%
unique()
)
osrm_crow_map <-
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = sj_boundary %>%
st_transform(4326),
fill = F,
color = "gray",
weight = 1,
label = ~NAME
) %>%
addPolygons(
data = osrm_crow %>%
st_as_sf() %>%
st_transform(4326),
fillColor = ~teal_pal2(dist_diff),
color = "black",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.75,
label = ~paste0("Delta between OSRM and 'as the crow flies': ",round(dist_diff,2)," miles"),
highlightOptions =
highlightOptions(
weight = 2.25,
opacity = 1
)
) %>%
addLegend(
position = "topright",
pal = teal_pal2,
values = osrm_crow$dist_diff,
title = "Delta between OSRM and 'as the crow flies' distances",
opacity = 1
)
osrm_crow_map
To illustrate the difference further, the following scatter plot directly compares the distance values for each block group. The solid black line represents a 1:1 relationship, which the data would follow if the OSRM/as the crow flies values were identical. The points deviate away from the 1:1 line as the distances become greater, indicating that the OSRM distances are a more accurate representation of actual travel for parks that are further away from a CBG.
dist_scatter <-
osrm_crow %>%
ggplot(
aes(
x = crow_dist,
y = distance
)
) +
geom_abline(
mapping = NULL,
data = NULL,
1,
0,
na.rm = FALSE,
show.legend = TRUE
) +
geom_point() +
labs(
x = "As the Crow Flies Distance (mi.)",
y = "OSRM Routing Distance (mi.)",
title = "OSRM Routing and `As the Crow Flies` Distance Comparisons (CBGs to Nearest SJ Park)"
)
dist_scatter
#
#
# park_10min_pop <-
# all_isochrones %>%
# mutate(
# pop_10min_park = NA
# )
#
# iso_fraction_hold <- data.frame(origin_census_block_group = NA, fraction_iso = NA, park_name = NA)[numeric(0),]
#
# for(park in 1:nrow(all_isochrones)){
# print(park)
#
# sj_bg_iso <-
# sj_blockgroup %>%
# dplyr::select(-DISTRICTS) %>%
# st_transform(4326) %>%
# mutate(
# original_area = as.numeric(st_area(geometry)),
# iso_area = as.numeric(st_area(all_isochrones$geometry[park])),
# new_geom = NA,
# new_geom_area = NA
# )
# for(i in 1:nrow(sj_bg_iso)){
# #if(i%%10 == 0){print(i)}
# sj_bg_iso$new_geom[i] <- st_intersection(sj_bg_iso$geometry[i],all_isochrones$geometry[park]) %>% st_as_sf()
#
#
# if(length(st_intersection(sj_bg_iso$geometry[i],all_isochrones$geometry[park])) != 0){
# sj_bg_iso$new_geom_area[i] <- as.numeric(st_area(st_intersection(sj_bg_iso$geometry[i],all_isochrones$geometry[park])))
# }else{
# sj_bg_iso$new_geom_area[i] <- 0
# }
# }
#
# sj_bg_iso_fraction <-
# sj_bg_iso %>%
# st_set_geometry(NULL) %>%
# mutate(
# iso_coverage_fraction = round(new_geom_area / original_area,2),
# fraction_iso = round(new_geom_area / iso_area,2)
# ) %>%
# dplyr::select(origin_census_block_group,iso_coverage_fraction, fraction_iso)
#
# fraction_hold <-
# sj_bg_iso_fraction %>%
# dplyr::select(-iso_coverage_fraction) %>%
# mutate(park_name = all_isochrones$location_name[park])
#
# iso_fraction_hold <-
# iso_fraction_hold %>%
# rbind(fraction_hold)
#
#
# sj_iso_pop <-
# sj_bg_iso_fraction %>%
# left_join(
# sj_population,
# by = "origin_census_block_group"
# ) %>%
# mutate(
# pop_10min_park = total_pop * iso_coverage_fraction
# )
#
# park_10min_pop$pop_10min_park[park] <- sum(sj_iso_pop$pop_10min_park)
# }
#
#
#save(park_10min_pop,iso_fraction_hold,file="sj_pop_10min_park.Rdata")
load("sj_pop_10min_park.Rdata")
# sj_income <-
# getCensus(
# name = "acs/acs5",
# vintage = 2018,
# region = "block group:*",
# regionin = "state:06+county:085",
# vars = c("group(B19001)")
# ) %>%
# mutate(
# origin_census_block_group =
# paste0(state,county,tract,block_group)
# ) %>%
# dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
# rename(
# "Total" = "B19001_001E"
# ) %>%
# mutate(
# percent_over_125k = round((B19001_015E+B19001_016E+B19001_017E) / Total * 100,2)
# ) %>%
# dplyr::select(origin_census_block_group,percent_over_125k) %>%
# filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
#
# sj_race <-
# getCensus(
# name = "acs/acs5",
# vintage = 2018,
# region = "block group:*",
# regionin = "state:06+county:085",
# vars = c("group(B02001)")
# ) %>%
# mutate(
# origin_census_block_group =
# paste0(state,county,tract,block_group)
# ) %>%
# dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
# mutate(
# percent_white = round((B02001_002E / B02001_001E) *100,2)
# ) %>%
# dplyr::select(origin_census_block_group,percent_white) %>%
# filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
acs_vars <- readRDS("C:/Users/sadegh/Documents/GitHub/covid19/Simone_Speizer/censusData2018_acs_acs5.rds")
sj_employment <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B08008)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
dplyr::select(B08008_001E,origin_census_block_group) %>%
rename("total_employ" = "B08008_001E") %>%
left_join(
sj_population,
by = "origin_census_block_group"
) %>%
mutate(
`% employeed` = total_employ / total_pop * 100
) %>%
dplyr::select(origin_census_block_group,`% employeed`) %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
sj_household_child <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B11005)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
dplyr::select(B11005_001E,B11005_002E,origin_census_block_group) %>%
rename("total_households" = "B11005_001E", "children_households" = "B11005_002E") %>%
mutate(
`% household with children` = children_households / total_households * 100
) %>%
dplyr::select(origin_census_block_group,`% household with children`) %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
sj_occupants_room <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B25014)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
gather(key = "variable", value = "estimate", -origin_census_block_group) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
dplyr::select(-variable) %>%
separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>%
filter(!is.na(`occupants per room`)) %>%
group_by(origin_census_block_group, `occupants per room`) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = `occupants per room`, value = estimate_tot) %>%
mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) %>%
group_by(origin_census_block_group) %>%
summarize(total_nums = sum(total_nums), total_0.5less = sum(`0.50 or less occupants per room`), total_0.5to1 = sum(`0.51 to 1.00 occupants per room`), total_1to1.5 = sum(`1.01 to 1.50 occupants per room`), total_1.5to2 = sum(`1.51 to 2.00 occupants per room`), total_2more = sum(`2.01 or more occupants per room`)) %>%
mutate(`percent more than 1 occupant` = (total_1to1.5 + total_1.5to2 + total_2more) * 100/ total_nums, `percent less than 1 occupant` = (100-`percent more than 1 occupant`), `percent 0.5 or less occupants` = total_0.5less*100/total_nums, `percent more than 0.5 occupants` = (100-`percent 0.5 or less occupants`), `percent more than 2 occupants` = total_2more*100/total_nums) %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
sj_occupants_room <- sj_occupants_room[,c(1,8:12)]
sj_renter_homeowner <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B25003)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
gather(key = "variable", value = "estimate", -origin_census_block_group) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
dplyr::select(-variable) %>%
separate(label, into = c(NA,NA,"type"), sep = "!!") %>%
filter(!is.na(type)) %>%
group_by(origin_census_block_group, type) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = type, value = estimate_tot) %>%
mutate(total_nums = `Owner occupied` + `Renter occupied`) %>%
mutate(
`% Owner Occupied` = `Owner occupied` / total_nums * 100,
`% Renter Occupied` = `Renter occupied` / total_nums * 100
) %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
sj_renter_homeowner <- sj_renter_homeowner[,c(1,5:6)]
sj_school_enrollment <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = c("group(B14007)")
) %>%
mutate(
origin_census_block_group =
paste0(state,county,tract,block_group)
) %>%
dplyr::select(-c(GEO_ID,state,NAME,contains("EA"),contains("MA"),contains("M"),county,tract,block_group)) %>%
dplyr::select(origin_census_block_group,B14007_001E,B14007_004E,B14007_010E,B14007_013E,B14007_014E,B14007_015E,B14007_016E) %>%
mutate(
total = B14007_001E,
`% kindergarten` = B14007_004E / B14007_001E *100,
`% 6th grade` = B14007_010E / B14007_001E * 100,
`% high school` = (B14007_013E+B14007_014E+B14007_015E+B14007_016E) / B14007_001E *100
) %>%
dplyr::select(origin_census_block_group, `% kindergarten`, `% 6th grade`, `% high school`) %>%
filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)
sj_dem_data_initial <- readRDS("Simone_Speizer/sj_socialdistancing_demdata_prepostdifs_manyvars.rds")
sj_dem_data <-
sj_dem_data_initial %>%
left_join(sj_employment, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_household_child, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_occupants_room, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_renter_homeowner, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_school_enrollment, by = c("blockgroup" = "origin_census_block_group")) %>%
rename("% less than 1 year" = "percent less than 1")
In our analysis, we quantify the correlations between minimum distance to a park, yard area per capita, park visits per 1000 people, and other demographic variable percentages (including income, age, language ability, race/ethnicity, education enrollment, vehicle ownership, employment status, occupants per bedroom, households with children, vehicle ownership, and renter/owner occupied housing units). We choose to analyze these demographic variables based on our literature review of existing research on correlations with social distancing and our hypotheses of potentially relevant factors in San Jose and the Bay Area. Information on these demographic variables at the census block group level is from the American Community Survey 2014-2018 5 year summary data.
yard_visits_tied1 <-
visit_pop_combo %>%
left_join(
sj_yard_pop_weight %>%
st_set_geometry(NULL) %>%
dplyr::select(yard_per_capita, origin_census_block_group),
by = "origin_census_block_group"
)%>%
left_join(
min_dist_park_osrm %>%
st_set_geometry(NULL) %>%
dplyr::select(origin_census_block_group,distance),
by = "origin_census_block_group"
) %>%
filter(yard_per_capita != 0 & yard_per_capita < 2600) %>%
left_join(
sj_dem_data,
by = c("origin_census_block_group" = "blockgroup")
) %>%
st_set_geometry(NULL) %>%
mutate(
distance = distance * 5280
)
yard_visits_tied <-
yard_visits_tied1
#here I normalize all of the values to make the coefficients more readable: (x - mean(x)) then (x / sd(x))
for(j in 4:30){
mean_hold <- mean(yard_visits_tied[,j])
for(i in 1:nrow(yard_visits_tied)){
yard_visits_tied[i,j] = yard_visits_tied[i,j] - mean_hold
}
}
for(j in 4:30){
sd_hold <- sd(yard_visits_tied[,j])
for(i in 1:nrow(yard_visits_tied)){
yard_visits_tied[i,j] = yard_visits_tied[i,j] / sd_hold
}
}
Yard area per capita is not a strong predictor for monthly park visits per person.
To exclude outliers, we implemented a 2,600 sqft yard area per capita threshold. The following histogram depicts all yard area per capita values. The main curve ends at 2,300 sqft, and the outlier clusters begin around 2,900 sqft. As mentioned earlier, these large values may be attributed to zoning misclassifications, so we do not include them in the linear model.
yard_hist <- plot_ly(
x = sj_yard_pop_weight$yard_per_capita,
type = "histogram") %>%
layout(
title = "San Jose CBG Yard Area per Capita Distribution",
shapes=list(type='line', x0= 2600, x1= 2600, y0=0, y1=316, line=list(dash='dot', width=1)),
xaxis = list(
title = "Yard Area per Capita (sqft.)"
),
yaxis = list(
title = "Number of CBG Occurances"
),
annotations = list(
list(
xref = "x",
yref = "y",
x = 2600,
y = 316,
text = "Threshold",
xanchor = "center",
yanchor = "bottom",
showarrow = F
)
)
)
yard_hist
The following scatter plot represents the relationship between the average residential yard size and average monthly park visits per person for each CBG in San Jose. There is a positive correlation between these two factors, representing that in general, people who live in communities that have a larger yard area per capita tend to visit parks more than communities with smaller yard areas per capita. This correlation seems somewhat counterintuitive; we initially hypothesized that if one is lacking yard space, then that person would be more likely to seek open space in parks.
lim_scatter <-
yard_visits_tied1 %>%
ggplot(
aes(
x = yard_per_capita,
y = visits_per_pop
)
) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "CBG Yard Area per Capita (sqft.)",
y = "CBG Average Monthly Park Visits per Person",
title = "SJ Park Accessibility: Monthly Parks Visits per Person vs. Residential Yard Area per Population"
)
lim_scatter
To better assess this relative change in behavior, we fit a linear model to a CBGs monthly park visits per person and a CBGs yard area per capita. The results of that model, including the coefficient on yard area per capita (the slope of the linear fit) and the R-squared value, are shown below.
Coefficient:
lm_yard_visits <- lm(visits_per_pop ~ yard_per_capita, yard_visits_tied1)
print(summary.lm(lm_yard_visits)$coefficients, digits = 4, signif.stars=TRUE)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7118436 7.473e-02 22.907 1.832e-71
## yard_per_capita 0.0002635 9.641e-05 2.733 6.594e-03
R-Squared:
2% of change in y-axis…distribution is still wide, so we feel like some other factor must be explaining park visits besides just yard area. Yard area must not bew correlated with this other thing, so we’re still seeing a lot of noise, which leads to wanting to do multiple regressions.
If multiple regressions still have a very low r-squared, then conclusion will be we can’t begin to understand from a correlation perspective what may be driving park visits.
print(summary.lm(lm_yard_visits)$r.squared, digits = 4)
## [1] 0.02101
From the coefficient value, we see that the relationship between yard area per capita and monthly park visits per person is very small. With a 10,000 square foot increase in yard area per capita, a CBG’s monthly park visit per person will only increase by 2.6 visits. The R-squared value assesses the degree to which this model accurately predicts the variation in change in devices staying completely at home observed in the data; the result of 0.02 indicates that the linear fit with yard area per capita predicts only 2% of the observed variation. The low p value indicates that these results are statistically significant, but this is not a relatively strong prediction.
The minimum distance to a park is an even worse predictor of monthly park visits per person than yard area per capita.
The following scatter plot represents the relationship between the minimum distance from to a park and average monthly park visits per person for each CBG in San Jose. There is a negative correlation between these two factors, representing that in general, people who live in communities that are further away from parks tend to visit parks less than communities that are closer to parks.
lim_scatter2 <-
yard_visits_tied1 %>%
ggplot(
aes(
x = distance,
y = visits_per_pop
)
) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Distance to Nearest Park (feet)",
y = "CBG Average Monthly Park Visits per Person",
title = "SJ Park Accessibility: Monthly Parks Visits per Person vs. Distance to the Nearest Park"
)
lim_scatter2
Similar to the yard area per capita factor above, we fit a linear model to a CBGs monthly park visits per person and the minimum distance from a CBG centroid toa park. The coefficient on minimum distance (the slope of the linear fit) and the R-squared value, are shown below.
Coefficient:
lm_visits_distance <- lm(visits_per_pop ~ distance, yard_visits_tied1)
print(summary.lm(lm_visits_distance)$coefficients, digits = 4, signif.stars=TRUE)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.985e+00 8.88e-02 22.360 2.761e-69
## distance -3.509e-05 1.92e-05 -1.827 6.852e-02
R-Squared:
print(summary.lm(lm_visits_distance)$r.squared, digits = 4)
## [1] 0.009503
With a 100,000 foot increase in park proximity, a CBG’s monthly park visit per person will increase by 3 visits. The R-squared result of 0.01 indicates that the linear fit with minimum distance to a park predicts only 1% of the observed variation. The p value is greater than 0.05, which indicates that these results are not particularly statistically significant.
Assessing the predictive power of demographic factors (combined with minimum distance and yard area yard area per capita) on monthly park visits per capita
Coefficient:
# yard_visits_tied_new <-
# yard_visits_tied %>%
# dplyr::select(-total_pop,-mean_visits,-origin_census_block_group,-`% not completely at home`,-`% Completely at Home`,-`% not completely at home pre shelter`,-`% Completely at Home Pre Shelter`,-`% increase in staying completely home`,-frac_increase)
#
# col_num <- ncol(yard_visits_tied_new)
#
#
# rsquare_storage <- data.frame(num_factors = NA,rsquared = NA, col_remove = NA)[numeric(0),]
#
# for(i in 1:(col_num-2)){
# factor_num <- ncol(yard_visits_tied_new)-1
# lm_test <- lm(visits_per_pop ~ ., data = yard_visits_tied_new)
# rsquared <- summary.lm(lm_test)$r.squared
#
# min_second <- sort(abs(summary.lm(lm_test)$coefficients[,1]))[2]
# remove_name <- names(min_second)
#
#
# if(grepl("`",remove_name) == TRUE){
# remove_name = substr(remove_name,2,nchar(remove_name)-1)
# }
#
# rsquare_hold <- data.frame(num_factors = factor_num,rsquared = rsquared, col_remove = remove_name)
#
# yard_visits_tied_new <- yard_visits_tied_new[,!grepl(remove_name,names(yard_visits_tied_new))]
#
# rsquare_storage <-
# rsquare_storage %>%
# rbind(rsquare_hold)
# }
#
# rsquare_plot <-plot_ly(
# x = rsquare_storage$num_factors,
# y = rsquare_storage$rsquared,
# type = "scatter",
# mode='lines+markers'
# ) %>%
# layout(
# title = "Change in R-Squared With Predictors Removal",
# margin = list(t = 25),
# paper_bgcolor='rgb(255,255,255)',
# plot_bgcolor='rgb(229,229,229)',
# xaxis = list(
# title = "Number of Predictors",
# gridcolor = 'rgb(255,255,255)',
# showgrid = TRUE,
# showline = FALSE,
# showticklabels = TRUE,
# tickcolor = 'rgb(127,127,127)',
# ticks = 'outside',
# zeroline = FALSE,
# fixedrange = F
# ),
# yaxis = list(
# title = "R-Squared",
# gridcolor = 'rgb(255,255,255)',
# showgrid = TRUE,
# showline = FALSE,
# showticklabels = TRUE,
# tickcolor = 'rgb(127,127,127)',
# ticks = 'outside',
# zeroline = FALSE,
# fixedrange = T
# )
# )
#
# rsquare_plot
yard_visits_tied_new <-
yard_visits_tied %>%
dplyr::select(-total_pop,-mean_visits,-origin_census_block_group,-`% not completely at home`,-`% Completely at Home`,-`% not completely at home pre shelter`,-`% Completely at Home Pre Shelter`,-`% increase in staying completely home`,-frac_increase)
col_num <- ncol(yard_visits_tied_new)
rsquare_storage2 <- data.frame(num_factors = NA,rsquared = NA, col_remove = NA, rsquare_change = NA)[numeric(0),]
last_factor <- NULL
for(i in 1:(col_num-1)){
factor_num <- ncol(yard_visits_tied_new)-1
col_names <- colnames(yard_visits_tied_new)
col_names <- col_names[-1]
lm_test <- lm(visits_per_pop ~ ., data = yard_visits_tied_new)
rsquared <- summary.lm(lm_test)$r.squared
tied_subtract <- yard_visits_tied_new
delta_hold <- 1000
name_hold <- NA
if(i == col_num-1){
last_factor <- col_names
rsquare_hold <- data.frame(num_factors = factor_num,rsquared = rsquared, col_remove = last_factor, rsquare_change = rsquared)
}else{
for(j in 1:length(col_names)){
tied_hold <- tied_subtract %>% dplyr::select(col_names[j])
tied_subtract <- tied_subtract[,!names(tied_subtract) %in% col_names[j]]
lm_subtract <- lm(visits_per_pop ~., data = tied_subtract)
rsquared_subtract <- summary.lm(lm_subtract)$r.squared
delta <- abs(rsquared - rsquared_subtract)
if(delta < delta_hold){
delta_hold <- delta
name_hold <- col_names[j]
}
tied_subtract <-
tied_subtract %>%
cbind(tied_hold)
}
rsquare_hold <- data.frame(num_factors = factor_num,rsquared = rsquared, col_remove = name_hold, rsquare_change = delta_hold)
yard_visits_tied_new <- yard_visits_tied_new[,!names(tied_subtract) %in% name_hold]
}
rsquare_storage2 <-
rsquare_storage2 %>%
rbind(rsquare_hold) %>%
na.omit()
}
rsquare_plot2 <-plot_ly(
x = rsquare_storage2$num_factors,
y = rsquare_storage2$rsquared,
type = "scatter",
mode='lines+markers'
) %>%
layout(
title = "Change in R-Squared With Predictors Removal",
margin = list(t = 25),
paper_bgcolor='rgb(255,255,255)',
plot_bgcolor='rgb(229,229,229)',
xaxis = list(
title = "Number of Predictors",
gridcolor = 'rgb(255,255,255)',
showgrid = TRUE,
showline = FALSE,
showticklabels = TRUE,
tickcolor = 'rgb(127,127,127)',
ticks = 'outside',
zeroline = FALSE,
fixedrange = F
),
yaxis = list(
title = "R-Squared",
gridcolor = 'rgb(255,255,255)',
showgrid = TRUE,
showline = FALSE,
showticklabels = TRUE,
tickcolor = 'rgb(127,127,127)',
ticks = 'outside',
zeroline = FALSE,
fixedrange = T
)
)
rsquare_plot2
table_cbg <-
kable(
rsquare_storage2 %>%
dplyr::select(`Number of Factors` = num_factors,`Factor Removed` = col_remove)
)
table_cbg
Number of Factors | Factor Removed |
---|---|
32 | percent less than 1 occupant |
31 | percent more than 1 occupant |
30 | percent more than 0.5 occupants |
29 | % non hispanic/latino |
28 | % Renter Occupied |
27 | % over 100,000 |
26 | % 6th grade |
25 | % kindergarten |
24 | % not speaking spanish |
23 | % white |
22 | % Owner Occupied |
21 | % speaking english > well |
20 | % household with children |
19 | % over 125,000 |
18 | % employeed |
17 | percent high speed |
16 | % male workers |
15 | % high school |
14 | % over 75,000 |
13 | percent associates or higher |
12 | percent 0.5 or less occupants |
11 | percent less than 30 |
10 | percent less than 18 |
9 | percent 20-29 |
8 | percent with vehicles |
7 | percent more than 2 occupants |
6 | % less than 1 year |
5 | % hispanic/latino |
4 | distance |
3 | yard_per_capita |
2 | % Asian |
1 | percent elderly |
park_visits <- readRDS("park_output.rds") %>%
filter(date > "2019-12-31" %>% as.Date()) %>%
mutate(
visit_avg = (visit_counts_high + visit_counts_low) / 2
) %>%
group_by(Name) %>%
summarise(visit_avg = round(mean(visit_avg)))
park_visits <- park_visits[-8,]
dev_park_area <-
park_geometry %>%
st_set_geometry(NULL) %>%
dplyr::select(location_name,DEVACRES) %>%
rename(
"Name" = "location_name"
)
sj_dem_park_alter <-
iso_fraction_hold %>%
filter(fraction_iso > 0) %>%
left_join(
sj_dem_data,
by = c("origin_census_block_group" = "blockgroup")
) %>%
dplyr::select(-`% not completely at home`,-`% Completely at Home`,-`% not completely at home pre shelter`,-`% Completely at Home Pre Shelter`,-`% increase in staying completely home`,-frac_increase)
effect_col <- colnames(sj_dem_park_alter[,4:33])
sj_dem_park_alter_change <- sj_dem_park_alter
for(j in effect_col){
set(sj_dem_park_alter_change, i=NULL, j=j, value=sj_dem_park_alter_change[[j]]*sj_dem_park_alter_change[['fraction_iso']])
}
sj_dem_park_alter_sum <-
sj_dem_park_alter_change %>%
dplyr::select(-origin_census_block_group) %>%
group_by(park_name) %>%
summarise_all(sum) %>%
filter(fraction_iso >= 0.98)
park_all_tied1 <-
park_visits %>%
left_join(
park_10min_pop %>%
st_set_geometry(NULL) %>%
rename("Name" = "location_name") %>%
mutate(
pop_10min_park = round(pop_10min_park)
),
by = "Name"
) %>%
left_join(
dev_park_area,
by = "Name"
) %>%
left_join(
sj_dem_park_alter_sum,
by = c("Name" = "park_name")
) %>%
na.omit() %>%
dplyr::select(-fraction_iso)
park_all_tied <- park_all_tied1
#here I normalize all of the values to make the coefficients more readable: (x - mean(x)) then (x / sd(x))
for(j in 2:34){
mean_hold <- mean(pull(park_all_tied[,j]))
for(i in 1:nrow(park_all_tied)){
park_all_tied[i,j] = park_all_tied[i,j] - mean_hold
}
}
for(j in 2:34){
sd_hold <- sd(pull(park_all_tied[,j]))
for(i in 1:nrow(park_all_tied)){
park_all_tied[i,j] = park_all_tied[i,j] / sd_hold
}
}
park_visit_pop_tied <-
park_visits %>%
left_join(
park_10min_pop %>%
st_set_geometry(NULL) %>%
rename("Name" = "location_name") %>%
mutate(
pop_10min_park = round(pop_10min_park)
),
by = "Name"
)
lim_mod4 <- lm(park_visit_pop_tied$visit_avg ~ park_visit_pop_tied$pop_10min_park)
lim_scatter4 <-
park_visit_pop_tied %>%
ggplot(
aes(
x = pop_10min_park,
y = visit_avg
)
) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Population Within 10 Minutes of a Park",
y = "Average Daily Park Visits Since Jan. 1, 2020",
title = "SJ Park Accessibility: Population within 10 Minutes of a Park vs. Average Daily Park Visits"
)
lim_scatter4
Coefficient:
print(summary.lm(lim_mod4)$coefficients, digits = 4, signif.stars=TRUE)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.41743 30.320581 3.971 1.026e-04
## park_visit_pop_tied$pop_10min_park 0.02558 0.005986 4.273 3.097e-05
R-Squared:
print(summary.lm(lim_mod4)$r.squared, digits = 4)
## [1] 0.09072
#Comparison: Developed Park Area vs. Average Daily Park Visits
park_visit_dev_tied <-
park_visits %>%
left_join(
dev_park_area,
by = "Name"
)
lim_mod5 <- lm(park_visit_dev_tied$visit_avg ~ park_visit_dev_tied$DEVACRES)
lim_scatter5 <-
park_visit_dev_tied %>%
ggplot(
aes(
x = DEVACRES,
y = visit_avg
)
) +
geom_point() +
geom_smooth(method=lm) +
labs(
x = "Developed Acres within Park",
y = "Average Daily Park Visits Since Jan. 1, 2020",
title = "SJ Park Accessibility: Park Developed Acres vs. Average Daily Park Visits"
)
lim_scatter5
Coefficient:
print(summary.lm(lim_mod5)$coefficients, digits = 4, signif.stars=TRUE)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 202.157 14.4375 14.002 9.407e-31
## park_visit_dev_tied$DEVACRES 4.619 0.7893 5.853 2.191e-08
R-Squared:
print(summary.lm(lim_mod5)$r.squared, digits = 4)
## [1] 0.1577
park_all_tied_new <- park_all_tied %>% dplyr::select(-Name)
col_num <- ncol(park_all_tied_new)
rsquare_storage_park <- data.frame(num_factors = NA,rsquared = NA, col_remove = NA)[numeric(0),]
for(i in 1:(col_num-1)){
factor_num <- ncol(park_all_tied_new)-1
col_names <- colnames(park_all_tied_new[,2:ncol(park_all_tied_new)])
lm_test <- lm(visit_avg ~ ., data = park_all_tied_new)
rsquared <- summary.lm(lm_test)$r.squared
tied_subtract <- park_all_tied_new
delta_hold <- 1000
name_hold <- NA
for(j in 1:length(col_names)){
tied_hold <- tied_subtract %>% dplyr::select(col_names[j])
tied_subtract <- tied_subtract[,!names(tied_subtract) %in% col_names[j]]
lm_subtract <- lm(visit_avg ~., data = tied_subtract)
rsquared_subtract <- summary.lm(lm_subtract)$r.squared
delta <- abs(rsquared - rsquared_subtract)
if(delta < delta_hold){
delta_hold <- delta
name_hold <- col_names[j]
}
tied_subtract <-
tied_subtract %>%
cbind(tied_hold)
}
rsquare_hold <- data.frame(num_factors = factor_num,rsquared = rsquared, col_remove = name_hold, rsquare_change = delta_hold)
park_all_tied_new <- park_all_tied_new[,!names(tied_subtract) %in% name_hold]
rsquare_storage_park <-
rsquare_storage_park %>%
rbind(rsquare_hold) %>%
na.omit()
}
rsquare_plot_park <-plot_ly(
x = rsquare_storage_park$num_factors,
y = rsquare_storage_park$rsquared,
type = "scatter",
mode='lines+markers'
) %>%
layout(
title = "Change in R-Squared With Predictors Removal",
margin = list(t = 25),
paper_bgcolor='rgb(255,255,255)',
plot_bgcolor='rgb(229,229,229)',
xaxis = list(
title = "Number of Predictors",
gridcolor = 'rgb(255,255,255)',
showgrid = TRUE,
showline = FALSE,
showticklabels = TRUE,
tickcolor = 'rgb(127,127,127)',
ticks = 'outside',
zeroline = FALSE,
fixedrange = F
),
yaxis = list(
title = "R-Squared",
gridcolor = 'rgb(255,255,255)',
showgrid = TRUE,
showline = FALSE,
showticklabels = TRUE,
tickcolor = 'rgb(127,127,127)',
ticks = 'outside',
zeroline = FALSE,
fixedrange = T
)
)
rsquare_plot_park
table_park <-
kable(
rsquare_storage_park %>%
dplyr::select(`Number of Factors` = num_factors,`Factor Removed` = col_remove)
)
table_park
Number of Factors | Factor Removed |
---|---|
32 | % Owner Occupied |
31 | percent more than 0.5 occupants |
30 | percent less than 1 occupant |
29 | % hispanic/latino |
28 | % Asian |
27 | percent high speed |
26 | % kindergarten |
25 | percent 0.5 or less occupants |
24 | % Renter Occupied |
23 | percent with vehicles |
22 | percent less than 18 |
21 | % over 125,000 |
20 | % over 100,000 |
19 | % household with children |
18 | percent 20-29 |
17 | % speaking english > well |
16 | % 6th grade |
15 | % over 75,000 |
14 | % high school |
13 | % male workers |
12 | percent associates or higher |
11 | % white |
10 | % employeed |
9 | percent elderly |
8 | percent more than 1 occupant |
7 | percent less than 30 |
6 | percent more than 2 occupants |
5 | % not speaking spanish |
4 | % non hispanic/latino |
3 | % less than 1 year |
2 | pop_10min_park |
1 | DEVACRES |