# Silence warnings and messages in every knitted chunk of the report.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
## Set Up

# NOTE: the former `rm(list = ls())` was removed. It only clears objects in
# the global environment (not attached packages or options), so it does not
# give a clean session, and it can wipe a caller's workspace when this file is
# sourced. Restart R for a clean run instead.

library(tidyverse)
library(sf)
library(spdep)
library(caret)
library(ckanr)
library(FNN)
library(grid)
library(gridExtra)
library(ggcorrplot)
library(kableExtra)
library(jtools)   
library(ggstance) 
library(mapview)
library(tigris)
library(leaflet)
library(osmdata)
library(tidycensus)
library(stargazer)

# Base URL of the course data repository.
root.dir <- "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"

# Pull in the shared course helpers. nn_function, q5, mapTheme and plotTheme
# are used further down but not defined in this file, so they presumably come
# from this script -- confirm.
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")

# Five-step green-to-orange ramp used for the quintile legends.
palette5 <- c(
  "#25CB10", "#5AB60C", "#8FA108", "#C48C04", "#FA7800"
)

# Build legend labels from the 1st, 20th, 40th, 60th and 80th percentiles of
# one column.
#
# df        data frame (or sf object) holding the column.
# variable  column name as a string.
# rnd       optional. Missing or TRUE: values are rounded to whole numbers
#           before the quantiles are taken. FALSE: quantiles are taken on the
#           raw values and formatted to 3 significant digits.
#
# Returns a character vector of length 5. NAs in the column are ignored.
qBr <- function(df, variable, rnd) {
  probs <- c(.01, .2, .4, .6, .8)
  values <- df[[variable]]

  if (missing(rnd) || isTRUE(rnd)) {
    as.character(quantile(round(values, 0), probs, na.rm = TRUE))
  } else {
    # probs and na.rm belong to quantile(); they used to be passed to
    # as.character() by mistake, which produced the default 0/25/50/75/100%
    # breaks and failed on columns containing NA.
    as.character(formatC(quantile(values, probs, na.rm = TRUE), digits = 3))
  }
}

# Short alias for st_coordinates, used in the nn_function() calls below.
st_c <- st_coordinates

1 Introduction

This project develops a model to predict the home price for single-family housing in Boulder County, Colorado. Despite the powerful models that Zillow possesses for home price prediction, the always unique context of places requires more refined and site-specific models that take local characteristics into consideration for price prediction. However, such a site-specific model is difficult to build because the factors that influence home price tend to be correlated with each other within the real estate system. Moreover, a site-specific model for Boulder has limited data for model training because Boulder is not a large county, a limitation that will harm the accuracy of the model. Responding to the request for a site-specific model, we develop a hedonic model that deconstructs the home price into the values of its constituent parts, including internal characteristics such as housing quality, public services and amenities such as public transit, and spatial processes such as the clustering of home prices. We then use the model to predict the home price in relation to changes in the aforementioned features.

The final model that we develop to predict home price in Boulder explains around 72% of the variation in price, with a 19% range of accuracy for each price that the model predicts. Our model has on average an absolute error of $130,000 for each price prediction; compared to the mean housing price ($739,555), our model has decent accuracy for prediction. The mean absolute percentage error (MAPE) across all predictions is nearly randomly distributed in space. While our model has a slightly higher mean absolute percentage error for neighborhoods with high income, the relatively small gap (3%) between the MAPE for high-income and low-income neighborhoods suggests that the model is generalizable. Thus, we believe that our model, with good accuracy and generalizability, will be useful for Zillow to predict home prices in Boulder County, CO.

2 Data

2.1 Data Wrangling and Feature Engineering

We first download the county boundary data from the open data portal of Boulder county. We then use the county boundary to download key amenities–including parks, playgrounds, waters, restaurants, fast food, companies, and bus stations–spatial information from OpenStreetMap. We download spatial information of census tracts within Boulder and urban areas that the Census defines as densely developed areas with a population at least greater than 2,500 from the Census Bureau. We also manually collect k-12 school information from the website of Boulder Valley School District and clean it into a csv file.

We then convert raw variables into useful predictive features. We first eliminate outliers that are significantly more expensive than the general housing stock in Boulder. To develop indicators, we calculate the age of homes and the distance to the nearest 1 ~ 3 amenities. We also create buffers to calculate how many amenities exist within a certain distance of a house. We recode numeric variables such as the number of bedrooms into categorical variables that divide the data into several categories, such as houses with more than 3 bedrooms. We spatially join the census tracts and urban areas to the housing characteristics so that each house is assigned to its census tract and an urban status of either urban or non-urban. Finally, we create a lag price variable that represents the mean price of the nearest 5 houses to account for the clustering effects of houses.

# Boundaries and housing data ------------------------------------------------
# NOTE(review): the city-limit and housing files are read from a
# machine-specific absolute path; point these at a shared or project-relative
# location before running this on another machine.

# Boulder city limits, reprojected to EPSG:26913 and dissolved into a single
# geometry.
Boulder_city <- st_read("C:/Users/zheng/Desktop/MUSA 508/508 midterm/City_of_Boulder_City_Limits.kml") %>%
  st_transform('EPSG:26913') %>%
  dplyr::select(-Name, -Description)
Boulder_city <- st_union(Boulder_city)

# County boundary from the Boulder County open-data portal. st_read() already
# returns an sf object, so no st_as_sf() conversion is needed.
Boulder.county <- st_read("https://opendata.arcgis.com/api/v3/datasets/964b8f3b3dbe401bb28d49ac93d29dc4_0/downloads/data?format=kml&spatialRefId=4326")
Boulder.county.reproject <- Boulder.county %>%
  st_transform('EPSG:26913') %>%
  dplyr::select(-Name, -Description)

# Hard-coded longitude/latitude bounding box shared by every OpenStreetMap
# query below.
q0 <- opq(bbox = c(-105.6945, 39.91297, -105.0528, 40.26396))

# Print the extent of the projected county boundary (not stored).
st_bbox(Boulder.county.reproject)

# Housing records: the source CRS is declared as ESRI:102254 on read, then the
# data are reprojected to the working CRS.
housing <- st_read("C:/Users/zheng/Desktop/MUSA 508/508 midterm/studentData.geojson", crs = 'ESRI:102254') %>%
  st_transform('EPSG:26913')

# dplyr::select is namespaced, like the county block above, so another
# attached package cannot mask it.
housing <- housing %>%
  dplyr::select(-Stories, -UnitCount) %>%
  mutate(age = 2021 - EffectiveYear) %>%
  filter(nbrBedRoom < 10, carStorageSF < 3000)

# NOTE(review): this drops one record by row position after filtering. The
# position shifts if the input file or the filter above changes -- presumably
# a price outlier; confirm and replace with a condition on the record itself.
housing <- housing[-2637, ]

## Public Facilities

# parks
# Query OpenStreetMap for leisure=park inside the q0 bounding box.
park <- add_osm_feature(opq = q0, key = 'leisure', value = "park") %>%
  osmdata_sf(.)

# Keep the park points, reproject them to EPSG:26913 and clip them to the
# county. The clip used to receive a stray third argument (`park.sf`, the
# object being defined here); it only worked because that argument was never
# evaluated.
park.sf <- st_geometry(park$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., park$osm_points$name) %>%
  rename(NAME = park.osm_points.name) %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject) %>%
  dplyr::select(geometry)

# Nearest-park feature via nn_function (defined outside this file) with k = 1.
housing <- housing %>%
  mutate(park = nn_function(st_c(housing), st_c(park.sf), 1))

# fast food
# Query OpenStreetMap for amenity=fast_food inside the q0 bounding box.
fast_food <- add_osm_feature(opq = q0, key = 'amenity', value = "fast_food") %>%
  osmdata_sf(.)

# Reproject the points to EPSG:26913, clip them to the county and keep the
# geometry only. The clip no longer receives the unrelated `park.sf` object as
# a stray extra argument.
fast_food.sf <- st_geometry(fast_food$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., fast_food$osm_points$amenity) %>%
  rename(NAME = fast_food.osm_points.amenity) %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject) %>%
  dplyr::select(geometry)

# Count fast-food points inside a buffer of 800 CRS units around each house.
# TODO: the buffer radius is still an open question ("what is the right
# parameter").
# NOTE(review): houses with no point in their buffer presumably come back as
# NA rather than 0 -- confirm before using this column in a model.
housing$fastfood_buffer <-
  st_buffer(housing, 800) %>%
  aggregate(mutate(fast_food.sf, counter = 1), ., sum) %>%
  pull(counter)



# water
# Query OpenStreetMap for natural=water inside the q0 bounding box.
water <- add_osm_feature(opq = q0, key = 'natural', value = "water") %>%
  osmdata_sf(.)

# De-duplicate the water points, reproject them to EPSG:26913 and clip them to
# the county. The clip no longer receives the unrelated `park.sf` object as a
# stray extra argument.
water.sf <- water$osm_points %>%
  dplyr::select(geometry) %>%
  st_as_sf(crs = 4326, agr = "constant") %>%
  distinct() %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject)

# Nearest-water features via nn_function (defined outside this file) for
# k = 1, 2 and 3.
housing <- housing %>%
  mutate(water_nn1 = nn_function(st_c(housing), st_c(water.sf), 1),
         water_nn2 = nn_function(st_c(housing), st_c(water.sf), 2),
         water_nn3 = nn_function(st_c(housing), st_c(water.sf), 3))

# playground
# Query OpenStreetMap for leisure=playground inside the q0 bounding box.
playground <- add_osm_feature(opq = q0, key = 'leisure', value = "playground") %>%
  osmdata_sf(.)

# Reproject the points to EPSG:26913, clip them to the county and keep the
# geometry only. The clip no longer receives the unrelated `park.sf` object as
# a stray extra argument.
playground.sf <- st_geometry(playground$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., playground$osm_points$name) %>%
  rename(NAME = playground.osm_points.name) %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject) %>%
  dplyr::select(geometry)

# Nearest-playground feature via nn_function (defined outside this file) with
# k = 1.
housing <- housing %>%
  mutate(playground = nn_function(st_c(housing), st_c(playground.sf), 1))

# restaurant
# Query OpenStreetMap for amenity=restaurant inside the q0 bounding box.
restaurant <- add_osm_feature(opq = q0, key = 'amenity', value = "restaurant") %>%
  osmdata_sf(.)

# Reproject the points to EPSG:26913, clip them to the county, keep the
# geometry only and drop duplicates. The clip no longer receives the unrelated
# `park.sf` object as a stray extra argument.
restaurant.sf <- st_geometry(restaurant$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., restaurant$osm_points$amenity) %>%
  rename(NAME = restaurant.osm_points.amenity) %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject) %>%
  dplyr::select(geometry) %>%
  distinct()

# Nearest-restaurant features via nn_function (defined outside this file) for
# k = 1, 2 and 3.
housing <- housing %>%
  mutate(
    restaurant_nn1 = nn_function(st_c(housing), st_c(restaurant.sf), 1),
    restaurant_nn2 = nn_function(st_c(housing), st_c(restaurant.sf), 2),
    restaurant_nn3 = nn_function(st_c(housing), st_c(restaurant.sf), 3))

## School District

# School locations from a hand-built CSV: longitude/latitude (EPSG:4326) are
# turned into points and reprojected to EPSG:26913.
# NOTE(review): machine-specific absolute path; relocate before sharing.
school <- read.csv("C:/Users/zheng/Desktop/MUSA 508/508 midterm/school.csv") %>%
  st_as_sf(coords = c("LONGITUDE", "LATITUDE"), crs = 4326, agr = "constant") %>%
  st_transform('EPSG:26913')

# Geometry-only copy. `school` is already an sf object in EPSG:26913, so the
# former st_as_sf(crs = 4326) / st_transform() pair did nothing and misstated
# the CRS; it has been dropped.
school.sf <- school %>%
  dplyr::select(geometry)

# Count schools inside a buffer of 8000 CRS units around each house.
# TODO: the buffer radius is still an open question ("what is the right
# parameter").
# NOTE(review): houses with no school in their buffer presumably come back as
# NA rather than 0 -- confirm.
housing$school_buffer <-
  st_buffer(housing, 8000) %>%
  aggregate(mutate(school.sf, counter = 1), ., sum) %>%
  pull(counter)

# Nearest-school feature via nn_function (defined outside this file) with
# k = 1.
housing <- housing %>%
  mutate(school = nn_function(st_c(housing), st_c(school.sf), 1))

# Company
# Query OpenStreetMap for office=company inside the q0 bounding box.
company <- add_osm_feature(opq = q0, key = 'office', value = "company") %>%
  osmdata_sf(.)

# Reproject the points to EPSG:26913, clip them to the county and drop rows
# with missing attributes. The clip no longer receives the unrelated `park.sf`
# object as a stray extra argument.
company.sf <- st_geometry(company$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., company$osm_points$office) %>%
  rename(NAME = company.osm_points.office) %>%
  st_transform('EPSG:26913') %>%
  st_intersection(Boulder.county.reproject) %>%
  na.omit()

# Nearest-company feature via nn_function (defined outside this file) with
# k = 1.
housing <- housing %>%
  mutate(company = nn_function(st_c(housing), st_c(company.sf), 1))

# Bus station
# Query OpenStreetMap for amenity=bus_station inside the q0 bounding box.
bus_station <- add_osm_feature(opq = q0, key = 'amenity', value = "bus_station") %>%
  osmdata_sf(.)

# Reproject the points to EPSG:26913, keep the geometry and clip it to the
# county. The clip no longer receives the unrelated `park.sf` object as a
# stray extra argument.
bus_station.sf <- st_geometry(bus_station$osm_points) %>%
  st_transform(4326) %>%
  st_sf() %>%
  cbind(., bus_station$osm_points$amenity) %>%
  rename(NAME = bus_station.osm_points.amenity) %>%
  st_transform('EPSG:26913') %>%
  dplyr::select(geometry) %>%
  st_intersection(Boulder.county.reproject)


# Nearest-bus-station features via nn_function (defined outside this file)
# for k = 1, 2 and 3.
housing <- housing %>%
  mutate(bus_stop_nn1 = nn_function(st_c(housing), st_c(bus_station.sf), 1),
         bus_stop_nn2 = nn_function(st_c(housing), st_c(bus_station.sf), 2),
         bus_stop_nn3 = nn_function(st_c(housing), st_c(bus_station.sf), 3))

## Internal Characteristics

# Bucket the bedroom count into three labelled groups. Values matching none of
# the conditions (negative or missing counts) become NA.
housing <- mutate(
  housing,
  nbrBedRoom.cat = case_when(
    nbrBedRoom >= 0 & nbrBedRoom < 4 ~ "Up to 3 Bedrooms",
    nbrBedRoom >= 4 & nbrBedRoom < 5 ~ "4 Bedrooms",
    nbrBedRoom > 4 ~ "5+ Bedrooms"
  )
)

## Spatial Process
# neighborhood
# Download 2019 ACS tract geometries for Boulder County, reproject them to
# EPSG:26913, then drop the name and both income variables (estimates and
# margins). What remains is presumably just the tract GEOID and geometry.
neighborhood <- get_acs(
  geography = "tract",
  year = 2019,
  variables = c(
    "B06011_001E", # Median income in the past 12 months
    "B19013_001E"  # Median household income in the past 12 months
  ),
  geometry = TRUE,
  state = "CO",
  county = "Boulder",
  output = "wide"
) %>%
  st_transform('EPSG:26913') %>%
  dplyr::select(-NAME, -B06011_001M, -B19013_001M, -B06011_001E, -B19013_001E)

# urban status
# Census urban areas, reprojected to EPSG:26913, clipped to the county, with
# the Denver--Aurora area excluded.
urban_area <- urban_areas(cb = FALSE, year = NULL) %>%
  st_transform('EPSG:26913') %>%
  dplyr::select(NAME10, geometry)
urban_area <- st_intersection(Boulder.county.reproject, urban_area) %>%
  filter(NAME10 != "Denver--Aurora, CO")

# Attach the containing census tract and urban area (if any) to each house.
housing <- st_join(housing, neighborhood, join = st_intersects)

housing <- st_join(housing, urban_area, join = st_intersects)

# A house that matched no urban area has NAME10 == NA. Test for that directly
# instead of writing a numeric 0 into the name column and comparing it to "0",
# which relied on implicit coercion.
housing <- housing %>%
  rename(urban_status = NAME10) %>%
  mutate(urban_status = ifelse(is.na(urban_status), "non-urban", "urban"))
# the spatial lag of housing
# Modelling set: rows with toPredict == 0.
Boulder <- housing %>%
  filter(toPredict == 0) %>%
  dplyr::select(-toPredict)

# Row-standardised weights over each house's 5 nearest neighbours.
coords <- st_coordinates(Boulder)
neighborList <- knn2nb(knearneigh(coords, 5))
spatialWeights <- nb2listw(neighborList, style = "W")

Boulder$lagPrice <- lag.listw(spatialWeights, Boulder$price)

# Same lag over the full table, including the rows to predict.
# NOTE(review): missing prices are overwritten with 0 here, so they (and any
# zero prices on the rows to predict) enter the neighbour averages and also
# stay in housing$price for the later plots and maps -- confirm this is
# intended.
housing$price[is.na(housing$price)] <- 0
coords.all <- st_coordinates(housing)
neighborList.all <- knn2nb(knearneigh(coords.all, 5))
spatialWeights.all <- nb2listw(neighborList.all, style = "W")

housing$lagPrice <- lag.listw(spatialWeights.all, housing$price)

# An identical second copy of the full-table lag computation was removed; it
# recomputed the same neighbours and overwrote lagPrice with the same values.

2.2 Variable Descriptions

For variables in our final model, we select the following variables:

Internal Characteristics
- age: age of the house calculated by 2021 minus the effective year of the house.
- designCodeDscr: description of building’s design type
- qualityCodeDscr: description of the quality of the house as determined by the government staff
- TotalFinishedSF: total finished square footage, excluding the basement
- nbrBedRoom.cat: bedroom-count category (up to 3, 4, or 5+ bedrooms)
- HeatingDscr: description of the type of heating system in the house
- Roof_CoverDscr: the material used on the roof

Public Service/Amenities
- park: distance from the house to the nearest park
- school: distance from the house to the nearest school
- restaurant_nn1: distance from the house to the nearest restaurant
- bus_stop_nn1: distance from the house to the nearest bus station
- company: distance from the house to the nearest company
- water_nn1: distance from the house to the nearest water

Spatial Process
- GEOID: census tract that the house belongs to
- urban_status: whether the house is located in an urban area defined by the Census
- lagPrice: the average sale price of the house’s 5 nearest neighbors

The summary statistics for all numerical variables are presented below.

# Modelling columns from the sold-homes table, with geometry removed and
# incomplete rows dropped; summarised as Table 1.
final_vars <- Boulder %>%
  st_drop_geometry() %>%
  dplyr::select(
    price,
    age,
    GEOID,
    urban_status,
    designCodeDscr,
    qualityCodeDscr,
    TotalFinishedSF,
    nbrBedRoom.cat,
    HeatingDscr,
    Roof_CoverDscr,
    park,
    school,
    restaurant_nn1,
    bus_stop_nn1,
    company,
    water_nn1,
    lagPrice
  ) %>%
  na.omit()

stargazer(
  final_vars,
  type = "text",
  title = "Table 1. summary statistics with chosen variable descriptions"
)
## 
## Table 1. summary statistics with chosen variable descriptions
## ======================================================================================
## Statistic         N       Mean      St. Dev.     Min   Pctl(25)   Pctl(75)     Max    
## --------------------------------------------------------------------------------------
## price           11,259 746,123.700 543,627.200 10,000   452,600   831,100   7,350,000 
## age             11,259   23.300      17.476       0       11         32        142    
## TotalFinishedSF 11,259  1,953.673    889.723      0      1,276    2,442.5     10,188  
## park            11,259   669.182    1,375.010   7.590   154.185   501.759   10,712.720
## school          11,259  6,110.231   6,185.961  12.666   849.973  12,282.800 22,328.780
## restaurant_nn1  11,259  1,444.577   1,473.535  10.489   581.046  1,723.857  12,111.570
## bus_stop_nn1    11,259 11,891.590   7,875.543  105.256 4,588.042 19,497.640 35,165.250
## company         11,259  6,830.712   6,154.506  50.168  1,364.029 12,533.640 31,577.050
## water_nn1       11,259   675.573     520.170   11.318   340.253   869.275   5,028.900 
## lagPrice        11,259 733,469.600 427,418.800 137,900  461,870   839,880   3,849,000 
## --------------------------------------------------------------------------------------

2.3 Data Selection: Correlation Matrix and Correlation Plots

Below is the correlation matrix of all variables that we develop. We select and delete variables based on the correlation matrix. If two variables have a high correlation, we only choose one to include in our model.

# Numeric columns of the housing table, complete rows only.
numericVars <- housing %>%
  st_drop_geometry() %>%
  select_if(is.numeric) %>%
  na.omit()

# Lower-triangle correlation matrix; correlations are rounded to one decimal
# and insignificant cells are left blank.
ggcorrplot(
  round(cor(numericVars), 1),
  type = "lower",
  insig = "blank",
  p.mat = cor_pmat(numericVars),
  colors = c("#25CB10", "white", "#FA7800")
) +
  labs(title = "Figure 1. Correlation across numeric variables")

For categorical variables that cannot be incorporated into the correlation matrix, we plot price as a function of categorical variables to see if there is a correlation. Below are the plots for variables of interest that show a correlation to the home price.

# Mean price for each level of the four categorical predictors, one facet per
# variable.
st_drop_geometry(housing) %>%
  dplyr::select(price,
                designCodeDscr,
                qualityCodeDscr,
                HeatingDscr,
                Roof_CoverDscr) %>%
  # pivot_longer replaces the superseded gather()
  pivot_longer(-price, names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(Value, price)) +
  # `fun` replaces the deprecated `fun.y` argument
  geom_bar(position = "dodge", stat = "summary", fun = "mean") +
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title = "Price as a function of\ncategorical variables", y = "Mean_Price") +
  plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.4 Price Correlation Scatter Plots

The scatter plots below show the home price plotted against 4 different variables of interest. The age of a house is negatively correlated with the home price, meaning that the older a house is, the lower the sale price will be; yet the fitted line is rather flat, potentially skewed by a few old houses with high value. The distance to the nearest bus stop is negatively correlated with home price, suggesting that the home price is likely to decrease as accessibility to public transit decreases. The distance to the nearest company site is negatively correlated with home price, suggesting that the farther away a house is from company sites, the lower the sale price will be; this correlation indicates the relationship between home price and potential job opportunity. The distance to the nearest school site is also negatively correlated with home price, meaning that the more distant a house is from a school, the lower the sale price will be.

# Price against four numeric predictors, one facet each, with a linear fit
# and its confidence band.
housing %>%
  st_drop_geometry() %>%
  dplyr::select(price, company, school, age, bus_stop_nn1) %>%
  gather(Variable, Value, -price) %>%
  ggplot(aes(Value, price)) +
  geom_point(size = .5) +
  geom_smooth(method = "lm", se = TRUE, colour = "#FA7800") +
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title = "Figure 2. home price correlation scatter plots") +
  plotTheme()

2.5 Map Home Sale Price

The map below visualizes the distribution of sale prices in Boulder county. The higher price tends to concentrate in the Boulder city, and the lower price tends to concentrate in Northeast Boulder and to scatter over the less populated parts in the county.

# Sale prices coloured by quintile, drawn over the county outline and the
# city-limit polygon.
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(data = Boulder_city) +
  geom_sf(
    data = housing,
    aes(colour = q5(price)),
    show.legend = "point",
    size = .75
  ) +
  scale_colour_manual(
    values = palette5,
    labels = qBr(housing, "price"),
    name = "Quintile\nBreaks"
  ) +
  labs(title = "Figure 3. Sales price distribution in Boulder") +
  mapTheme()

2.6 Maps of Interesting Independent Variables

The following maps show the spatial distribution of three independent variables: urban status, distance to nearest bus station, and the design style of house. Each variable has its own distribution pattern, all with certain degree of clustering that does not resonate with a single spatial process, say, urban status. The different patterns of clustering for independent variables suggest the importance to include all of them into the model.

# urban status
# Houses coloured by urban / non-urban status over the county outline.
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(data = housing, aes(colour = urban_status),
          show.legend = "point", size = .75) +
  labs(title = "Figure 4.1. Urban status in Boulder") +
  mapTheme()

# distance to bus station
# Houses coloured by quintile of the nearest-bus-station distance.
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(data = housing, aes(colour = q5(bus_stop_nn1)),
          show.legend = "point", size = .75) +
  scale_colour_manual(values = palette5,
                      labels = qBr(housing, "bus_stop_nn1"),
                      name = "Quintile\nBreaks") +
  labs(title = "Figure 4.2. Distance to the Nearest Bus Station, Boulder") +
  mapTheme()

# housing design
# Houses coloured by design type. This is the third map of the 4.x series, so
# the title is numbered 4.3 (it previously read "Figure 5.2").
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(data = housing, aes(colour = designCodeDscr),
          show.legend = "point", size = .75) +
  labs(title = "Figure 4.3 Style of Design for Houses in Boulder") +
  mapTheme()

3 Methods

3.1 Split the training and testing datasets

We first split our data into a training set (75%) and a testing set (25%). Based on the correlation matrix above (Figure 1), we eliminate multicollinearity as far as possible so that the predictors are close to independent of one another.

# Reproducible 75/25 split of the sold homes. Rows are stratified on the
# pasted combination of design, bedroom category, heating, roof cover and
# tract.
set.seed(645)
inTrain <- createDataPartition(
  y = with(
    Boulder,
    paste(designCodeDscr, nbrBedRoom.cat, HeatingDscr, Roof_CoverDscr, GEOID)
  ),
  p = .75,
  list = FALSE
)

Boulder.training <- Boulder[inTrain, ]
Boulder.testing <- Boulder[-inTrain, ]
# Separate copy of the test set.
housing.test.nhood <- Boulder.testing

3.2 OLS regression model

After selecting the variables from correlation matrix, we conducted OLS regression analysis on these variables with different combinations. We compare the results of them by looking at the following indexes: 1. P-value and estimated coefficients of each variables. P-values should be significant as possible.
2. The final F-statistics
3. The adjusted R square, which should be greater as possible.
After several trials, we finally selected the following variables for our training datasets.

Internal characteristics
- Age
- designCodeDscr
- qualityCodeDscr
- TotalFinishedSF
- nbrBedRoom.cat
- HeatingDscr
- Roof_CoverDscr

Public services
-park
-school
-restaurant_nn1
-bus_stop_nn1
-company
-water_nn1

Spatial structure
-neighbourhoods
-urban status
-price lag

## Regression 1
# OLS of price on the selected predictors, fit on the training split only.
# This used to be fit on the full `housing` table, which also contains the
# rows to predict (with zero-filled prices) and the hold-out test rows.
# NOTE(review): any printed summary captured from the earlier fit is stale
# until the report is re-knit.
reg1 <- lm(price ~ ., data = st_drop_geometry(Boulder.training) %>%
             dplyr::select(price,
                           age,
                           urban_status,
                           GEOID,
                           designCodeDscr,
                           qualityCodeDscr,
                           TotalFinishedSF,
                           nbrBedRoom.cat,
                           HeatingDscr,
                           Roof_CoverDscr,
                           park,
                           school,
                           restaurant_nn1,
                           bus_stop_nn1,
                           company,
                           water_nn1,
                           lagPrice))
summary(reg1)
## 
## Call:
## lm(formula = price ~ ., data = st_drop_geometry(housing) %>% 
##     dplyr::select(price, age, urban_status, GEOID, designCodeDscr, 
##         qualityCodeDscr, TotalFinishedSF, nbrBedRoom.cat, HeatingDscr, 
##         Roof_CoverDscr, park, school, restaurant_nn1, bus_stop_nn1, 
##         company, water_nn1, lagPrice))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3176880   -78418    -1262    76986  6300544 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            9.890e+05  5.113e+04  19.344  < 2e-16
## age                                   -1.742e+03  2.428e+02  -7.175 7.70e-13
## urban_statusurban                     -3.270e+04  1.490e+04  -2.195 0.028213
## GEOID08013012102                      -3.028e+05  3.100e+04  -9.766  < 2e-16
## GEOID08013012103                      -2.478e+05  3.792e+04  -6.533 6.72e-11
## GEOID08013012104                      -2.119e+05  4.005e+04  -5.291 1.24e-07
## GEOID08013012105                      -3.466e+05  3.800e+04  -9.120  < 2e-16
## GEOID08013012201                      -2.066e+04  4.086e+04  -0.506 0.613155
## GEOID08013012202                      -3.244e+05  5.293e+04  -6.129 9.12e-10
## GEOID08013012203                      -5.696e+05  4.176e+04 -13.638  < 2e-16
## GEOID08013012204                       2.745e+05  7.596e+04   3.613 0.000304
## GEOID08013012300                      -1.079e+05  2.125e+05  -0.508 0.611730
## GEOID08013012401                      -2.236e+05  4.252e+04  -5.258 1.48e-07
## GEOID08013012501                      -4.875e+05  5.215e+04  -9.349  < 2e-16
## GEOID08013012505                      -2.382e+05  3.573e+04  -6.666 2.75e-11
## GEOID08013012507                      -4.986e+05  4.128e+04 -12.077  < 2e-16
## GEOID08013012508                      -4.697e+05  5.064e+04  -9.276  < 2e-16
## GEOID08013012509                      -3.712e+05  4.137e+04  -8.974  < 2e-16
## GEOID08013012510                      -1.952e+05  4.012e+04  -4.866 1.16e-06
## GEOID08013012511                      -5.052e+05  5.134e+04  -9.839  < 2e-16
## GEOID08013012603                      -4.889e+05  3.979e+04 -12.287  < 2e-16
## GEOID08013012605                      -5.335e+05  1.251e+05  -4.265 2.01e-05
## GEOID08013012607                      -5.077e+05  7.448e+04  -6.817 9.80e-12
## GEOID08013012608                      -5.209e+05  4.435e+04 -11.746  < 2e-16
## GEOID08013012701                      -5.422e+05  3.605e+04 -15.043  < 2e-16
## GEOID08013012705                      -5.717e+05  5.437e+04 -10.514  < 2e-16
## GEOID08013012707                      -3.214e+05  6.092e+04  -5.275 1.35e-07
## GEOID08013012708                      -6.100e+05  4.344e+04 -14.042  < 2e-16
## GEOID08013012709                      -5.885e+05  5.423e+04 -10.852  < 2e-16
## GEOID08013012710                      -4.302e+05  4.022e+04 -10.698  < 2e-16
## GEOID08013012800                      -7.618e+05  5.180e+04 -14.704  < 2e-16
## GEOID08013012903                      -6.896e+05  5.202e+04 -13.255  < 2e-16
## GEOID08013012904                      -6.235e+05  4.272e+04 -14.596  < 2e-16
## GEOID08013012905                      -5.661e+05  5.744e+04  -9.857  < 2e-16
## GEOID08013012907                      -5.845e+05  5.260e+04 -11.113  < 2e-16
## GEOID08013013003                      -5.708e+05  3.958e+04 -14.423  < 2e-16
## GEOID08013013004                      -5.575e+05  4.280e+04 -13.025  < 2e-16
## GEOID08013013005                      -5.497e+05  4.279e+04 -12.845  < 2e-16
## GEOID08013013006                      -5.322e+05  3.702e+04 -14.377  < 2e-16
## GEOID08013013201                      -5.732e+05  7.412e+04  -7.733 1.14e-14
## GEOID08013013202                      -2.430e+05  6.970e+04  -3.486 0.000492
## GEOID08013013205                      -6.322e+05  4.950e+04 -12.770  < 2e-16
## GEOID08013013207                      -6.597e+05  6.345e+04 -10.397  < 2e-16
## GEOID08013013208                      -6.547e+05  5.951e+04 -11.002  < 2e-16
## GEOID08013013210                      -6.200e+05  5.629e+04 -11.013  < 2e-16
## GEOID08013013211                      -7.046e+05  5.303e+04 -13.286  < 2e-16
## GEOID08013013212                      -6.305e+05  5.396e+04 -11.685  < 2e-16
## GEOID08013013213                      -7.484e+05  5.041e+04 -14.844  < 2e-16
## GEOID08013013302                      -5.394e+05  5.822e+04  -9.266  < 2e-16
## GEOID08013013305                      -6.223e+05  6.288e+04  -9.897  < 2e-16
## GEOID08013013306                      -6.022e+05  6.614e+04  -9.104  < 2e-16
## GEOID08013013307                      -6.195e+05  6.285e+04  -9.857  < 2e-16
## GEOID08013013308                      -5.745e+05  6.277e+04  -9.153  < 2e-16
## GEOID08013013401                      -5.668e+05  6.307e+04  -8.986  < 2e-16
## GEOID08013013402                      -6.489e+05  6.054e+04 -10.720  < 2e-16
## GEOID08013013503                      -5.813e+05  6.330e+04  -9.183  < 2e-16
## GEOID08013013505                      -5.982e+05  7.173e+04  -8.339  < 2e-16
## GEOID08013013506                      -6.755e+05  6.413e+04 -10.533  < 2e-16
## GEOID08013013507                      -6.494e+05  6.469e+04 -10.038  < 2e-16
## GEOID08013013508                      -6.858e+05  6.292e+04 -10.899  < 2e-16
## GEOID08013013601                      -5.919e+05  5.749e+04 -10.296  < 2e-16
## GEOID08013013602                      -5.800e+05  6.065e+04  -9.563  < 2e-16
## GEOID08013013701                      -5.623e+05  3.798e+04 -14.806  < 2e-16
## GEOID08013013702                      -6.287e+05  4.438e+04 -14.165  < 2e-16
## GEOID08013060600                      -7.354e+05  3.486e+04 -21.097  < 2e-16
## GEOID08013060700                      -6.188e+05  4.172e+04 -14.834  < 2e-16
## GEOID08013060800                      -5.912e+05  5.033e+04 -11.747  < 2e-16
## GEOID08013060900                      -6.908e+05  4.312e+04 -16.021  < 2e-16
## GEOID08013061300                      -7.412e+05  4.218e+04 -17.570  < 2e-16
## GEOID08013061400                      -7.252e+05  4.066e+04 -17.837  < 2e-16
## designCodeDscr2-3 Story               -2.440e+04  7.983e+03  -3.057 0.002240
## designCodeDscrBi-level                 4.288e+04  1.791e+04   2.394 0.016676
## designCodeDscrMULTI STORY- TOWNHOUSE  -1.237e+05  1.174e+04 -10.538  < 2e-16
## designCodeDscrSplit-level              1.142e+04  1.195e+04   0.956 0.339211
## qualityCodeDscrAVERAGE +              -4.153e+04  1.299e+04  -3.196 0.001397
## qualityCodeDscrAVERAGE ++             -4.504e+03  1.341e+04  -0.336 0.737060
## qualityCodeDscrEXCELLENT               9.101e+05  3.067e+04  29.676  < 2e-16
## qualityCodeDscrEXCELLENT +             1.168e+06  6.528e+04  17.888  < 2e-16
## qualityCodeDscrEXCELLENT++             1.687e+06  5.321e+04  31.703  < 2e-16
## qualityCodeDscrEXCEPTIONAL 1           8.688e+05  6.652e+04  13.061  < 2e-16
## qualityCodeDscrEXCEPTIONAL 2           1.519e+06  1.799e+05   8.443  < 2e-16
## qualityCodeDscrFAIR                   -4.417e+04  3.470e+04  -1.273 0.203083
## qualityCodeDscrGOOD                    2.901e+04  9.501e+03   3.053 0.002271
## qualityCodeDscrGOOD +                  4.235e+04  1.566e+04   2.705 0.006848
## qualityCodeDscrGOOD ++                 1.266e+05  1.467e+04   8.632  < 2e-16
## qualityCodeDscrLOW                    -1.273e+05  7.070e+04  -1.800 0.071860
## qualityCodeDscrVERY GOOD               1.764e+05  1.565e+04  11.273  < 2e-16
## qualityCodeDscrVERY GOOD +             3.542e+05  2.670e+04  13.268  < 2e-16
## qualityCodeDscrVERY GOOD ++            4.734e+05  2.255e+04  20.991  < 2e-16
## TotalFinishedSF                        1.261e+02  6.115e+00  20.630  < 2e-16
## nbrBedRoom.cat5+ Bedrooms              4.466e+04  9.598e+03   4.654 3.30e-06
## nbrBedRoom.catUp to 3 Bedrooms        -9.917e+03  7.193e+03  -1.379 0.168004
## HeatingDscrElectric                   -4.767e+04  3.523e+04  -1.353 0.176066
## HeatingDscrElectric Wall Heat (1500W)  8.863e+04  2.138e+05   0.414 0.678531
## HeatingDscrForced Air                 -2.240e+04  3.111e+04  -0.720 0.471528
## HeatingDscrGravity                    -3.479e+04  6.768e+04  -0.514 0.607228
## HeatingDscrHeat Pump                  -2.241e+05  1.110e+05  -2.019 0.043510
## HeatingDscrHot Water                   2.944e+04  3.268e+04   0.901 0.367586
## HeatingDscrNo HVAC                     2.040e+05  2.142e+05   0.952 0.341082
## HeatingDscrPackage Unit                5.780e+03  3.012e+05   0.019 0.984690
## HeatingDscrRadiant Floor               2.568e+05  4.453e+04   5.768 8.25e-09
## HeatingDscrVentilation Only           -4.648e+05  3.024e+05  -1.537 0.124299
## HeatingDscrWall Furnace                7.982e+04  4.563e+04   1.749 0.080303
## Roof_CoverDscrAsphalt                 -1.612e+04  7.115e+03  -2.265 0.023523
## Roof_CoverDscrBuilt-Up                 2.699e+05  1.163e+05   2.321 0.020299
## Roof_CoverDscrClay Tile               -3.931e+04  4.716e+04  -0.834 0.404547
## Roof_CoverDscrConcrete Tile           -1.098e+05  2.384e+04  -4.606 4.16e-06
## Roof_CoverDscrMetal                    1.697e+05  2.562e+04   6.623 3.68e-11
## Roof_CoverDscrRoll                    -1.451e+05  2.995e+05  -0.485 0.627925
## Roof_CoverDscrRubber Membrane          8.147e+04  2.670e+04   3.051 0.002288
## Roof_CoverDscrShake                   -7.813e+04  4.622e+04  -1.691 0.090935
## Roof_CoverDscrTar and Gravel           8.104e+04  1.512e+05   0.536 0.591916
## park                                  -1.127e+01  4.106e+00  -2.745 0.006053
## school                                 2.396e+00  3.081e+00   0.778 0.436822
## restaurant_nn1                         8.145e+00  3.653e+00   2.229 0.025808
## bus_stop_nn1                          -1.094e+01  5.324e+00  -2.054 0.039992
## company                                3.826e+00  5.299e+00   0.722 0.470281
## water_nn1                             -3.679e+00  7.696e+00  -0.478 0.632628
## lagPrice                               3.147e-01  1.282e-02  24.552  < 2e-16
##                                          
## (Intercept)                           ***
## age                                   ***
## urban_statusurban                     *  
## GEOID08013012102                      ***
## GEOID08013012103                      ***
## GEOID08013012104                      ***
## GEOID08013012105                      ***
## GEOID08013012201                         
## GEOID08013012202                      ***
## GEOID08013012203                      ***
## GEOID08013012204                      ***
## GEOID08013012300                         
## GEOID08013012401                      ***
## GEOID08013012501                      ***
## GEOID08013012505                      ***
## GEOID08013012507                      ***
## GEOID08013012508                      ***
## GEOID08013012509                      ***
## GEOID08013012510                      ***
## GEOID08013012511                      ***
## GEOID08013012603                      ***
## GEOID08013012605                      ***
## GEOID08013012607                      ***
## GEOID08013012608                      ***
## GEOID08013012701                      ***
## GEOID08013012705                      ***
## GEOID08013012707                      ***
## GEOID08013012708                      ***
## GEOID08013012709                      ***
## GEOID08013012710                      ***
## GEOID08013012800                      ***
## GEOID08013012903                      ***
## GEOID08013012904                      ***
## GEOID08013012905                      ***
## GEOID08013012907                      ***
## GEOID08013013003                      ***
## GEOID08013013004                      ***
## GEOID08013013005                      ***
## GEOID08013013006                      ***
## GEOID08013013201                      ***
## GEOID08013013202                      ***
## GEOID08013013205                      ***
## GEOID08013013207                      ***
## GEOID08013013208                      ***
## GEOID08013013210                      ***
## GEOID08013013211                      ***
## GEOID08013013212                      ***
## GEOID08013013213                      ***
## GEOID08013013302                      ***
## GEOID08013013305                      ***
## GEOID08013013306                      ***
## GEOID08013013307                      ***
## GEOID08013013308                      ***
## GEOID08013013401                      ***
## GEOID08013013402                      ***
## GEOID08013013503                      ***
## GEOID08013013505                      ***
## GEOID08013013506                      ***
## GEOID08013013507                      ***
## GEOID08013013508                      ***
## GEOID08013013601                      ***
## GEOID08013013602                      ***
## GEOID08013013701                      ***
## GEOID08013013702                      ***
## GEOID08013060600                      ***
## GEOID08013060700                      ***
## GEOID08013060800                      ***
## GEOID08013060900                      ***
## GEOID08013061300                      ***
## GEOID08013061400                      ***
## designCodeDscr2-3 Story               ** 
## designCodeDscrBi-level                *  
## designCodeDscrMULTI STORY- TOWNHOUSE  ***
## designCodeDscrSplit-level                
## qualityCodeDscrAVERAGE +              ** 
## qualityCodeDscrAVERAGE ++                
## qualityCodeDscrEXCELLENT              ***
## qualityCodeDscrEXCELLENT +            ***
## qualityCodeDscrEXCELLENT++            ***
## qualityCodeDscrEXCEPTIONAL 1          ***
## qualityCodeDscrEXCEPTIONAL 2          ***
## qualityCodeDscrFAIR                      
## qualityCodeDscrGOOD                   ** 
## qualityCodeDscrGOOD +                 ** 
## qualityCodeDscrGOOD ++                ***
## qualityCodeDscrLOW                    .  
## qualityCodeDscrVERY GOOD              ***
## qualityCodeDscrVERY GOOD +            ***
## qualityCodeDscrVERY GOOD ++           ***
## TotalFinishedSF                       ***
## nbrBedRoom.cat5+ Bedrooms             ***
## nbrBedRoom.catUp to 3 Bedrooms           
## HeatingDscrElectric                      
## HeatingDscrElectric Wall Heat (1500W)    
## HeatingDscrForced Air                    
## HeatingDscrGravity                       
## HeatingDscrHeat Pump                  *  
## HeatingDscrHot Water                     
## HeatingDscrNo HVAC                       
## HeatingDscrPackage Unit                  
## HeatingDscrRadiant Floor              ***
## HeatingDscrVentilation Only              
## HeatingDscrWall Furnace               .  
## Roof_CoverDscrAsphalt                 *  
## Roof_CoverDscrBuilt-Up                *  
## Roof_CoverDscrClay Tile                  
## Roof_CoverDscrConcrete Tile           ***
## Roof_CoverDscrMetal                   ***
## Roof_CoverDscrRoll                       
## Roof_CoverDscrRubber Membrane         ** 
## Roof_CoverDscrShake                   .  
## Roof_CoverDscrTar and Gravel             
## park                                  ** 
## school                                   
## restaurant_nn1                        *  
## bus_stop_nn1                          *  
## company                                  
## water_nn1                                
## lagPrice                              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 298400 on 11240 degrees of freedom
## Multiple R-squared:  0.7042, Adjusted R-squared:  0.7011 
## F-statistic: 226.8 on 118 and 11240 DF,  p-value: < 2.2e-16

3.3 Accounting for neighbourhood effects

In this regression model, we include all the variables above except the spatial structure, so that we can isolate the neighbourhood effects in the later analysis. The final adjusted R-squared is 0.7232, which means that 72.32% of the variance in sale price can be explained by the variables we picked. We also conducted cross-validation and found that the mean absolute error (MAE) is 155485.5. We hypothesized that introducing neighbourhoods to this model would enhance its accuracy, and we test this hypothesis in the results section that follows.

# Baseline model: ordinary least squares of price on the predictors kept by
# select(), fitted on Boulder.training. Because the formula is `price ~ .`,
# every selected column other than price becomes a predictor, and the column
# order below fixes the order of the terms in the coefficient table.
# GEOID, urban_status and lagPrice are not part of this specification.
reg.training <-
  lm(price ~ ., data = as.data.frame(Boulder.training) %>%
       dplyr::select(
         price,
         age,
         designCodeDscr, qualityCodeDscr,
         TotalFinishedSF, nbrBedRoom.cat,
         HeatingDscr, Roof_CoverDscr,
         park, school, restaurant_nn1, bus_stop_nn1, company, water_nn1
       ))

# Print the coefficient estimates, their significance and the overall fit.
summary(reg.training)
## 
## Call:
## lm(formula = price ~ ., data = as.data.frame(Boulder.training) %>% 
##     dplyr::select(price, age, designCodeDscr, qualityCodeDscr, 
##         TotalFinishedSF, nbrBedRoom.cat, HeatingDscr, Roof_CoverDscr, 
##         park, school, restaurant_nn1, bus_stop_nn1, company, 
##         water_nn1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2735250  -126430   -21526    85208  6692192 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            7.858e+05  4.396e+04  17.876  < 2e-16
## age                                    1.215e+03  2.876e+02   4.225 2.41e-05
## designCodeDscr2-3 Story               -6.035e+04  9.885e+03  -6.105 1.07e-09
## designCodeDscrBi-level                 4.440e+04  2.162e+04   2.054 0.040045
## designCodeDscrMULTI STORY- TOWNHOUSE  -2.425e+05  1.414e+04 -17.156  < 2e-16
## designCodeDscrSplit-level              8.595e+03  1.482e+04   0.580 0.561841
## qualityCodeDscrAVERAGE +              -7.745e+03  1.633e+04  -0.474 0.635247
## qualityCodeDscrAVERAGE ++              3.312e+04  1.671e+04   1.982 0.047551
## qualityCodeDscrEXCELLENT               1.306e+06  3.736e+04  34.944  < 2e-16
## qualityCodeDscrEXCELLENT +             1.545e+06  8.511e+04  18.151  < 2e-16
## qualityCodeDscrEXCELLENT++             2.214e+06  6.711e+04  32.996  < 2e-16
## qualityCodeDscrEXCEPTIONAL 1           1.328e+06  8.559e+04  15.516  < 2e-16
## qualityCodeDscrEXCEPTIONAL 2           1.719e+06  2.102e+05   8.177 3.30e-16
## qualityCodeDscrFAIR                   -1.226e+05  4.256e+04  -2.880 0.003980
## qualityCodeDscrGOOD                    7.516e+04  1.148e+04   6.549 6.10e-11
## qualityCodeDscrGOOD +                  1.040e+05  1.964e+04   5.293 1.23e-07
## qualityCodeDscrGOOD ++                 2.094e+05  1.788e+04  11.711  < 2e-16
## qualityCodeDscrLOW                    -2.305e+05  9.230e+04  -2.498 0.012515
## qualityCodeDscrVERY GOOD               3.090e+05  1.913e+04  16.159  < 2e-16
## qualityCodeDscrVERY GOOD +             7.182e+05  3.387e+04  21.208  < 2e-16
## qualityCodeDscrVERY GOOD ++            7.360e+05  2.784e+04  26.435  < 2e-16
## TotalFinishedSF                        1.489e+02  7.416e+00  20.080  < 2e-16
## nbrBedRoom.cat5+ Bedrooms              4.044e+04  1.225e+04   3.301 0.000968
## nbrBedRoom.catUp to 3 Bedrooms        -9.786e+03  9.203e+03  -1.063 0.287619
## HeatingDscrElectric                   -7.088e+04  4.325e+04  -1.639 0.101257
## HeatingDscrElectric Wall Heat (1500W)  2.158e+03  2.501e+05   0.009 0.993113
## HeatingDscrForced Air                 -1.082e+05  3.835e+04  -2.820 0.004810
## HeatingDscrGravity                    -8.029e+04  7.984e+04  -1.006 0.314570
## HeatingDscrHeat Pump                  -1.105e+05  1.300e+05  -0.850 0.395177
## HeatingDscrHot Water                   5.435e+04  4.026e+04   1.350 0.177021
## HeatingDscrNo HVAC                     9.983e+03  2.507e+05   0.040 0.968234
## HeatingDscrPackage Unit               -7.413e+04  3.513e+05  -0.211 0.832888
## HeatingDscrRadiant Floor               3.198e+05  5.343e+04   5.984 2.25e-09
## HeatingDscrVentilation Only           -5.377e+05  3.530e+05  -1.523 0.127785
## HeatingDscrWall Furnace                4.917e+04  5.616e+04   0.876 0.381277
## Roof_CoverDscrAsphalt                  2.786e+04  8.917e+03   3.124 0.001790
## Roof_CoverDscrBuilt-Up                 4.322e+05  1.472e+05   2.935 0.003341
## Roof_CoverDscrClay Tile                2.492e+04  5.638e+04   0.442 0.658513
## Roof_CoverDscrConcrete Tile           -2.529e+05  2.733e+04  -9.254  < 2e-16
## Roof_CoverDscrMetal                    2.056e+05  3.077e+04   6.681 2.51e-11
## Roof_CoverDscrRoll                    -2.553e+04  3.498e+05  -0.073 0.941835
## Roof_CoverDscrRubber Membrane          8.521e+04  3.239e+04   2.631 0.008540
## Roof_CoverDscrShake                   -2.329e+04  5.456e+04  -0.427 0.669467
## Roof_CoverDscrTar and Gravel           2.429e+05  1.761e+05   1.379 0.167798
## park                                   1.499e-01  4.242e+00   0.035 0.971812
## school                                 1.769e+01  1.600e+00  11.055  < 2e-16
## restaurant_nn1                        -1.574e+01  3.598e+00  -4.375 1.23e-05
## bus_stop_nn1                          -5.098e+01  1.684e+00 -30.268  < 2e-16
## company                                2.203e+01  2.067e+00  10.659  < 2e-16
## water_nn1                              2.451e+01  7.898e+00   3.103 0.001921
##                                          
## (Intercept)                           ***
## age                                   ***
## designCodeDscr2-3 Story               ***
## designCodeDscrBi-level                *  
## designCodeDscrMULTI STORY- TOWNHOUSE  ***
## designCodeDscrSplit-level                
## qualityCodeDscrAVERAGE +                 
## qualityCodeDscrAVERAGE ++             *  
## qualityCodeDscrEXCELLENT              ***
## qualityCodeDscrEXCELLENT +            ***
## qualityCodeDscrEXCELLENT++            ***
## qualityCodeDscrEXCEPTIONAL 1          ***
## qualityCodeDscrEXCEPTIONAL 2          ***
## qualityCodeDscrFAIR                   ** 
## qualityCodeDscrGOOD                   ***
## qualityCodeDscrGOOD +                 ***
## qualityCodeDscrGOOD ++                ***
## qualityCodeDscrLOW                    *  
## qualityCodeDscrVERY GOOD              ***
## qualityCodeDscrVERY GOOD +            ***
## qualityCodeDscrVERY GOOD ++           ***
## TotalFinishedSF                       ***
## nbrBedRoom.cat5+ Bedrooms             ***
## nbrBedRoom.catUp to 3 Bedrooms           
## HeatingDscrElectric                      
## HeatingDscrElectric Wall Heat (1500W)    
## HeatingDscrForced Air                 ** 
## HeatingDscrGravity                       
## HeatingDscrHeat Pump                     
## HeatingDscrHot Water                     
## HeatingDscrNo HVAC                       
## HeatingDscrPackage Unit                  
## HeatingDscrRadiant Floor              ***
## HeatingDscrVentilation Only              
## HeatingDscrWall Furnace                  
## Roof_CoverDscrAsphalt                 ** 
## Roof_CoverDscrBuilt-Up                ** 
## Roof_CoverDscrClay Tile                  
## Roof_CoverDscrConcrete Tile           ***
## Roof_CoverDscrMetal                   ***
## Roof_CoverDscrRoll                       
## Roof_CoverDscrRubber Membrane         ** 
## Roof_CoverDscrShake                      
## Roof_CoverDscrTar and Gravel             
## park                                     
## school                                ***
## restaurant_nn1                        ***
## bus_stop_nn1                          ***
## company                               ***
## water_nn1                             ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 349100 on 9069 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6038 
## F-statistic: 284.6 on 49 and 9069 DF,  p-value: < 2.2e-16
# Score the baseline model (reg.training) on Boulder.testing and derive the
# per-sale error columns:
#   Regression     - label identifying which model produced the prediction
#   price.Predict  - price predicted by reg.training
#   price.Error    - signed error (predicted minus observed)
#   price.AbsError - absolute error
#   price.APE      - absolute percentage error relative to the OBSERVED price
# The APE denominator is the observed price, which is the conventional
# definition. Dividing by the prediction instead distorts the percentage and
# turns it negative whenever the linear model predicts a negative price.
Boulder.testing <-
  Boulder.testing %>%
  mutate(Regression = "Baseline Regression",
         price.Predict = predict(reg.training, Boulder.testing),
         price.Error = price.Predict - price,
         price.AbsError = abs(price.Error),
         price.APE = price.AbsError / price) %>%
  # Keep only sales priced below 8,000,000.
  filter(price < 8000000)

## Generalizability
# Cross-validate a linear model of price on the training data using 100 folds.
fitControl <- trainControl(method = "cv", number = 100)

# Fixed seed so the random fold assignment is repeatable between runs.
set.seed(825)

# `price ~ .` uses every column kept by select() as a predictor. This
# specification contains urban_status and lagPrice but no GEOID term.
# na.action = na.pass hands rows with missing values on unchanged rather than
# applying train()'s own missing-value handling.
reg.cv <-
  train(
    price ~ .,
    data = Boulder.training %>%
      st_drop_geometry() %>%
      dplyr::select(
        price,
        age,
        urban_status,
        designCodeDscr, qualityCodeDscr,
        TotalFinishedSF, nbrBedRoom.cat,
        HeatingDscr, Roof_CoverDscr,
        park, school, restaurant_nn1, bus_stop_nn1, company, water_nn1,
        lagPrice
      ),
    method = "lm",
    trControl = fitControl,
    na.action = na.pass
  )

# Print the resampling summary (RMSE, Rsquared, MAE averaged over the folds).
reg.cv
## Linear Regression 
## 
## 9119 samples
##   15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (100 fold) 
## Summary of sample sizes: 9029, 9028, 9027, 9028, 9028, 9027, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   293539.7  0.723245  155487.2
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Mean of the per-fold MAE values from the cross-validation. The column is
# selected by name rather than by position, so the result does not depend on
# the column order of reg.cv$resample.
mean(reg.cv$resample[["MAE"]])
## [1] 155487.2

4 Results

4.1 Results of training dataset

Table 2 shows the summary results of our training dataset. According to the p-values and t-values, the majority of factors are significant. Variables categorized as internal characteristics (total finished size, quality code, age, etc.) and spatial structure (most neighbourhoods and the price lag) play the most significant roles in the OLS regression model of our training dataset. According to the coefficients, houses that are newer, of higher quality, larger, and with more bedrooms are associated with higher sale prices, which accords with our expectations. Except for the distance to the restaurant, public services are not statistically significant in our designed model. The adjusted R-squared is 0.7267, which means that 72.67% of the variance in sale price can be explained by all the predictors we chose.

# Neighbourhood model: the same `price ~ .` least-squares fit on
# Boulder.training, with GEOID, urban_status and lagPrice among the selected
# columns. The column order in select() fixes the order of the terms in the
# coefficient table.
reg.nhood <-
  lm(price ~ ., data = st_drop_geometry(Boulder.training) %>%
       dplyr::select(
         price,
         age,
         GEOID, urban_status,
         designCodeDscr, qualityCodeDscr,
         TotalFinishedSF, nbrBedRoom.cat,
         HeatingDscr, Roof_CoverDscr,
         park, school, restaurant_nn1, bus_stop_nn1, company, water_nn1,
         lagPrice
       ))

# Render the tidied coefficient table (term, estimate, std.error, statistic,
# p.value) as a styled table with its caption.
tidy(reg.nhood) %>%
  kable(caption = "Table 2. summary results of training dataset") %>%
  kable_styling()
Table 2. summary results of training dataset
term estimate std.error statistic p.value
(Intercept) 8.893211e+05 5.492575e+04 16.1913341 0.0000000
age -1.694320e+03 2.611500e+02 -6.4879161 0.0000000
GEOID08013012102 -2.577510e+05 3.369440e+04 -7.6496687 0.0000000
GEOID08013012103 -2.123701e+05 4.054170e+04 -5.2383127 0.0000002
GEOID08013012104 -1.174276e+05 4.256825e+04 -2.7585731 0.0058172
GEOID08013012105 -2.932291e+05 4.040904e+04 -7.2565233 0.0000000
GEOID08013012201 3.267974e+04 4.249490e+04 0.7690275 0.4418972
GEOID08013012202 -2.710871e+05 5.489525e+04 -4.9382611 0.0000008
GEOID08013012203 -4.985736e+05 4.551599e+04 -10.9538128 0.0000000
GEOID08013012204 4.151615e+05 7.882404e+04 5.2669402 0.0000001
GEOID08013012300 -4.833522e+04 2.068784e+05 -0.2336407 0.8152693
GEOID08013012401 -1.453286e+05 4.542295e+04 -3.1994531 0.0013817
GEOID08013012501 -4.249189e+05 5.496778e+04 -7.7303265 0.0000000
GEOID08013012505 -1.728226e+05 3.823529e+04 -4.5199752 0.0000063
GEOID08013012507 -4.310379e+05 4.498063e+04 -9.5827461 0.0000000
GEOID08013012508 -4.103342e+05 5.427016e+04 -7.5609540 0.0000000
GEOID08013012509 -3.000941e+05 4.405788e+04 -6.8113613 0.0000000
GEOID08013012510 -1.502248e+05 4.255698e+04 -3.5299688 0.0004177
GEOID08013012511 -4.559635e+05 5.332887e+04 -8.5500310 0.0000000
GEOID08013012603 -4.269271e+05 4.257071e+04 -10.0286588 0.0000000
GEOID08013012605 -4.726432e+05 1.223867e+05 -3.8618850 0.0001133
GEOID08013012607 -4.386106e+05 7.926728e+04 -5.5333128 0.0000000
GEOID08013012608 -4.541142e+05 4.748705e+04 -9.5629048 0.0000000
GEOID08013012701 -5.160196e+05 3.922808e+04 -13.1543424 0.0000000
GEOID08013012705 -5.125036e+05 5.884075e+04 -8.7100105 0.0000000
GEOID08013012707 -2.841530e+05 6.286795e+04 -4.5198394 0.0000063
GEOID08013012708 -5.351398e+05 4.693536e+04 -11.4016342 0.0000000
GEOID08013012709 -5.198687e+05 5.813934e+04 -8.9417715 0.0000000
GEOID08013012710 -3.547513e+05 4.282652e+04 -8.2834511 0.0000000
GEOID08013012800 -6.966864e+05 5.593500e+04 -12.4552863 0.0000000
GEOID08013012903 -6.069328e+05 5.651994e+04 -10.7383844 0.0000000
GEOID08013012904 -5.492873e+05 4.630304e+04 -11.8628780 0.0000000
GEOID08013012905 -4.903037e+05 6.059436e+04 -8.0915721 0.0000000
GEOID08013012907 -4.970911e+05 5.681057e+04 -8.7499759 0.0000000
GEOID08013013003 -4.931513e+05 4.284049e+04 -11.5113353 0.0000000
GEOID08013013004 -4.845179e+05 4.603950e+04 -10.5239606 0.0000000
GEOID08013013005 -4.769830e+05 4.611979e+04 -10.3422635 0.0000000
GEOID08013013006 -4.596758e+05 4.028766e+04 -11.4098403 0.0000000
GEOID08013013201 -5.352633e+05 7.845758e+04 -6.8223280 0.0000000
GEOID08013013202 -2.452556e+05 7.215767e+04 -3.3988843 0.0006795
GEOID08013013205 -5.517420e+05 5.326846e+04 -10.3577618 0.0000000
GEOID08013013207 -6.147870e+05 6.809967e+04 -9.0277525 0.0000000
GEOID08013013208 -6.115623e+05 6.386656e+04 -9.5756274 0.0000000
GEOID08013013210 -5.655319e+05 6.056723e+04 -9.3372583 0.0000000
GEOID08013013211 -6.565803e+05 5.694002e+04 -11.5310863 0.0000000
GEOID08013013212 -5.681172e+05 5.797262e+04 -9.7997508 0.0000000
GEOID08013013213 -6.935187e+05 5.432134e+04 -12.7669677 0.0000000
GEOID08013013302 -4.928778e+05 6.236340e+04 -7.9033181 0.0000000
GEOID08013013305 -5.727005e+05 6.737697e+04 -8.4999444 0.0000000
GEOID08013013306 -5.587540e+05 7.060516e+04 -7.9137841 0.0000000
GEOID08013013307 -5.736340e+05 6.715546e+04 -8.5418812 0.0000000
GEOID08013013308 -5.143094e+05 6.744503e+04 -7.6256088 0.0000000
GEOID08013013401 -5.023131e+05 6.771046e+04 -7.4185449 0.0000000
GEOID08013013402 -6.149091e+05 6.500894e+04 -9.4588388 0.0000000
GEOID08013013503 -5.542901e+05 6.776112e+04 -8.1800614 0.0000000
GEOID08013013505 -5.564080e+05 7.658938e+04 -7.2648195 0.0000000
GEOID08013013506 -6.398578e+05 6.880859e+04 -9.2990976 0.0000000
GEOID08013013507 -5.954870e+05 6.947707e+04 -8.5709858 0.0000000
GEOID08013013508 -6.420151e+05 6.739493e+04 -9.5261635 0.0000000
GEOID08013013601 -5.484979e+05 6.135548e+04 -8.9396731 0.0000000
GEOID08013013602 -5.254734e+05 6.502775e+04 -8.0807559 0.0000000
GEOID08013013701 -5.023326e+05 4.138079e+04 -12.1392707 0.0000000
GEOID08013013702 -5.714023e+05 4.805335e+04 -11.8909983 0.0000000
GEOID08013060600 -6.629024e+05 3.793964e+04 -17.4725542 0.0000000
GEOID08013060700 -5.477331e+05 4.552775e+04 -12.0307538 0.0000000
GEOID08013060800 -5.110172e+05 5.425024e+04 -9.4196300 0.0000000
GEOID08013060900 -6.146713e+05 4.689233e+04 -13.1081420 0.0000000
GEOID08013061300 -6.882637e+05 4.609559e+04 -14.9312258 0.0000000
GEOID08013061400 -6.592551e+05 4.410467e+04 -14.9475130 0.0000000
urban_statusurban -2.784517e+04 1.618221e+04 -1.7207272 0.0853347
designCodeDscr2-3 Story -2.898402e+04 8.593303e+03 -3.3728617 0.0007470
designCodeDscrBi-level 4.715352e+04 1.835847e+04 2.5684893 0.0102303
designCodeDscrMULTI STORY- TOWNHOUSE -1.245953e+05 1.271913e+04 -9.7958998 0.0000000
designCodeDscrSplit-level 2.123469e+04 1.268721e+04 1.6737075 0.0942228
qualityCodeDscrAVERAGE + -4.314465e+04 1.402002e+04 -3.0773592 0.0020947
qualityCodeDscrAVERAGE ++ -6.814125e+03 1.431565e+04 -0.4759912 0.6340922
qualityCodeDscrEXCELLENT 9.129136e+05 3.247932e+04 28.1075357 0.0000000
qualityCodeDscrEXCELLENT + 1.107320e+06 7.173182e+04 15.4369400 0.0000000
qualityCodeDscrEXCELLENT++ 1.745379e+06 5.708364e+04 30.5758123 0.0000000
qualityCodeDscrEXCEPTIONAL 1 9.275892e+05 7.246713e+04 12.8001376 0.0000000
qualityCodeDscrEXCEPTIONAL 2 1.386462e+06 1.761342e+05 7.8716240 0.0000000
qualityCodeDscrFAIR -4.974818e+04 3.615091e+04 -1.3761253 0.1688171
qualityCodeDscrGOOD 2.265787e+04 1.033216e+04 2.1929463 0.0283367
qualityCodeDscrGOOD + 3.438810e+04 1.704543e+04 2.0174386 0.0436794
qualityCodeDscrGOOD ++ 1.184495e+05 1.592375e+04 7.4385438 0.0000000
qualityCodeDscrLOW -1.042651e+05 7.765507e+04 -1.3426700 0.1794127
qualityCodeDscrVERY GOOD 1.738849e+05 1.687169e+04 10.3063130 0.0000000
qualityCodeDscrVERY GOOD + 3.801528e+05 2.930885e+04 12.9705837 0.0000000
qualityCodeDscrVERY GOOD ++ 4.978339e+05 2.446135e+04 20.3518514 0.0000000
TotalFinishedSF 1.367235e+02 6.530435e+00 20.9363478 0.0000000
nbrBedRoom.cat5+ Bedrooms 4.239169e+04 1.029384e+04 4.1181612 0.0000385
nbrBedRoom.catUp to 3 Bedrooms -1.090800e+04 7.728250e+03 -1.4114448 0.1581481
HeatingDscrElectric -4.604484e+04 3.648099e+04 -1.2621598 0.2069240
HeatingDscrElectric Wall Heat (1500W) 8.146415e+04 2.082973e+05 0.3910956 0.6957358
HeatingDscrForced Air -2.047118e+04 3.220163e+04 -0.6357187 0.5249760
HeatingDscrGravity -4.472803e+04 6.669198e+04 -0.6706658 0.5024506
HeatingDscrHeat Pump -2.556164e+05 1.086514e+05 -2.3526281 0.0186627
HeatingDscrHot Water 2.679087e+04 3.378022e+04 0.7930936 0.4277442
HeatingDscrNo HVAC 2.036438e+05 2.087373e+05 0.9755985 0.3292897
HeatingDscrPackage Unit 1.350673e+04 2.931802e+05 0.0460697 0.9632557
HeatingDscrRadiant Floor 2.357667e+05 4.485083e+04 5.2566857 0.0000002
HeatingDscrVentilation Only -4.257058e+05 2.945207e+05 -1.4454191 0.1483750
HeatingDscrWall Furnace 8.825565e+04 4.702262e+04 1.8768766 0.0605673
Roof_CoverDscrAsphalt -1.344134e+04 7.642054e+03 -1.7588647 0.0786345
Roof_CoverDscrBuilt-Up 4.246797e+05 1.229092e+05 3.4552295 0.0005524
Roof_CoverDscrClay Tile -5.990298e+04 4.720520e+04 -1.2689911 0.2044771
Roof_CoverDscrConcrete Tile -1.299805e+05 2.495116e+04 -5.2093975 0.0000002
Roof_CoverDscrMetal 1.261479e+05 2.583480e+04 4.8828685 0.0000011
Roof_CoverDscrRoll -1.700284e+05 2.913044e+05 -0.5836794 0.5594506
Roof_CoverDscrRubber Membrane 9.947722e+04 2.776302e+04 3.5830833 0.0003414
Roof_CoverDscrShake -8.088747e+04 4.558041e+04 -1.7746105 0.0759960
Roof_CoverDscrTar and Gravel 3.378294e+04 1.471402e+05 0.2295970 0.8184101
park -1.087381e+01 4.340866e+00 -2.5049866 0.0122631
school 3.584541e+00 3.286894e+00 1.0905557 0.2754976
restaurant_nn1 9.420545e+00 3.952131e+00 2.3836620 0.0171619
bus_stop_nn1 -1.233675e+01 5.685006e+00 -2.1700500 0.0300291
company 6.237236e+00 5.658850e+00 1.1022091 0.2704003
water_nn1 -9.872340e-01 8.266886e+00 -0.1194203 0.9049450
lagPrice 3.309460e-01 1.391190e-02 23.7887729 0.0000000
# Print the full lm summary for the neighbourhood model: the call, residual
# quantiles and the coefficient table.
summary(reg.nhood)
## 
## Call:
## lm(formula = price ~ ., data = st_drop_geometry(Boulder.training) %>% 
##     dplyr::select(price, age, GEOID, urban_status, designCodeDscr, 
##         qualityCodeDscr, TotalFinishedSF, nbrBedRoom.cat, HeatingDscr, 
##         Roof_CoverDscr, park, school, restaurant_nn1, bus_stop_nn1, 
##         company, water_nn1, lagPrice))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2230996   -84580    -5029    75868  6271992 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            8.893e+05  5.493e+04  16.191  < 2e-16
## age                                   -1.694e+03  2.612e+02  -6.488 9.16e-11
## GEOID08013012102                      -2.578e+05  3.369e+04  -7.650 2.22e-14
## GEOID08013012103                      -2.124e+05  4.054e+04  -5.238 1.66e-07
## GEOID08013012104                      -1.174e+05  4.257e+04  -2.759 0.005817
## GEOID08013012105                      -2.932e+05  4.041e+04  -7.257 4.30e-13
## GEOID08013012201                       3.268e+04  4.249e+04   0.769 0.441897
## GEOID08013012202                      -2.711e+05  5.490e+04  -4.938 8.02e-07
## GEOID08013012203                      -4.986e+05  4.552e+04 -10.954  < 2e-16
## GEOID08013012204                       4.152e+05  7.882e+04   5.267 1.42e-07
## GEOID08013012300                      -4.834e+04  2.069e+05  -0.234 0.815269
## GEOID08013012401                      -1.453e+05  4.542e+04  -3.199 0.001382
## GEOID08013012501                      -4.249e+05  5.497e+04  -7.730 1.19e-14
## GEOID08013012505                      -1.728e+05  3.824e+04  -4.520 6.26e-06
## GEOID08013012507                      -4.310e+05  4.498e+04  -9.583  < 2e-16
## GEOID08013012508                      -4.103e+05  5.427e+04  -7.561 4.39e-14
## GEOID08013012509                      -3.001e+05  4.406e+04  -6.811 1.03e-11
## GEOID08013012510                      -1.502e+05  4.256e+04  -3.530 0.000418
## GEOID08013012511                      -4.560e+05  5.333e+04  -8.550  < 2e-16
## GEOID08013012603                      -4.269e+05  4.257e+04 -10.029  < 2e-16
## GEOID08013012605                      -4.726e+05  1.224e+05  -3.862 0.000113
## GEOID08013012607                      -4.386e+05  7.927e+04  -5.533 3.23e-08
## GEOID08013012608                      -4.541e+05  4.749e+04  -9.563  < 2e-16
## GEOID08013012701                      -5.160e+05  3.923e+04 -13.154  < 2e-16
## GEOID08013012705                      -5.125e+05  5.884e+04  -8.710  < 2e-16
## GEOID08013012707                      -2.842e+05  6.287e+04  -4.520 6.27e-06
## GEOID08013012708                      -5.351e+05  4.694e+04 -11.402  < 2e-16
## GEOID08013012709                      -5.199e+05  5.814e+04  -8.942  < 2e-16
## GEOID08013012710                      -3.548e+05  4.283e+04  -8.283  < 2e-16
## GEOID08013012800                      -6.967e+05  5.593e+04 -12.455  < 2e-16
## GEOID08013012903                      -6.069e+05  5.652e+04 -10.738  < 2e-16
## GEOID08013012904                      -5.493e+05  4.630e+04 -11.863  < 2e-16
## GEOID08013012905                      -4.903e+05  6.059e+04  -8.092 6.65e-16
## GEOID08013012907                      -4.971e+05  5.681e+04  -8.750  < 2e-16
## GEOID08013013003                      -4.932e+05  4.284e+04 -11.511  < 2e-16
## GEOID08013013004                      -4.845e+05  4.604e+04 -10.524  < 2e-16
## GEOID08013013005                      -4.770e+05  4.612e+04 -10.342  < 2e-16
## GEOID08013013006                      -4.597e+05  4.029e+04 -11.410  < 2e-16
## GEOID08013013201                      -5.353e+05  7.846e+04  -6.822 9.54e-12
## GEOID08013013202                      -2.453e+05  7.216e+04  -3.399 0.000680
## GEOID08013013205                      -5.517e+05  5.327e+04 -10.358  < 2e-16
## GEOID08013013207                      -6.148e+05  6.810e+04  -9.028  < 2e-16
## GEOID08013013208                      -6.116e+05  6.387e+04  -9.576  < 2e-16
## GEOID08013013210                      -5.655e+05  6.057e+04  -9.337  < 2e-16
## GEOID08013013211                      -6.566e+05  5.694e+04 -11.531  < 2e-16
## GEOID08013013212                      -5.681e+05  5.797e+04  -9.800  < 2e-16
## GEOID08013013213                      -6.935e+05  5.432e+04 -12.767  < 2e-16
## GEOID08013013302                      -4.929e+05  6.236e+04  -7.903 3.04e-15
## GEOID08013013305                      -5.727e+05  6.738e+04  -8.500  < 2e-16
## GEOID08013013306                      -5.588e+05  7.061e+04  -7.914 2.79e-15
## GEOID08013013307                      -5.736e+05  6.716e+04  -8.542  < 2e-16
## GEOID08013013308                      -5.143e+05  6.745e+04  -7.626 2.68e-14
## GEOID08013013401                      -5.023e+05  6.771e+04  -7.419 1.29e-13
## GEOID08013013402                      -6.149e+05  6.501e+04  -9.459  < 2e-16
## GEOID08013013503                      -5.543e+05  6.776e+04  -8.180 3.22e-16
## GEOID08013013505                      -5.564e+05  7.659e+04  -7.265 4.05e-13
## GEOID08013013506                      -6.399e+05  6.881e+04  -9.299  < 2e-16
## GEOID08013013507                      -5.955e+05  6.948e+04  -8.571  < 2e-16
## GEOID08013013508                      -6.420e+05  6.739e+04  -9.526  < 2e-16
## GEOID08013013601                      -5.485e+05  6.136e+04  -8.940  < 2e-16
## GEOID08013013602                      -5.255e+05  6.503e+04  -8.081 7.27e-16
## GEOID08013013701                      -5.023e+05  4.138e+04 -12.139  < 2e-16
## GEOID08013013702                      -5.714e+05  4.805e+04 -11.891  < 2e-16
## GEOID08013060600                      -6.629e+05  3.794e+04 -17.473  < 2e-16
## GEOID08013060700                      -5.477e+05  4.553e+04 -12.031  < 2e-16
## GEOID08013060800                      -5.110e+05  5.425e+04  -9.420  < 2e-16
## GEOID08013060900                      -6.147e+05  4.689e+04 -13.108  < 2e-16
## GEOID08013061300                      -6.883e+05  4.610e+04 -14.931  < 2e-16
## GEOID08013061400                      -6.593e+05  4.410e+04 -14.948  < 2e-16
## urban_statusurban                     -2.785e+04  1.618e+04  -1.721 0.085335
## designCodeDscr2-3 Story               -2.898e+04  8.593e+03  -3.373 0.000747
## designCodeDscrBi-level                 4.715e+04  1.836e+04   2.568 0.010230
## designCodeDscrMULTI STORY- TOWNHOUSE  -1.246e+05  1.272e+04  -9.796  < 2e-16
## designCodeDscrSplit-level              2.123e+04  1.269e+04   1.674 0.094223
## qualityCodeDscrAVERAGE +              -4.314e+04  1.402e+04  -3.077 0.002095
## qualityCodeDscrAVERAGE ++             -6.814e+03  1.432e+04  -0.476 0.634092
## qualityCodeDscrEXCELLENT               9.129e+05  3.248e+04  28.108  < 2e-16
## qualityCodeDscrEXCELLENT +             1.107e+06  7.173e+04  15.437  < 2e-16
## qualityCodeDscrEXCELLENT++             1.745e+06  5.708e+04  30.576  < 2e-16
## qualityCodeDscrEXCEPTIONAL 1           9.276e+05  7.247e+04  12.800  < 2e-16
## qualityCodeDscrEXCEPTIONAL 2           1.386e+06  1.761e+05   7.872 3.91e-15
## qualityCodeDscrFAIR                   -4.975e+04  3.615e+04  -1.376 0.168817
## qualityCodeDscrGOOD                    2.266e+04  1.033e+04   2.193 0.028337
## qualityCodeDscrGOOD +                  3.439e+04  1.705e+04   2.017 0.043679
## qualityCodeDscrGOOD ++                 1.184e+05  1.592e+04   7.439 1.11e-13
## qualityCodeDscrLOW                    -1.043e+05  7.766e+04  -1.343 0.179413
## qualityCodeDscrVERY GOOD               1.739e+05  1.687e+04  10.306  < 2e-16
## qualityCodeDscrVERY GOOD +             3.802e+05  2.931e+04  12.971  < 2e-16
## qualityCodeDscrVERY GOOD ++            4.978e+05  2.446e+04  20.352  < 2e-16
## TotalFinishedSF                        1.367e+02  6.530e+00  20.936  < 2e-16
## nbrBedRoom.cat5+ Bedrooms              4.239e+04  1.029e+04   4.118 3.85e-05
## nbrBedRoom.catUp to 3 Bedrooms        -1.091e+04  7.728e+03  -1.411 0.158148
## HeatingDscrElectric                   -4.604e+04  3.648e+04  -1.262 0.206924
## HeatingDscrElectric Wall Heat (1500W)  8.146e+04  2.083e+05   0.391 0.695736
## HeatingDscrForced Air                 -2.047e+04  3.220e+04  -0.636 0.524976
## HeatingDscrGravity                    -4.473e+04  6.669e+04  -0.671 0.502451
## HeatingDscrHeat Pump                  -2.556e+05  1.087e+05  -2.353 0.018663
## HeatingDscrHot Water                   2.679e+04  3.378e+04   0.793 0.427744
## HeatingDscrNo HVAC                     2.036e+05  2.087e+05   0.976 0.329290
## HeatingDscrPackage Unit                1.351e+04  2.932e+05   0.046 0.963256
## HeatingDscrRadiant Floor               2.358e+05  4.485e+04   5.257 1.50e-07
## HeatingDscrVentilation Only           -4.257e+05  2.945e+05  -1.445 0.148375
## HeatingDscrWall Furnace                8.826e+04  4.702e+04   1.877 0.060567
## Roof_CoverDscrAsphalt                 -1.344e+04  7.642e+03  -1.759 0.078634
## Roof_CoverDscrBuilt-Up                 4.247e+05  1.229e+05   3.455 0.000552
## Roof_CoverDscrClay Tile               -5.990e+04  4.721e+04  -1.269 0.204477
## Roof_CoverDscrConcrete Tile           -1.300e+05  2.495e+04  -5.209 1.94e-07
## Roof_CoverDscrMetal                    1.261e+05  2.583e+04   4.883 1.06e-06
## Roof_CoverDscrRoll                    -1.700e+05  2.913e+05  -0.584 0.559451
## Roof_CoverDscrRubber Membrane          9.948e+04  2.776e+04   3.583 0.000341
## Roof_CoverDscrShake                   -8.089e+04  4.558e+04  -1.775 0.075996
## Roof_CoverDscrTar and Gravel           3.378e+04  1.471e+05   0.230 0.818410
## park                                  -1.087e+01  4.341e+00  -2.505 0.012263
## school                                 3.585e+00  3.287e+00   1.091 0.275498
## restaurant_nn1                         9.421e+00  3.952e+00   2.384 0.017162
## bus_stop_nn1                          -1.234e+01  5.685e+00  -2.170 0.030029
## company                                6.237e+00  5.659e+00   1.102 0.270400
## water_nn1                             -9.872e-01  8.267e+00  -0.119 0.904945
## lagPrice                               3.310e-01  1.391e-02  23.789  < 2e-16
##                                          
## (Intercept)                           ***
## age                                   ***
## GEOID08013012102                      ***
## GEOID08013012103                      ***
## GEOID08013012104                      ** 
## GEOID08013012105                      ***
## GEOID08013012201                         
## GEOID08013012202                      ***
## GEOID08013012203                      ***
## GEOID08013012204                      ***
## GEOID08013012300                         
## GEOID08013012401                      ** 
## GEOID08013012501                      ***
## GEOID08013012505                      ***
## GEOID08013012507                      ***
## GEOID08013012508                      ***
## GEOID08013012509                      ***
## GEOID08013012510                      ***
## GEOID08013012511                      ***
## GEOID08013012603                      ***
## GEOID08013012605                      ***
## GEOID08013012607                      ***
## GEOID08013012608                      ***
## GEOID08013012701                      ***
## GEOID08013012705                      ***
## GEOID08013012707                      ***
## GEOID08013012708                      ***
## GEOID08013012709                      ***
## GEOID08013012710                      ***
## GEOID08013012800                      ***
## GEOID08013012903                      ***
## GEOID08013012904                      ***
## GEOID08013012905                      ***
## GEOID08013012907                      ***
## GEOID08013013003                      ***
## GEOID08013013004                      ***
## GEOID08013013005                      ***
## GEOID08013013006                      ***
## GEOID08013013201                      ***
## GEOID08013013202                      ***
## GEOID08013013205                      ***
## GEOID08013013207                      ***
## GEOID08013013208                      ***
## GEOID08013013210                      ***
## GEOID08013013211                      ***
## GEOID08013013212                      ***
## GEOID08013013213                      ***
## GEOID08013013302                      ***
## GEOID08013013305                      ***
## GEOID08013013306                      ***
## GEOID08013013307                      ***
## GEOID08013013308                      ***
## GEOID08013013401                      ***
## GEOID08013013402                      ***
## GEOID08013013503                      ***
## GEOID08013013505                      ***
## GEOID08013013506                      ***
## GEOID08013013507                      ***
## GEOID08013013508                      ***
## GEOID08013013601                      ***
## GEOID08013013602                      ***
## GEOID08013013701                      ***
## GEOID08013013702                      ***
## GEOID08013060600                      ***
## GEOID08013060700                      ***
## GEOID08013060800                      ***
## GEOID08013060900                      ***
## GEOID08013061300                      ***
## GEOID08013061400                      ***
## urban_statusurban                     .  
## designCodeDscr2-3 Story               ***
## designCodeDscrBi-level                *  
## designCodeDscrMULTI STORY- TOWNHOUSE  ***
## designCodeDscrSplit-level             .  
## qualityCodeDscrAVERAGE +              ** 
## qualityCodeDscrAVERAGE ++                
## qualityCodeDscrEXCELLENT              ***
## qualityCodeDscrEXCELLENT +            ***
## qualityCodeDscrEXCELLENT++            ***
## qualityCodeDscrEXCEPTIONAL 1          ***
## qualityCodeDscrEXCEPTIONAL 2          ***
## qualityCodeDscrFAIR                      
## qualityCodeDscrGOOD                   *  
## qualityCodeDscrGOOD +                 *  
## qualityCodeDscrGOOD ++                ***
## qualityCodeDscrLOW                       
## qualityCodeDscrVERY GOOD              ***
## qualityCodeDscrVERY GOOD +            ***
## qualityCodeDscrVERY GOOD ++           ***
## TotalFinishedSF                       ***
## nbrBedRoom.cat5+ Bedrooms             ***
## nbrBedRoom.catUp to 3 Bedrooms           
## HeatingDscrElectric                      
## HeatingDscrElectric Wall Heat (1500W)    
## HeatingDscrForced Air                    
## HeatingDscrGravity                       
## HeatingDscrHeat Pump                  *  
## HeatingDscrHot Water                     
## HeatingDscrNo HVAC                       
## HeatingDscrPackage Unit                  
## HeatingDscrRadiant Floor              ***
## HeatingDscrVentilation Only              
## HeatingDscrWall Furnace               .  
## Roof_CoverDscrAsphalt                 .  
## Roof_CoverDscrBuilt-Up                ***
## Roof_CoverDscrClay Tile                  
## Roof_CoverDscrConcrete Tile           ***
## Roof_CoverDscrMetal                   ***
## Roof_CoverDscrRoll                       
## Roof_CoverDscrRubber Membrane         ***
## Roof_CoverDscrShake                   .  
## Roof_CoverDscrTar and Gravel             
## park                                  *  
## school                                   
## restaurant_nn1                        *  
## bus_stop_nn1                          *  
## company                                  
## water_nn1                                
## lagPrice                              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 290000 on 9000 degrees of freedom
## Multiple R-squared:  0.7302, Adjusted R-squared:  0.7266 
## F-statistic: 206.4 on 118 and 9000 DF,  p-value: < 2.2e-16

4.2 Results of test dataset

Table 3 shows the mean absolute error (AbsError) and the mean absolute percentage error (MAPE) for a single test set, comparing the models with and without neighbourhood effects. The model with neighbourhood effects is notably more accurate: both MAPE and AbsError decrease once neighbourhoods are introduced. However, a model with a MAPE of 18.9% still needs further improvement.

# Fit the neighbourhood-effects OLS model on the training set. `price ~ .`
# regresses price on every other column kept by select() below: structural
# attributes, the GEOID and urban_status spatial terms, the amenity features
# and the spatial lag of price (lagPrice).
reg.nhood.training <- lm(
  price ~ .,
  data = dplyr::select(
    st_drop_geometry(Boulder.training),
    price,
    age,
    GEOID,
    urban_status,
    designCodeDscr,
    qualityCodeDscr,
    TotalFinishedSF,
    nbrBedRoom.cat,
    HeatingDscr,
    Roof_CoverDscr,
    park,
    school,
    restaurant_nn1,
    bus_stop_nn1,
    company,
    water_nn1,
    lagPrice
  )
)
# Score the test set with the neighbourhood-effects model and derive the
# error measures used in the tables and figures that follow.
# NOTE(review): price.APE divides by the *predicted* price, whereas MAPE is
# conventionally computed against the observed price. It is kept as is here;
# confirm it matches how the baseline test set's price.APE was computed
# before changing either.
housing.test.nhood <- housing.test.nhood %>%
  mutate(
    Regression = "Neighbourhood effects",
    price.Predict = predict(reg.nhood.training, housing.test.nhood),
    price.Error = price.Predict - price,
    price.AbsError = abs(price.Error),
    price.APE = price.AbsError / price.Predict
  ) %>%
  # Keep only sales below $8M.
  filter(price < 8000000)

# Build 5-nearest-neighbour spatial weights (style = "W") for each test set
# and use them to compute the spatial lag of the absolute prediction error.

# Neighbourhood-effects test set.
coords_1 <- housing.test.nhood %>% st_coordinates()
neighborList_1 <- coords_1 %>% knearneigh(k = 5) %>% knn2nb()
spatialWeights_1 <- neighborList_1 %>% nb2listw(style = "W")
housing.test.nhood$lagPriceError <-
  lag.listw(spatialWeights_1, housing.test.nhood$price.AbsError)

# Boulder.testing (presumably the baseline-regression test set; it is built
# outside this chunk).
coords_2 <- Boulder.testing %>% st_coordinates()
neighborList_2 <- coords_2 %>% knearneigh(k = 5) %>% knn2nb()
spatialWeights_2 <- neighborList_2 %>% nb2listw(style = "W")
Boulder.testing$lagPriceError <-
  lag.listw(spatialWeights_2, Boulder.testing$price.AbsError)

# Stack the two test sets into one long table keyed by `Regression`.
# starts_with("price") keeps only the price* columns, so the existing
# lagPriceError column is dropped and then rebuilt here from the *signed*
# error (price.Error) using each set's own spatial weights.
comparison <- rbind(
  Boulder.testing %>%
    dplyr::select(starts_with("price"), Regression) %>%
    mutate(lagPriceError = lag.listw(spatialWeights_2, price.Error)),
  housing.test.nhood %>%
    dplyr::select(starts_with("price"), Regression) %>%
    mutate(lagPriceError = lag.listw(spatialWeights_1, price.Error))
)

# Table 3: mean absolute error and mean APE per regression on the test set.
# The two means are computed directly per Regression group. This replaces the
# earlier gather -> filter -> summarize -> spread round trip (superseded tidyr
# verbs) and yields the same one-row-per-regression table with columns
# Regression, price.AbsError, price.APE.
st_drop_geometry(comparison) %>%
  group_by(Regression) %>%
  summarize(price.AbsError = mean(price.AbsError, na.rm = TRUE),
            price.APE = mean(price.APE, na.rm = TRUE)) %>%
  kable() %>%
  kable_styling("striped", full_width = F) %>%
  # Row 1 (first Regression alphabetically) in green, row 2 in orange.
  row_spec(1, color = "black", background = "#25CB10") %>%
  row_spec(2, color = "black", background = "#FA7800") %>%
  footnote(general_title = "\n",
           general = "Table 3. Table of mean absolute error and MAPE for a single test set")
Regression price.AbsError price.APE
Baseline Regression 175798.4 0.2662743
Neighbourhood effects 130056.4 0.1894812

Table 3. Table of mean absolute error and MAPE for a single test set

4.3 Cross validation

A 100-fold cross-validation is then conducted to evaluate the model on our training dataset. The mean MAE is 149194.7 and its standard deviation is 23373.69. The root mean square error (RMSE) is 280149.2 and the R-squared is 0.747521, which suggests that about 74.75% of the variance can be explained in the validation folds. According to the histograms below, MAE, RMSE and R-squared are not normally distributed across folds. Further analysis should be conducted to test the generalizability of the model.

# 100-fold cross-validation of the neighbourhood-effects linear model on the
# training set, using the same predictors as reg.nhood.training.
fitControl <- trainControl(method = "cv", number = 100)
set.seed(825)  # seed set immediately before train() so the run is repeatable

reg_neighbourhood.cv <- train(
  price ~ .,
  data = dplyr::select(
    st_drop_geometry(Boulder.training),
    price,
    age,
    GEOID,
    designCodeDscr,
    qualityCodeDscr,
    TotalFinishedSF,
    nbrBedRoom.cat,
    HeatingDscr,
    Roof_CoverDscr,
    urban_status,
    park,
    school,
    restaurant_nn1,
    bus_stop_nn1,
    company,
    water_nn1,
    lagPrice
  ),
  method = "lm",
  trControl = fitControl,
  na.action = na.pass
)

# Print the resampling summary (RMSE, Rsquared, MAE).
reg_neighbourhood.cv
## Linear Regression 
## 
## 9119 samples
##   16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (100 fold) 
## Summary of sample sizes: 9029, 9028, 9027, 9028, 9028, 9027, ... 
## Resampling results:
## 
##   RMSE      Rsquared  MAE     
##   280149.2  0.747521  149194.7
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Histogram of the per-fold metrics, one free-scaled facet per metric.
reg_neighbourhood.cv$resample %>%
  pivot_longer(-Resample) %>%
  mutate(name = as.factor(name)) %>%
  ggplot(aes(x = value, color = name)) +
  geom_histogram(bins = 30, colour = "black", fill = "#FDE725FF") +
  facet_wrap(~name, ncol = 3, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")

# Mean and standard deviation of the MAE across folds. MAE is the third
# column of the resample table; it is addressed by name here.
MAE_mean <- mean(reg_neighbourhood.cv$resample[["MAE"]])
MAE_SD <- sd(reg_neighbourhood.cv$resample[["MAE"]])

compare <- data.frame(MAE_mean = MAE_mean, MAE_SD = MAE_SD)
compare
##   MAE_mean   MAE_SD
## 1 149194.7 23373.69

4.4 Predicted sale price as function of observed price

Figure 6.1 plots the predicted sale price as a function of the observed price. Comparing the fitted prediction line (green) with the perfect-prediction line (orange) shows that our predicted prices are close to the observed prices for the majority of homes. However, as prices increase, the errors spread out more. Comparing the baseline regression and the neighbourhood-effects model in Figure 6.2, we notice that the baseline regression is already close to the perfect prediction, but the neighbourhood-effects model fits slightly better.

# Trim extreme observations for plotting: observed and predicted prices below
# $5.5M, and strictly positive predictions.
housing.test.nhood1 <- housing.test.nhood %>%
  filter(price < 5500000,
         price.Predict < 5500000,
         price.Predict > 0)

# Figure 6.1: predicted vs observed price. Both smoothers inherit the plot's
# data and x = price; the orange one overrides y with price itself, which
# draws the 45-degree "perfect prediction" line.
ggplot(housing.test.nhood1, aes(x = price, y = price.Predict)) +
  geom_point() +
  stat_smooth(aes(y = price),
              method = "lm", se = FALSE, size = 1, colour = "#FA7800") +
  stat_smooth(method = "lm", se = FALSE, size = 1, colour = "#25CB10") +
  facet_wrap(~Regression) +
  labs(title = "Figure 6.1 Predicted sale price as a function of observed price",
       subtitle = "Orange line represents a perfect prediction;
 Green line represents prediction") +
  plotTheme()

# Figure 6.2: predicted vs observed price for both regressions, one facet each.
# x is the observed price and y the predicted price in every layer.
comparison %>%
  dplyr::select(price.Predict, price, Regression) %>%
  ggplot(aes(price, price.Predict)) +
  geom_point() +
  # Orange: y = x, i.e. a perfect prediction.
  stat_smooth(aes(price, price),
              method = "lm", se = FALSE, size = 1, colour = "#FA7800") +
  # Green: linear fit of predicted on observed price. The mapping previously
  # had the two variables swapped (aes(price.Predict, price)), which drew the
  # inverse regression against an x-axis of observed price and disagreed with
  # Figure 6.1.
  stat_smooth(aes(price, price.Predict),
              method = "lm", se = FALSE, size = 1, colour = "#25CB10") +
  facet_wrap(~Regression) +
  labs(title="Figure 6.2 Predicted sale price as a function of observed price in comparison",
       subtitle="Orange line represents a perfect prediction;
Green line represents prediction") +
  plotTheme()

4.5 Residuals of test sets

Figure 7.1 shows the distribution of residuals for the test set. We observe some clusters in the east, the southeastern area and the city of Boulder, suggesting that positive spatial autocorrelation exists. Figure 7.2 indicates that the observed Moran’s I (represented by the orange line) is significantly higher than all 999 randomly permuted values of I, with a positive value of about 0.25, which again confirms that positive spatial autocorrelation exists in these residuals. According to Figure 7.3, prices generally increase as the spatial lag of errors increases. More house prices, as well as outliers, appear to lie above the best-fit line, which suggests a clustering pattern of house prices.

Map of residuals

# Figure 7.1: test-set sales drawn over the county polygon and coloured by the
# q5() class of their absolute error (q5 is a sourced helper; presumably
# quintile classes). Legend labels come from qBr(), which returns rounded
# quantile breaks at the 1/20/40/60/80 percent points.
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(
    data = housing.test.nhood,
    mapping = aes(colour = q5(price.AbsError)),
    show.legend = "point",
    size = 1
  ) +
  scale_colour_manual(
    values = palette5,
    labels = qBr(housing.test.nhood, "price.AbsError"),
    name = "Quintile\nBreaks"
  ) +
  labs(title = "Figure 7.1. Map of Residuals for the test set") +
  mapTheme()

Moran’s I

# Permutation test of Moran's I on the absolute errors of the test set, using
# the 5-nearest-neighbour weights built for housing.test.nhood.
moranTest <- moran.mc(housing.test.nhood$price.AbsError,
                      spatialWeights_1,
                      nsim = 999)

# Figure 7.2: histogram of the first 999 entries of moranTest$res (the
# permuted statistics; presumably the remaining entry is the observed value,
# confirm against spdep), with the observed Moran's I as an orange line.
data.frame(permutedI = moranTest$res[1:999]) %>%
  ggplot(aes(x = permutedI)) +
  geom_histogram(binwidth = 0.01) +
  geom_vline(aes(xintercept = moranTest$statistic),
             colour = "#FA7800", size = 1) +
  scale_x_continuous(limits = c(-1, 1)) +
  labs(title = "Figure 7.2. Observed and permuted Moran's I",
       subtitle = "Observed Moran's I in orange",
       x = "Moran's I",
       y = "Count") +
  plotTheme()

plot of the spatial lag in errors

# Figure 7.3: observed price against the spatial lag of the absolute error
# (lagPriceError), with a linear best-fit line.
# NOTE(review): the y-axis is the observed price, not the error itself;
# confirm this is the intended relationship for a spatial-lag-of-errors plot.
housing.test.nhood %>%
  ggplot(aes(x = lagPriceError, y = price)) +
  geom_point(colour = "#FA7800") +
  geom_smooth(method = "lm", se = FALSE, colour = "#25CB10") +
  labs(
    title = "Figure 7.3. plot of the spatial lag in errors",
    x = "Spatial lag of errors",
    y = "price"
  ) +
  plotTheme()

4.6 Mapping predicted values

With the regression model, we predicted sale prices for the records where toPredict equals 0 in the original housing dataset. We then plotted our results for all sale prices, where toPredict is either 0 or 1, as Figure 8 shows. According to our predictions, the most expensive houses are concentrated in Boulder city and the southeast corner of Boulder county, while the cheapest houses are clustered close to the northeast corner of Boulder city.

# Apply the neighbourhood-effects model to the full `housing` data set and
# derive the same error measures as for the test set.
# NOTE(review): price.APE divides by the predicted price, and the final
# filter on `price` also drops rows whose price is NA; confirm both are
# intended for the records that are to be predicted.
housing.test.all <- housing %>%
  mutate(
    Regression = "Neighbourhood effects",
    price.Predict = predict(reg.nhood.training, housing),
    price.Error = price.Predict - price,
    price.AbsError = abs(price.Error),
    price.APE = price.AbsError / price.Predict
  ) %>%
  filter(price < 8000000)

# Figure 8: predicted prices mapped by q5() class over the county polygon.
ggplot() +
  geom_sf(data = Boulder.county.reproject, fill = "grey40") +
  geom_sf(
    data = housing.test.all,
    mapping = aes(colour = q5(price.Predict)),
    show.legend = "point",
    size = 1
  ) +
  scale_colour_manual(
    values = palette5,
    labels = qBr(housing.test.all, "price.Predict"),
    name = "Quintile\nBreaks"
  ) +
  labs(title = "Figure 8. map of predicted values for where toPredict is both 0 and 1") +
  mapTheme()

4.7 MAPE by neighborhood

Using the test set predictions, the mean absolute percentage error (MAPE) by neighborhood is mapped in Figure 9. Most neighbourhoods have a low MAPE, below 0.4. Notably, one neighbourhood in central Boulder county has a significantly higher MAPE, exceeding 100%, probably because this area has few houses, which makes the estimate more easily distorted.

# Figure 9: mean APE per GEOID on the test set, joined back to the
# `neighborhood` polygons (natural join on the shared columns) and mapped,
# with the individual test-set sales overlaid as black points.
housing.test.nhood %>%
  st_drop_geometry() %>%
  group_by(GEOID) %>%
  summarise(MAPE = mean(price.APE, na.rm = TRUE)) %>%
  ungroup() %>%
  left_join(neighborhood) %>%
  st_sf() %>%
  ggplot() +
  geom_sf(aes(fill = MAPE)) +
  geom_sf(data = housing.test.nhood, colour = "black", size = .5) +
  scale_fill_gradient(
    low = palette5[1],
    high = palette5[5],
    name = "MAPE"
  ) +
  labs(title = "Figure 9. map of mean absolute percentage error(MAPE) by neighborhood") +
  mapTheme()

Figure 10 plots the MAPE by neighborhood as a function of the mean price by neighborhood. Except for a few outliers, the MAPE is relatively stable and below 0.4, which suggests good generalizability of our model.

# Per-GEOID summary of the test set: mean observed price, mean predicted
# price, mean absolute error and mean APE.
nhood.summary <- housing.test.nhood %>%
  group_by(GEOID) %>%
  summarize(
    meanPrice = mean(price, na.rm = TRUE),
    meanPrediction = mean(price.Predict, na.rm = TRUE),
    meanMAE = mean(price.AbsError, na.rm = TRUE),
    MAPE = mean(price.APE, na.rm = TRUE)
  )

# Print the summary as a table, worst mean absolute error first.
nhood.summary %>%
  st_drop_geometry() %>%
  arrange(desc(meanMAE)) %>%
  knitr::kable() %>%
  kable_styling()
GEOID meanPrice meanPrediction meanMAE MAPE
08013012707 2120500.0 1166850.7 1273518.75 1.1238463
08013013202 2226250.0 1776044.6 918029.34 0.4859673
08013012204 798350.0 1621731.4 823381.40 0.5106646
08013012101 2290516.7 2067379.8 569175.45 0.2693979
08013012103 1404588.2 1337481.9 440614.70 0.2642201
08013012102 1406130.0 1278106.9 398836.43 0.2870487
08013012104 1014168.7 1301185.4 394024.46 0.3140760
08013012505 1398710.5 1359648.7 304612.49 0.2173348
08013012701 1224512.9 1116642.0 295421.07 0.2183454
08013012201 2014471.4 1876777.7 283350.50 0.1423570
08013012510 1260266.7 1180778.0 276279.93 0.2093141
08013012401 1132791.7 1280710.3 274719.39 0.2138087
08013012710 1079828.6 1228024.5 251959.46 0.1995180
08013013205 1162228.5 1255044.4 235572.86 0.1791924
08013012105 1382153.8 1330401.1 222853.53 0.1573124
08013012903 770633.3 853343.6 201922.28 0.1961022
08013013701 911678.8 873731.8 201628.80 0.2328420
08013013201 739500.0 872439.4 175776.96 0.2174667
08013013503 467741.7 338356.2 173040.03 1.0628816
08013012511 1139250.0 970878.5 168994.98 0.1695412
08013013602 350129.4 373474.0 151727.96 0.4345501
08013012202 1070925.0 1003688.6 134416.38 0.1345300
08013013402 506371.4 467595.4 130221.07 0.3728257
08013061400 721897.9 741355.8 129741.97 0.1825475
08013012800 663073.8 683950.5 127966.92 0.1950462
08013013213 704024.5 695430.2 121345.35 0.1947325
08013013003 748582.1 820966.9 121063.08 0.1287529
08013012708 728356.9 784143.6 119787.89 0.1424354
08013060700 805805.6 826428.3 116788.44 0.1280086
08013012603 996720.0 963564.5 115047.43 0.1122261
08013060600 623158.3 685646.4 110131.48 0.1553547
08013012509 884500.0 914385.7 110110.16 0.1193184
08013013702 578085.9 588847.6 106741.35 0.2055834
08013013005 862844.4 830690.1 104235.80 0.1480960
08013060900 665766.7 659528.2 103482.74 0.1833571
08013060800 533064.3 519844.9 93364.76 0.1825616
08013013601 665925.0 696686.9 88760.92 0.1414480
08013013004 752631.2 756682.4 86471.70 0.1116169
08013013006 723474.4 742955.9 84629.70 0.1091599
08013013302 487274.4 464067.2 84497.17 0.1930581
08013013211 560723.9 532946.5 83426.29 0.1933349
08013012608 651090.9 619179.0 82783.69 0.1293464
08013013508 492277.0 508276.7 77933.09 0.2010885
08013012203 552178.9 552744.1 77187.08 0.1454108
08013061300 760837.0 716928.6 77107.80 0.1214151
08013013208 479793.5 471337.1 76043.49 0.1742847
08013013401 361942.9 425785.0 74706.87 0.1682240
08013012905 483333.3 484521.0 73558.65 0.1673199
08013012907 483635.6 530540.3 73424.42 0.1354508
08013012705 757411.1 745427.4 73238.90 0.1011871
08013012507 671218.2 685611.4 72625.75 0.1048037
08013012501 804266.7 794134.6 72184.49 0.0923170
08013012508 735550.0 694082.5 72159.15 0.1068274
08013013308 389150.0 425392.5 72043.64 0.1527908
08013012904 596509.5 585442.7 71110.95 0.1317329
08013013207 534327.3 539925.1 70813.58 0.1479728
08013012607 617433.3 640693.1 70589.73 0.1103053
08013013506 476996.4 454730.9 61513.88 0.1681004
08013012709 663875.0 640852.1 59984.50 0.0863425
08013013305 368845.6 380597.2 44313.41 0.1177579
08013013307 389977.8 384569.1 42429.32 0.1101306
08013013210 385119.0 389124.1 41046.18 0.1020875
08013013212 460527.8 474533.4 39894.48 0.0844196
08013013505 354450.0 345362.8 34131.12 0.1010426
08013013507 390552.0 394478.7 32998.27 0.0790556
08013013306 375133.3 367951.7 32391.78 0.0904617
# Mean APE per GEOID. The plot below reads nhood.summary rather than this
# object.
map_MAPE <- housing.test.nhood %>%
  group_by(GEOID) %>%
  summarise(MAPE = mean(price.APE, na.rm = TRUE))

# Figure 10: neighbourhood MAPE against neighbourhood mean price. The y-axis
# is limited to [0, 1], so neighbourhoods with MAPE above 1 are not drawn.
plot(
  nhood.summary$meanPrice,
  nhood.summary$MAPE,
  main = "Figure 10. MAPE by neighborhood as a function of mean price by neighbourhood",
  cex.main = 0.75,
  ylim = c(0, 1)
)

4.8 Test generalizability under income context

To further test our model’s generalizability, we split the neighbourhoods into high- and low-income groups. We collected income data from the 2015–2019 ACS using tidycensus and calculated the median income of Boulder county (40453). Figure 11 shows the income context of each neighbourhood, where most neighbourhoods are high income. MAPE is calculated for the baseline regression and the neighbourhood-effects model under both income contexts in Table 4, where we observe a lower MAPE for the model with neighbourhood effects. Therefore, neighbourhood effects make for a more generalizable prediction model.

# Download 2019 ACS tract data for Boulder County, CO, with geometries,
# reproject to EPSG:26913 and label each tract by income context.
# B06011_001 (returned as B06011_001E in wide output) is renamed to
# Median_Income; the other two variables are requested but not used in the
# code shown here. Tracts above 40453 are "High Income", the rest
# "Low income"; tracts with a missing income stay NA.
tracts19 <- get_acs(
  geography = "tract",
  variables = c("B01001_001E", "B01001A_001E", "B06011_001"),
  year = 2019,
  geometry = TRUE,
  state = "CO",
  county = "Boulder",
  output = "wide"
) %>%
  st_transform("EPSG:26913") %>%
  rename(Median_Income = B06011_001E) %>%
  mutate(
    incomeContext = if_else(Median_Income > 40453, "High Income", "Low income")
  )

# Figure 11: tracts with complete data, filled by income context
# (first level green, second level orange).
tracts19 %>%
  na.omit() %>%
  ggplot() +
  geom_sf(aes(fill = incomeContext)) +
  scale_fill_manual(
    values = c("#25CB10", "#FA7800"),
    name = "Income Context"
  ) +
  labs(title = "Figure 11. Test of generalizability under income context") +
  mapTheme() +
  theme(legend.position = "bottom")

# Table 4: mean APE per regression and income context.
# Each test-set point is spatially joined to its census tract to pick up
# incomeContext. Geometry is dropped *before* grouping: summarising a grouped
# sf object would union the point geometries of every group only for them to
# be discarded afterwards. pivot_wider replaces the superseded spread().
st_join(comparison, tracts19) %>%
  st_drop_geometry() %>%
  group_by(Regression, incomeContext) %>%
  summarize(mean.MAPE = scales::percent(mean(price.APE, na.rm = TRUE))) %>%
  pivot_wider(names_from = incomeContext, values_from = mean.MAPE) %>%
  kable(caption = "Table 4. Table set MAPE by neighbourhood income context") %>%
  kable_styling("striped", full_width = F) %>%
  row_spec(1, color = "black", background = "#25CB10") %>%
  row_spec(2, color = "black", background = "#FA7800")
Table 4. Table set MAPE by neighbourhood income context
Regression High Income Low income
Baseline Regression 26% 27%
Neighbourhood effects 18% 21%

5 Discussion

Our model is an effective model, with the p-value of the F-statistic smaller than 0.001 (p-value < 2.2e-16). Our model explains around 72% of the variation in prices.
Some of the more interesting variables include distance to the nearest school, distance to the nearest bus station, and distance to the nearest company, which are statistically significant in the baseline model but lose significance once we introduce the spatial process variables, including the neighborhood variable and the urban status.
Important features for home price prediction include age, neighborhood, urban status, design style, quality, built-up area, number of bedrooms, heating system, roof material, distance to park, distance to restaurant, and the mean price of the nearest 5 houses. Specifically, the neighborhood, urban status, and lag price variables represent the spatial process of price clustering, in which houses in neighborhoods with higher home prices also tend to have higher prices. The design style and roof material variables are spatially correlated with the clustering of homes and correspond with the urban/non-urban division of the spatial context; specific design styles are concentrated in urban neighborhoods that tend to have higher home prices. While the distances to amenities such as parks and restaurants stay significant after we introduce the spatial process variables, their significance decreases because they too are clustered in relation to the neighborhood effects. Thus, the correlations among variables are complicated in a spatial sense, reflecting a clustered distribution that is crucial for home price. The model has a mean absolute error of 149194.7 and a mean absolute percentage error of 19%. While our model has a slightly higher mean absolute percentage error for neighborhoods with low income (21% versus 18% in Table 4), the relatively small gap between the MAPE for high-income and low-income neighborhoods suggests that the model can account for the spatial variation in prices. The residuals, although largely randomly distributed, tend to be higher in Boulder city; however, the residuals tend to be lower in Longmont, also an urban region. Thus, the error may be caused by spatial characteristics specific to Boulder city that our model fails to capture. For instance, the large young population of University of Colorado students in Boulder city can skew home prices.
In more remote regions where few observations exist, the residuals are mixed, indicating that the model might require more observations from the rural context to reduce the residuals.

6 Conclusion

We would recommend our model to Zillow as a way to include more local intelligence in their home price prediction model. Our model has a decent degree of accuracy and generalizability in predicting home prices across neighborhoods and in accounting for the spatial variation of prices. We could improve the model further by using more training data, as the training data for Boulder county is rather limited and we lack some key data, such as crime data, to train our model better. Also, the socioeconomic data accessible to us, including income, racial composition, and educational attainment, is census data that is highly collinear with the neighborhood variable, for which we use census tracts as an approximation; thus, we are unable to include socioeconomic data that is likely to be influential in our model. More feature engineering, such as using the log rather than the raw data, might also reshape the distribution of the independent variables and build a better-fitting model. We believe that Zillow can use the rich resources and data that they already possess to train and improve our model with more available variables to get even better prediction results.