Case study: urban segregation with scraped AirBnB data (from inside AirBnb or scrape it yourself)

After loading packages…

… define the fixed parameters you will need on this tutorial. For example, we choose to work on three Canadian cities with census data, so we set here the id they have in the census, the coordinates of the city halls, the variable names/codes of visible minorities as well as some elements of size etc

#Code for CSD in census
ct <-  "35535"
cm  <-  "24462"
cv  <-  "59933"

#Code for central municipality in census
cmt <-  "3520005"
cmv <- "5915022"
cmm <- "2466023"

#Coordinates of city hall
cht  <-  c(-79.5257363, 43.6148863)
chv  <-  c(-123.2309412, 49.2219987)
chm  <-  c(-73.7171664, 45.4726094)

#Critical distance for AirBnb
bdtv <- 10000
bdm <-  12000

#Files from inside airbnb
f  <-  list.files('insideairbnb/')

## Identifying the vectors for visible Minority status
parent_vector <- "v_CA16_3954"
minorities <- list_census_vectors("CA16") %>% 
  filter(vector == "v_CA16_3954") %>% 
  child_census_vectors(leaves_only = TRUE) %>% 
  pull(vector)

minority_vectors <- c(parent_vector, minorities)
lab  <-  as.data.frame (list_census_vectors('CA16') %>% 
                       filter(vector %in% minorities) %>% 
                       select(vector, label))

We go on to the loading of pre-downloaded files from the website inside airbnb, and pull them together in a single table.

montrealAirbnb <- read.csv(paste0('insideairbnb/', f[1]))
torontoAirbnb <- read.csv(paste0('insideairbnb/', f[2]))
vancouverAirbnb <- read.csv(paste0('insideairbnb/', f[3]))

df <-  rbind(montrealAirbnb, torontoAirbnb, vancouverAirbnb)
#summary(df)

In order to use airbnb data for residential segregation studies, we need to remove the listings specific to tourism and not inhabited all year round.

l <- levels(df$property_type)
lookup  <-  data.frame('type' = 1:length(l))
lookup$type <- as.factor(l)
lookup$property_group <- c(
  # [1] "Aparthotel"             "Apartment"              "Bed and breakfast"      "Boat"                   "Boutique hotel"         "Bungalow"               "Cabin"                 
  'hotel', 'home', 'hotel', 'other', 'hotel', 'home', 'other',
  #  [8] "Camper/RV"              "Campsite"               "Casa particular (Cuba)" "Cave"                   "Chalet"                 "Condominium"            "Cottage"               
  'other', 'other', 'home', 'other', 'home', 'home', 'home',
  # [15] "Farm stay"              "Guest suite"            "Guesthouse"             "Hostel"                 "Hotel"                  "House"                  "Houseboat"             
  'home', 'home', 'hotel', 'hotel', 'hotel', 'home', 'home',
  # [22] "Hut"                    "Loft"                   "Nature lodge"           "Other"                  "Serviced apartment"     "Tent"                   "Timeshare"             
  'other', 'home', 'other', 'other', 'hotel', 'other', 'other',
  # [29] "Tiny house"             "Townhouse"              "Villa"                  "Barn"                   "Castle"                 "Dorm"                   "Earth house"           
  'home', 'home', 'home', 'home', 'home', 'hotel', 'other',
  # [36] "In-law"                 "Parking Space"          "Treehouse"              "Resort"            
  'home', 'other', 'other', 'hotel'
)

dfb = data.frame(df,lookup[match(df$property_type, lookup$type),] )
dfh = subset(dfb, property_group == 'home' & as.character(dfb$host_neighbourhood) == as.character(dfb$neighbourhood))
dim(dfh)[1] - dim(df)[1]
## [1] -11639
dfh$property_group <- NULL
dfhu = dfh[!duplicated(dfh$host_id),]
dim(dfhu)[1] - dim(dfh)[1]
## [1] -7168

IN case some moved out, we only keep recent listings:

dfhu$year  <-  as.numeric(substr(dfhu$last_review, 1, 4))
dfhun = subset(dfhu, year >= 2017)
dim(dfhun)[1] - dim(dfhu)[1]
## [1] -7499

And try to estimate the price per room, given that the number of square meter per listing is too sparsely known.

dfhun$numPrice <- as.numeric(gsub("[$]",'',dfhun$price))
## Warning: NAs introduits lors de la conversion automatique
final  <-  subset(dfhun, room_type != "Shared room")
final$rooms  <-  ifelse(final$bedrooms == 0, 1, final$bedrooms)
final$priceperroom  <-  as.numeric(ifelse(final$room_type == "Private room",  final$numPrice,  final$numPrice / final$rooms))

dim(final)
## [1] 16755   101

This coumd have been done properly with dplyr, in one command, but it does not really matter (to me!) how you get there as long as you get there with the same result.

final <- left_join(df, lookup, by = c("property_type" = "type")) %>%
  filter(property_group == 'home' &
           as.character(host_neighbourhood) == as.character(neighbourhood) ) %>% #&
  filter(!duplicated(host_id)) %>%
  mutate(year = as.numeric(substr(last_review, 1, 4))) %>%
  filter(year >= 2017) %>%
  mutate(numPrice = as.numeric(gsub("[$]",'', price))) %>%
  filter(room_type != "Shared room") %>%
  mutate(rooms = ifelse(bedrooms == 0, 1, bedrooms),
         priceperroom = as.numeric(ifelse(room_type == "Private room", numPrice,numPrice / rooms)))

Retrieving the census data

(This chunk of code comes from @dshkol - Thanks!) https://github.com/dshkol/scratchpad/blob/master/content/post/2018-05-10-diversity-and-segregation-i.Rmd

Diversity

We define a measure of minority diversity (entropy) at the aggregate level of geographical units and define a segregation index as the deviation from the city-wide diversity value.

Let’s apply these fonctions to get the diversity index for all CSD for all Canadian CMAS. (This chunk of code comes from @dshkol - Thanks!)

Segregation

(This chunk of code comes from @dshkol - Thanks!)

We define a measure of minority segregation index as the deviation from the city-wide diversity value.

And apply it to Canadial metropolises. (This chunk of code comes from @dshkol - Thanks!)

Economic segregation

Now let’s define a measure of segration which accounts for ordinal variables such as income, wealth or airbnb prices, with Reardon’s ordinal measure of segregation (2009).

Comparing cities

(This chunk of code comes from @dshkol - Thanks!)

ggplot(bind_rows(cma_seg %>% 
                   filter(Population > 100000) %>% 
                   clean_names2 %>% 
                   top_n(10, -H), 
                 cma_seg %>% 
                   filter(Population > 100000) %>% 
                   clean_names2 %>% 
                   top_n(10, H)), 
       aes(y = H, x = reorder(`Region Name`, H), size = Population)) + 
  #geom_bar(stat = "identity") +
  geom_point(colour = "#7e008c") + 
  coord_flip() + 
  theme_ipsum() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted"),
        legend.position = "none") +
  labs(y = "Segregation entropy index", x = "", 
       #title = "The most and the least segregated large cities in Canada",
       caption = "Segregation index of visible minorities\n@dshkol | Data: Statistics Canada, Census 2016")

(This chunk of code comes from @dshkol - Thanks!)

ggplot(cma_seg%>% filter(Population > 100000) %>% clean_names2 %>% 
         mutate(big_cma = ifelse(`CMA Population` > 1000000, `CMA Name`,"Other")),
       aes(y= H, x = Ei, size = Population^1.5, colour = big_cma)) +
  #geom_label_repel(aes(label = `Region Name`)) +
  geom_text_repel(aes(label = `Region Name`)) +
  scale_size_continuous(guide = FALSE) + 
  scale_colour_ipsum("", guide = FALSE) +
  theme_ipsum() +
  theme(panel.grid = element_blank(),
        axis.text = element_blank()) + 
  labs(x = "More diverse \u2192", y = "More segregated \u2192",
       caption = "Entropy index based calculations of diversity and segregation\nof visible minority groups in cities with population over 100,000\n@dshkol | Data: Statistics Canada, Census 2016")

Start city specific analysis with Airbnb Data

Now let’s select a city to work with more specifically, for example “toronto”

city <-  "toronto"

if(city == "toronto"){
  citynames  <-  unique(torontoAirbnb$city)
  censuscode  <-  ct
  cityhall  <-  cht
  censusCodeMuni  <- cmt 
  breakDist <-  bdtv
}
if(city == "vancouver"){
  citynames  <-  unique(vancouverAirbnb$city)
  censuscode  <-  cv
  cityhall  <-  chv
  censusCodeMuni  <-  cmv 
  breakDist  <-  bdtv
}
if(city == "montreal"){
  citynames  <-  unique(montrealAirbnb$city)
  censuscode  <-  cm
  cityhall  <-  chm
  censusCodeMuni <-  cmm
  breakDist  <-  bdm
}

city_data <-  subset(final, city %in% citynames)

Let’s retrieve the census minority data at the census tract and community area level: (This chunk of code comes from @dshkol - Thanks!)

Diversity

And plot resulting diversity map (we could also add airbnb listings when uncommenting the markers)

tractTable <- diversity_index(censuscode)


pal <- colorQuantile(
  palette =  'Blues',
  domain = city_data$priceperroom,
  n = 10
)
pal2 <- colorQuantile(
  palette =  'Reds',
  domain = tractTable$Ei,
  n = 10)

map <- leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    data = csd.csd.geo,
    color = 'black',
    fill = F,
    weight = 0.7,
    opacity = 0.9
  ) %>% addPolygons(
    data = tractTable,
    color =  ~ pal2(Ei),
    fill = ~ pal2(Ei),
    weight = 0.4
  ) %>%
  #addCircleMarkers(
  #   data = city_data,
  #       radius = ~ sqrt(4 * rooms),
  #   lat = ~ latitude,
  #   fillColor = ~ pal(priceperroom),
  #   color = 'black',
  #   stroke = T,
  #   fillOpacity = 0.5,
  #  weight = 0.1,
  #   layerId = ~ id,
  #   lng = ~ longitude
# ) %>% 
# addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)%>% 
addLegend(pal = pal2, position = 'topleft', values = tractTable$Ei)

map

Minority concentration

(This chunk of code comes from @dshkol - Thanks!)

The next plot shows the spatial concentration of minorities in the central municipality

long.ct.tidy <- long.ct %>% 
  select(-White) %>% 
  tidyr::gather(Group, Count, `South Asian`:Multiple) %>% 
  mutate(Proportion = Count/Total)

ggplot(long.ct.tidy) + geom_sf(aes(fill = Proportion^(1/2), colour = Proportion^(1/2))) + 
  scale_fill_viridis_c(option = 3, guide = FALSE) + 
  scale_colour_viridis_c(option = 3, guide = FALSE) + 
  theme_void() + 
  coord_sf(datum = NA) +
  facet_wrap(~Group, ncol = 4)  +
  labs(caption = "Visible minority groups by square-root proportion of Census Tract population\n@dshkol | Data: Statistics Canada, Census 2016")

While this map shows the distribution of airbnb listings.

pal <- colorQuantile(
  palette =  'Blues',
  domain = city_data$priceperroom,
  n = 10
)

map <- leaflet() %>% addProviderTiles("CartoDB.Positron") %>%
  addPolygons(
    data = csd.csd.geo,
    color = 'black',
    fill = F,
    weight = 0.7,
    opacity = 0.9
  ) %>% addPolygons(
    data = csd.geo,
    color = 'grey',
    fill = F,
    weight = 0.4
  ) %>%
  addCircleMarkers(
    data = city_data,
    radius = ~ sqrt(4 * rooms),
    lat = ~ latitude,
    fillColor = ~ pal(priceperroom),
    color = 'black',
    stroke = T,
    fillOpacity = 0.5,
    weight = 0.1,
    layerId = ~ id,
    lng = ~ longitude
  ) %>% 
  addLegend(pal = pal, position = 'topleft', values = city_data$priceperroom)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
map

Interaction of big and small data

If we want to make the two sets of data communicate, we need to do an intersection between the two, by either aggregating scatter data into areal polygons, either by assigning polygon information to points.

#############aggregate airbnb into city tracts.
city_tracts  <-  as_Spatial(csd.geo[csd.geo$CSD_UID == censusCodeMuni,])
rbnb_pts <- city_data 
coordinates(rbnb_pts) <- ~longitude+latitude
rbnb_pts@proj4string <-CRS("+proj=longlat +datum=WGS84")
city_tracts@proj4string <-CRS("+proj=longlat +datum=WGS84")
PTS <- as(rbnb_pts, "sf")
POLY <- as(city_tracts, "sf")
idata <- st_intersection(PTS, POLY)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant
## throughout all geometries

Then we count the number of airbnb listings in tract and their price distribution

idata$distribPrice <- cut2(idata$priceperroom, g = 10)
intervals <- levels(unique(idata$distribPrice))

rbnbPerTract <- as.data.frame(idata %>%
                                #group_by(CSD_UID) %>%
                                count(GeoUID, sort = TRUE) %>%
                                select(GeoUID, n))[,1:2]
rbnbDistributionPerTract <- as.data.frame(idata %>%
                                            #group_by(CSD_UID) %>%
                                            count(GeoUID, distribPrice) %>%
                                            select(GeoUID, distribPrice, n))[,1:3]

tractDistrib <- dcast(rbnbDistributionPerTract, GeoUID ~ distribPrice)
## Using n as value column: use value.var to override.
tractDistrib[is.na(tractDistrib)] <- 0
tractDistrib <- tractDistrib[,c("GeoUID", intervals)]
colnames(tractDistrib)[2:11] <- paste0("G", 1:10)

lookupDistrib <-  data.frame('name' = paste0("G", 1:10), 'interval' = intervals)
listingCounts <-  as.data.frame(rbnbPerTract[,1:2])
city_tracts@data <- data.frame(city_tracts@data,
                                  listingCounts[match(city_tracts@data$GeoUID,
                                                      listingCounts$GeoUID),])
city_tracts@data <- data.frame(city_tracts@data,
                                  tractDistrib[match(city_tracts@data$GeoUID,
                                                     tractDistrib$GeoUID),])
city_tracts@data <- data.frame(city_tracts@data,
                                  tractTable[match(city_tracts@data$GeoUID,
                                                   tractTable$GeoUID),])

We also include minority share in the spatial table

Mino <-  as.data.frame(cma.ct[,c("GeoUID", minorities)])

city_tracts@data <- data.frame(city_tracts@data,
                                  Mino[match(city_tracts@data$GeoUID,
                                             Mino$GeoUID),])

city_tracts@data$ListingPerCapita <-  city_tracts@data$n / city_tracts@data$Population
city_tracts@data <- city_tracts@data %>% 
  mutate_at(minorities, funs(Share =./ Population) )

And the distance of each tract centroid to the city hall:

centroidsCity <-  gCentroid(city_tracts,byid=TRUE)
listMinoShares  <-  paste0(minorities, "_Share")
EstimatorAirbnbPresence <-  cbind(city_tracts@data[,c("GeoUID", "ListingPerCapita", "Ei", 
                                                       listMinoShares)],coordinates(centroidsCity))
DistCityHall <- distm(EstimatorAirbnbPresence[,c('x','y')], 
                      cityhall, fun=distVincentyEllipsoid)
EstimatorAirbnbPresence <-  cbind(EstimatorAirbnbPresence, DistCityHall)
hist(EstimatorAirbnbPresence$DistCityHall)

There is a non-linear relationship between the density of listing per tract and the distance to city hall.

ggplot(EstimatorAirbnbPresence, aes(x = DistCityHall, y = ListingPerCapita)) +
  geom_point() + geom_smooth() + scale_x_log10() + scale_y_log10() +
  geom_vline(xintercept = breakDist, col = "orange", cex = 1)

So we use two subsets of data to do further regressions: central tracts located less than 10km away from the city and peripheral tracts located further.

# mean value of listing per capita of tracts between 0 & 10 km: 
central <- EstimatorAirbnbPresence %>%
  filter(DistCityHall <= breakDist, !is.na(ListingPerCapita)) %>%
  select(ListingPerCapita, DistCityHall, Ei, listMinoShares)
dim(central)
## [1] 129  16
# regression listing per capita of tracts above 10 km: 
peripheral <- EstimatorAirbnbPresence %>%
  filter(DistCityHall > breakDist, !is.na(ListingPerCapita)) %>%
  select(ListingPerCapita, DistCityHall, Ei, listMinoShares)
dim(peripheral)
## [1] 388  16

Let’s regress the scaled log of airbnb listings with scaled variables such as the log of distance to city hall and proportions of visible minorities for our three data samples:

for (sample in c("all", "central", "peripheral")){
  if (sample == "all") df <-  EstimatorAirbnbPresence
  if (sample == "central") df <-  central
  if (sample == "peripheral") df  <-  peripheral
  
  f <- paste0("scale(log(ListingPerCapita)) ~ scale(log(DistCityHall)) + ",
             paste0("scale(",listMinoShares[-1], ")", collapse = " + "))
  model <- lm(formula(f), data = df)
  print(paste0("obs = ", dim(df)[1]))
  print(summary(model))
  
  coeffs <-  as.data.frame(summary(model)$coefficients)
  coeffs$label <- substr(rownames(coeffs), 7, 17)
  res  <-  data.frame(coeffs, lab[match(coeffs$label, lab$vector),])
  res[,c("label", "vector")] <- NULL
  assign(paste0("Results_", sample), res)
}
## [1] "obs = 572"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.57836 -0.49532 -0.01496  0.50824  2.31458 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.04615    0.03493  -1.321 0.187008    
## scale(log(DistCityHall)) -0.06803    0.04695  -1.449 0.147932    
## scale(v_CA16_3960_Share) -0.27618    0.05015  -5.508 5.82e-08 ***
## scale(v_CA16_3963_Share) -0.15508    0.04646  -3.338 0.000906 ***
## scale(v_CA16_3966_Share) -0.20503    0.05508  -3.722 0.000220 ***
## scale(v_CA16_3969_Share) -0.25682    0.04081  -6.292 6.81e-10 ***
## scale(v_CA16_3972_Share) -0.16055    0.05481  -2.929 0.003556 ** 
## scale(v_CA16_3975_Share)  0.02231    0.03990   0.559 0.576284    
## scale(v_CA16_3978_Share)  0.10553    0.05917   1.784 0.075079 .  
## scale(v_CA16_3981_Share) -0.17017    0.05054  -3.367 0.000817 ***
## scale(v_CA16_3984_Share)  0.07030    0.04905   1.433 0.152407    
## scale(v_CA16_3987_Share)  0.17224    0.03906   4.409 1.27e-05 ***
## scale(v_CA16_3990_Share)  0.04040    0.04816   0.839 0.401908    
## scale(v_CA16_3993_Share)  0.10269    0.04417   2.325 0.020470 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7883 on 502 degrees of freedom
##   (56 observations deleted due to missingness)
## Multiple R-squared:  0.3932, Adjusted R-squared:  0.3775 
## F-statistic: 25.02 on 13 and 502 DF,  p-value: < 2.2e-16
## 
## [1] "obs = 129"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.49437 -0.47483  0.08667  0.42039  1.52203 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.01220    0.06309  -0.193 0.847000    
## scale(log(DistCityHall))  0.26612    0.07872   3.380 0.000992 ***
## scale(v_CA16_3960_Share) -0.06791    0.07964  -0.853 0.395603    
## scale(v_CA16_3963_Share)  0.44595    0.07945   5.613 1.42e-07 ***
## scale(v_CA16_3966_Share) -0.28827    0.09331  -3.089 0.002521 ** 
## scale(v_CA16_3969_Share) -0.04498    0.08377  -0.537 0.592318    
## scale(v_CA16_3972_Share) -0.11881    0.09007  -1.319 0.189809    
## scale(v_CA16_3975_Share)  0.19086    0.07210   2.647 0.009270 ** 
## scale(v_CA16_3978_Share) -0.05635    0.09086  -0.620 0.536355    
## scale(v_CA16_3981_Share) -0.15375    0.07385  -2.082 0.039604 *  
## scale(v_CA16_3984_Share) -0.14064    0.07321  -1.921 0.057227 .  
## scale(v_CA16_3987_Share)  0.09434    0.06984   1.351 0.179418    
## scale(v_CA16_3990_Share)  0.22782    0.07653   2.977 0.003555 ** 
## scale(v_CA16_3993_Share)  0.04045    0.07889   0.513 0.609078    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7137 on 114 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.542,  Adjusted R-squared:  0.4897 
## F-statistic: 10.38 on 13 and 114 DF,  p-value: 3.748e-14
## 
## [1] "obs = 388"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.58511 -0.45024 -0.01079  0.45360  2.05590 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.375e-15  3.468e-02   0.000 1.000000    
## scale(log(DistCityHall)) -5.058e-01  5.097e-02  -9.923  < 2e-16 ***
## scale(v_CA16_3960_Share) -1.474e-01  4.938e-02  -2.986 0.003016 ** 
## scale(v_CA16_3963_Share) -3.871e-02  4.451e-02  -0.870 0.385008    
## scale(v_CA16_3966_Share)  3.147e-03  5.554e-02   0.057 0.954841    
## scale(v_CA16_3969_Share) -1.982e-01  4.021e-02  -4.929 1.24e-06 ***
## scale(v_CA16_3972_Share) -2.886e-01  5.411e-02  -5.333 1.68e-07 ***
## scale(v_CA16_3975_Share) -1.927e-04  3.735e-02  -0.005 0.995887    
## scale(v_CA16_3978_Share) -1.442e-02  5.010e-02  -0.288 0.773702    
## scale(v_CA16_3981_Share) -1.848e-01  5.207e-02  -3.549 0.000437 ***
## scale(v_CA16_3984_Share)  1.205e-01  5.287e-02   2.280 0.023180 *  
## scale(v_CA16_3987_Share)  1.467e-01  4.015e-02   3.653 0.000296 ***
## scale(v_CA16_3990_Share) -1.306e-02  5.381e-02  -0.243 0.808356    
## scale(v_CA16_3993_Share)  1.256e-01  4.237e-02   2.964 0.003230 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6832 on 374 degrees of freedom
## Multiple R-squared:  0.549,  Adjusted R-squared:  0.5333 
## F-statistic: 35.02 on 13 and 374 DF,  p-value: < 2.2e-16
#str(Results_all)
reg_total  <-  cbind(Results_all,Results_central, Results_peripheral)
reg_total[,c("t.value", "Std..Error", "t.value.1", "Std..Error.1", "label.1.1", "t.value.2", "Std..Error.2", "label.1.2")] <- NULL
colnames(reg_total)  <-  c("est_all", "pval_all",  "minority",  "est_central", "pval_central", 
                        "est_peripheral", "pval_peripheral")
reg_total$variables  <-  rownames(reg_total)
reg_total$var  <-  ifelse(is.na(reg_total$minority), as.character(reg_total$variables), as.character(reg_total$minority))

and plot the results regarding coefficients:

pall <- ggplot(reg_total, aes(x = var)) + 
  geom_bar(aes(y = est_all, fill = ifelse(pval_all < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") + 
  # scale_y_continuous(limits = c(-0.55, 0.55)) +
  coord_flip() + theme(legend.title=element_blank()) 
pcent <- ggplot(reg_total, aes(x = var)) + 
  geom_bar(aes(y = est_central, fill = ifelse(pval_central < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") +  
  # scale_y_continuous(limits = c(-0.55, 0.55)) + 
  coord_flip() +   theme(legend.title=element_blank() ) 

pper <- ggplot(reg_total, aes(x = var)) + 
  geom_bar(aes(y = est_peripheral, fill = ifelse(pval_peripheral < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") +
  #scale_y_continuous(limits = c(-0.55, 0.55)) + 
  coord_flip() +   theme(legend.title=element_blank()) 

grid.arrange(pall, pcent, pper)

Now let’s see how we could model relationships at the level of airbnb listings directly, in an attempt to model the price per room:

EstimatorAirbnbPrice <- data.frame(idata, EstimatorAirbnbPresence[match(idata$GeoUID, 
                                                                        EstimatorAirbnbPresence$GeoUID),])

EstimatorAirbnbPrice_all <- EstimatorAirbnbPrice %>%
  filter(!is.na(priceperroom), priceperroom > 0) 
# mean value of listing per capita of tracts between 0 & 10 km: 
central_P <- EstimatorAirbnbPrice_all %>%
  filter(DistCityHall <= breakDist) #%>%   select(ListingPerCapita, DistCityHall, Ei, listMinoShares)

peripheral_P <- EstimatorAirbnbPrice_all %>%
  filter(DistCityHall > breakDist) #%>%   select(ListingPerCapita, DistCityHall, Ei, listMinoShares)



for (sample in c("all", "central", "peripheral")){
  if (sample == "all") df <-  EstimatorAirbnbPrice_all
  if (sample == "central") df  <-  central_P
  if (sample == "peripheral") df  <-  peripheral_P
  
  f  <-  paste0("scale(priceperroom) ~ scale(log(DistCityHall)) + ",
             paste0("scale(",listMinoShares[-1], ")", collapse = " + "))
  model <- lm(formula(f), data = df)
  
  print(paste0("obs = ", dim(df)[1]))
  print(summary(model))
  
  coeffs  <-  as.data.frame(summary(model)$coefficients)
  coeffs$label <- substr(rownames(coeffs), 7, 17)
  res  <-  data.frame(coeffs, lab[match(coeffs$label, lab$vector),])
  res[,c("label", "vector")] <- NULL
  assign(paste0("Results_", sample, "Price"), res)
}
## [1] "obs = 6804"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1172 -0.6004 -0.1854  0.3676 12.5271 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.0001691  0.0116239   0.015  0.98839    
## scale(log(DistCityHall)) -0.0645544  0.0148191  -4.356 1.34e-05 ***
## scale(v_CA16_3960_Share) -0.0013992  0.0145028  -0.096  0.92315    
## scale(v_CA16_3963_Share) -0.0305864  0.0149019  -2.053  0.04016 *  
## scale(v_CA16_3966_Share) -0.0937654  0.0155565  -6.027 1.75e-09 ***
## scale(v_CA16_3969_Share) -0.1358318  0.0136204  -9.973  < 2e-16 ***
## scale(v_CA16_3972_Share) -0.0650294  0.0153333  -4.241 2.25e-05 ***
## scale(v_CA16_3975_Share)  0.1959723  0.0139920  14.006  < 2e-16 ***
## scale(v_CA16_3978_Share)  0.0213549  0.0150946   1.415  0.15719    
## scale(v_CA16_3981_Share) -0.0209739  0.0184476  -1.137  0.25560    
## scale(v_CA16_3984_Share) -0.0124280  0.0181898  -0.683  0.49448    
## scale(v_CA16_3987_Share) -0.0040881  0.0128859  -0.317  0.75106    
## scale(v_CA16_3990_Share) -0.0402413  0.0140014  -2.874  0.00406 ** 
## scale(v_CA16_3993_Share)  0.0846088  0.0134402   6.295 3.26e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9587 on 6789 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08259,    Adjusted R-squared:  0.08083 
## F-statistic: 47.01 on 13 and 6789 DF,  p-value: < 2.2e-16
## 
## [1] "obs = 1903"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8166 -0.6029 -0.1717  0.3819  8.9509 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.0003061  0.0218483   0.014   0.9888    
## scale(log(DistCityHall))  0.1443948  0.0268202   5.384 8.20e-08 ***
## scale(v_CA16_3960_Share) -0.0158484  0.0325442  -0.487   0.6263    
## scale(v_CA16_3963_Share)  0.1307021  0.0291494   4.484 7.77e-06 ***
## scale(v_CA16_3966_Share) -0.0538214  0.0290751  -1.851   0.0643 .  
## scale(v_CA16_3969_Share)  0.1329977  0.0316182   4.206 2.72e-05 ***
## scale(v_CA16_3972_Share) -0.1257152  0.0286462  -4.389 1.20e-05 ***
## scale(v_CA16_3975_Share)  0.2284338  0.0305962   7.466 1.25e-13 ***
## scale(v_CA16_3978_Share) -0.0510720  0.0331065  -1.543   0.1231    
## scale(v_CA16_3981_Share) -0.0335243  0.0304280  -1.102   0.2707    
## scale(v_CA16_3984_Share) -0.0195396  0.0265827  -0.735   0.4624    
## scale(v_CA16_3987_Share)  0.0212537  0.0236393   0.899   0.3687    
## scale(v_CA16_3990_Share)  0.0048932  0.0303606   0.161   0.8720    
## scale(v_CA16_3993_Share)  0.0300574  0.0289871   1.037   0.2999    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9528 on 1888 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.09834,    Adjusted R-squared:  0.09214 
## F-statistic: 15.84 on 13 and 1888 DF,  p-value: < 2.2e-16
## 
## [1] "obs = 4901"
## 
## Call:
## lm(formula = formula(f), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0305 -0.5466 -0.1550  0.3313 12.3440 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               6.657e-16  1.310e-02   0.000 1.000000    
## scale(log(DistCityHall)) -3.476e-01  1.790e-02 -19.426  < 2e-16 ***
## scale(v_CA16_3960_Share)  5.971e-02  1.777e-02   3.360 0.000786 ***
## scale(v_CA16_3963_Share) -3.090e-02  1.627e-02  -1.899 0.057650 .  
## scale(v_CA16_3966_Share) -5.739e-02  1.965e-02  -2.921 0.003503 ** 
## scale(v_CA16_3969_Share) -9.056e-02  1.602e-02  -5.653 1.66e-08 ***
## scale(v_CA16_3972_Share) -1.144e-01  1.821e-02  -6.281 3.66e-10 ***
## scale(v_CA16_3975_Share)  1.127e-01  1.569e-02   7.183 7.85e-13 ***
## scale(v_CA16_3978_Share) -4.690e-03  1.684e-02  -0.278 0.780697    
## scale(v_CA16_3981_Share)  3.116e-02  2.147e-02   1.451 0.146755    
## scale(v_CA16_3984_Share) -2.361e-03  2.126e-02  -0.111 0.911567    
## scale(v_CA16_3987_Share) -3.712e-02  1.469e-02  -2.527 0.011532 *  
## scale(v_CA16_3990_Share) -4.716e-03  2.009e-02  -0.235 0.814439    
## scale(v_CA16_3993_Share)  5.865e-02  1.470e-02   3.990 6.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.917 on 4887 degrees of freedom
## Multiple R-squared:  0.1613, Adjusted R-squared:  0.1591 
## F-statistic: 72.32 on 13 and 4887 DF,  p-value: < 2.2e-16
reg_total2  <-  cbind(Results_allPrice,Results_centralPrice, Results_peripheralPrice)
reg_total2[,c("t.value", "Std..Error", "t.value.1", "Std..Error.1", "label.1.1", "t.value.2", "Std..Error.2", "label.1.2")] <- NULL
colnames(reg_total2) <-  c("est_all", "pval_all",  "minority",  "est_central", "pval_central", 
                         "est_peripheral", "pval_peripheral")
reg_total2$variables <-  rownames(reg_total2)
reg_total2$var  <-  ifelse(is.na(reg_total2$minority), as.character(reg_total2$variables), 
                        as.character(reg_total2$minority))

And visualise the results compared to previous results:

pall2 <- ggplot(reg_total2, aes(x = var)) + 
  geom_bar(aes(y = est_all, fill = ifelse(pval_all < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") + 
  # scale_y_continuous(limits = c(-0.55, 0.55)) +
  coord_flip() + theme(legend.title=element_blank()) 
pcent2 <- ggplot(reg_total2, aes(x = var)) + 
  geom_bar(aes(y = est_central, fill = ifelse(pval_central < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") +  
  # scale_y_continuous(limits = c(-0.55, 0.55)) + 
  coord_flip() + theme(legend.title=element_blank() ) 
pper2 <- ggplot(reg_total2, aes(x = var)) + 
  geom_bar(aes(y = est_peripheral, fill = ifelse(pval_peripheral < 0.05, "pval < 0.05", "pval >= 0.05")), stat="identity") +
  #scale_y_continuous(limits = c(-0.55, 0.55)) +
  coord_flip() + 
  theme(legend.title=element_blank()) 

grid.arrange(pall, pall2, 
             pcent, pcent2, 
             pper, pper2, 
             ncol = 2)

Then let’s compare the segregation levels on minority entropy and ordinal price for the chosen city.

ReardonSegregationAirbnbPrice <- segIndex10(tabOfSpatialUnits = city_tracts@data, distributionColNames = paste0("G", 1:10))
EntropySegregationMinorities  <-  cma_seg[cma_seg$CSD_UID == censusCodeMuni,"H"]

ReardonSegregationAirbnbPrice
## [1] 0.2616599
EntropySegregationMinorities
## # A tibble: 1 x 1
##       H
##   <dbl>
## 1 0.180

Now up to you to: