import geopandas as gpd
import holoviews as hv
import hvplot.pandas
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import requests
import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Show all columns
pd.options.display.max_columns = 999
Predictive Modeling of Housing Prices in Philadelphia
Goals:
- Optimizing our hyperparameters during the modeling process using cross-validation and a grid search
- Testing the fairness of our model by calculating the intersection of the model error rate and poverty rate across neighborhoods
Part 2: Modeling Philadelphia’s Housing Prices and Algorithmic Fairness
2.1 Load data from the Office of Property Assessment
Use the requests package to query the CARTO API for single-family property assessment data in Philadelphia for properties that had their last sale during 2022.
Sources:
- OpenDataPhilly
- Metadata
# The API endpoint
= "https://phl.carto.com/api/v2/sql"
carto_url
# Only pull 2022 sales for single family residential properties
= "sale_date >= '2022-01-01' and sale_date <= '2022-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')" where
# Create the query
= f"SELECT * FROM opa_properties_public WHERE {where}"
query
# Make the request
= {"q": query, "format": "geojson", "where": where}
params = requests.get(carto_url, params=params)
response
# Make the GeoDataFrame
= gpd.GeoDataFrame.from_features(response.json(), crs="EPSG:4326") salesRaw
salesRaw.head()
geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | census_tract | central_air | cross_reference | date_exterior_condition | depth | exempt_building | exempt_land | exterior_condition | fireplaces | frontage | fuel | garage_spaces | garage_type | general_construction | geographic_ward | homestead_exemption | house_extension | house_number | interior_condition | location | mailing_address_1 | mailing_address_2 | mailing_care_of | mailing_city_state | mailing_street | mailing_zip | market_value | market_value_date | number_of_bathrooms | number_of_bedrooms | number_of_rooms | number_stories | off_street_open | other_building | owner_1 | owner_2 | parcel_number | parcel_shape | quality_grade | recording_date | registry_number | sale_date | sale_price | separate_utilities | sewer | site_type | state_code | street_code | street_designation | street_direction | street_name | suffix | taxable_building | taxable_land | topography | total_area | total_livable_area | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | pin | building_code_new | building_code_description_new | objectid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.13389 40.03928) | 1056 | 2022-05-24T00:00:00Z | H | 241' N OF CHEW ST | 54230133 | R30 | ROW B/GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 275 | N | None | None | 95.0 | 0.0 | 0.0 | 7 | 0.0 | 15.0 | None | 1.0 | None | B | 61 | 0 | None | 5732 | 4 | 5732 N 7TH ST | WALKER MICHAEL | None | None | SICKLERVILLE NJ | 44 FARMHOUSE RD | 08081 | 133400 | None | 1.0 | 3.0 | NaN | 2.0 | 1920.0 | None | WALKER MICHAEL | None | 612234600 | E | C | 2023-10-04T00:00:00Z | 135N7 61 | 2022-08-21T00:00:00Z | 21000 | None | None | None | NJ | 87930 | ST | N | 7TH | None | 106720.0 | 26680.0 | F | 1425.0 | 1164.0 | H | None | None | None | I | 1925 | Y | 19120 | RSA5 | 1001602509 | 24 | ROW PORCH FRONT | 401090517 |
1 | POINT (-75.14337 40.00957) | 1145 | 2022-05-24T00:00:00Z | D | 415' N OF ERIE AVE | 54230032 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 198 | N | None | None | 45.0 | 0.0 | 0.0 | 4 | 0.0 | 16.0 | None | 0.0 | None | A | 43 | 0 | None | 3753 | 4 | 3753 N DELHI ST | None | None | None | DELRAY BEACH FL | 4899 NW 6TH STREET | 33445 | 73800 | None | 1.0 | 3.0 | NaN | 2.0 | 1683.0 | None | RJ SIMPLE SOLUTION LLC | None | 432345900 | E | C | 2023-10-04T00:00:00Z | 100N040379 | 2022-06-13T00:00:00Z | 35000 | None | None | None | FL | 28040 | ST | N | DELHI | None | 59040.0 | 14760.0 | F | 720.0 | 960.0 | H | None | None | None | I | 1942 | Y | 19140 | RM1 | 1001175031 | 24 | ROW PORCH FRONT | 401090494 |
2 | POINT (-75.07249 40.01381) | 1357 | 2022-05-24T00:00:00Z | D | 119'11 1/2" NE | 54228837 | H30 | SEMI/DET 2 STY MASONRY | 1 | SINGLE FAMILY | 299 | N | None | None | 76.0 | 0.0 | 0.0 | 4 | 0.0 | 20.0 | None | 0.0 | None | B | 62 | 0 | None | 5033 | 4 | 5033 DITMAN ST | CSC INGEO | None | None | PHILADELPHIA PA | 5033 DITMAN ST | 19124-2230 | 119100 | None | 1.0 | 3.0 | NaN | 2.0 | 698.0 | None | LISHANSKY MARINA | None | 622444400 | E | C | 2023-10-02T00:00:00Z | 89N17 208 | 2022-12-28T00:00:00Z | 1 | None | None | None | PA | 28660 | ST | None | DITMAN | None | 95280.0 | 23820.0 | F | 1523.0 | 1190.0 | B | None | None | None | I | 1945 | None | 19124 | RSA5 | 1001181518 | 32 | TWIN CONVENTIONAL | 401090706 |
3 | POINT (-75.12854 40.03916) | 1766 | 2022-05-24T00:00:00Z | None | 71'8" E LAWRENCE ST | 54226519 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 274 | None | None | None | 88.0 | 0.0 | 0.0 | 4 | 0.0 | 14.0 | None | 0.0 | None | A | 61 | 0 | None | 416 | 4 | 416 W GRANGE AVE | None | None | None | PHILADELPHIA PA | 416 W GRANGE AVE | 19120-1854 | 124100 | None | 1.0 | 3.0 | NaN | 2.0 | 957.0 | None | WALLACE DANE | None | 612061100 | E | C | 2023-09-25T00:00:00Z | 122N2 150 | 2022-10-26T00:00:00Z | 1 | None | None | None | PA | 38040 | AVE | W | GRANGE | None | 99280.0 | 24820.0 | F | 1241.0 | 1104.0 | None | None | None | None | I | 1953 | Y | 19120 | RSA5 | 1001249126 | 24 | ROW PORCH FRONT | 401091115 |
4 | POINT (-75.17362 39.99887) | 2005 | 2022-05-24T00:00:00Z | D | 261'4" N OF SOMERSET | 54217081 | O50 | ROW 3 STY MASONRY | 1 | SINGLE FAMILY | 172 | N | None | None | 56.0 | 0.0 | 0.0 | 4 | 0.0 | 16.0 | None | 0.0 | None | B | 38 | 0 | None | 2834 | 4 | 2834 N 26TH ST | NEAL KIYONNA | None | None | PHILADELPHIA PA | 6007 N FRONT ST | 19120 | 92900 | None | 0.0 | 5.0 | NaN | 3.0 | 2457.0 | None | NEAL KIYONNA | None | 381152100 | E | C+ | 2023-08-28T00:00:00Z | 035N230348 | 2022-05-11T00:00:00Z | 1 | None | None | None | PA | 88300 | ST | N | 26TH | None | 74320.0 | 18580.0 | F | 896.0 | 1636.0 | H | None | None | None | I | 1940 | Y | 19132 | RSA5 | 1001643492 | 24 | ROW PORCH FRONT | 401092557 |
len(salesRaw)
24456
Clean data
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
    "geometry",
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()

# Trim zip code to only the first five digits
sales["zip_code"] = sales["zip_code"].astype(str).str.slice(0, 5)
len(sales)
23463
2.2 Load data for census tracts and neighborhoods
Load various Philadelphia-based regions that we will use in our analysis.
- Census tracts can be downloaded from: https://opendata.arcgis.com/datasets/8bc0786524a4486bb3cf0f9862ad0fbf_0.geojson
- Neighborhoods can be downloaded from: https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson
= gpd.read_file("https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson")
neigh = gpd.read_file("Census_Tracts_2010.geojson") tract
tract.explore()
2.3 Spatially join the sales data and neighborhoods/census tracts.
Perform a spatial join, such that each sale has an associated neighborhood and census tract.
Note: After performing the first spatial join, you will need to use the drop() function to remove the index_right column; otherwise an error will be raised on the second spatial join about duplicate columns.
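Before running the joins below, it is also worth a quick sanity check that all three layers share a coordinate reference system (a minimal sketch; the joins below reproject the polygon layers to the sales CRS if needed):

# Confirm the point layer and both polygon layers use the same CRS;
# if they differ, reproject the polygons to match the sales points.
print(sales.crs)
print(tract.crs)
print(neigh.crs)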
# Join each sale to its census tract, dropping index_right before the second join
join = gpd.sjoin(
    sales, tract.to_crs(sales.crs), predicate="within"
).drop(columns=["index_right"])

# Join the result to the neighborhoods
sales_full = gpd.sjoin(
    join, neigh.to_crs(join.crs), predicate="within"
).drop(columns=["index_right"])

sales_full.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | OBJECTID | STATEFP10 | COUNTYFP10 | TRACTCE10 | GEOID10 | NAME10 | NAMELSAD10 | MTFCC10 | FUNCSTAT10 | ALAND10 | AWATER10 | INTPTLAT10 | INTPTLON10 | LOGRECNO | name | listname | mapname | shape_leng | shape_area | cartodb_id | created_at | updated_at | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21000 | 1164.0 | 1425.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 7 | 19120 | POINT (-75.13389 40.03928) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
586 | 1 | 1188.0 | 1575.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2.0 | 3 | 19120 | POINT (-75.13367 40.04039) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
1041 | 137000 | 1176.0 | 1234.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13277 40.04028) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
1567 | 259000 | 1310.0 | 1754.0 | 1.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13173 40.03868) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
2000 | 220000 | 1272.0 | 1445.0 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13245 40.03959) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
2.4 Train a Random Forest on the sales data
In this step, you should follow the steps outlined in lecture to preprocess and train your model. We’ll extend our analysis to do a hyperparameter grid search to find the best model configuration. As you train your model, follow these steps:
Preprocessing Requirements
- Trim the sales data to sales with prices between $3,000 and $1 million.
- Set up a pipeline that includes both numerical columns and categorical columns.
- Include a one-hot encoded variable for the neighborhood of the sale, instead of the ZIP code. We don’t want to include multiple location-based categories, since they encode much of the same information.
Training requirements
- Use a 70/30 training/test split and predict the log of the sale price.
- Use GridSearchCV to perform a k-fold cross-validation that optimizes at least 2 hyperparameters of the RandomForestRegressor.
- After fitting your model and finding the optimal hyperparameters, evaluate the score (R-squared) on the test set (the original 30% sample withheld).
Note: You don’t need to include additional features (such as spatial distance features) or perform any extra feature engineering beyond what is required above to receive full credit. Of course, you are always welcome to experiment!
# Trim to sales priced between $3,000 and $1 million
trim_sales = sales_full.loc[
    (sales_full["sale_price"] > 3000) & (sales_full["sale_price"] < 1000000)
].copy()

# Use the neighborhood name ("mapname") as the neighborhood feature
trim_sales.rename(columns={"mapname": "neighborhood"}, inplace=True)
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "neighborhood",
    "log(sale_price)",
]

# Add the log of the sale price, then trim to these columns and remove NaNs
trim_sales["log(sale_price)"] = np.log(trim_sales["sale_price"])
saleDF = trim_sales[cols].dropna()

saleDF.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | neighborhood | log(sale_price) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21000 | 1164.0 | 1425.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 7 | Olney | 9.952278 |
1041 | 137000 | 1176.0 | 1234.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | Olney | 11.827736 |
1567 | 259000 | 1310.0 | 1754.0 | 1.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | Olney | 12.464583 |
2000 | 220000 | 1272.0 | 1445.0 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | Olney | 12.301383 |
2618 | 200000 | 1304.0 | 1360.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | Olney | 12.206073 |
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from itertools import compress
# Flag which of the feature columns are numeric
numCols = []
txtCols = []
for col in cols:
    numCols.append(is_numeric_dtype(saleDF[col]))
    txtCols.append(not numCols[-1])

# Keep the numeric feature names, dropping sale_price and log(sale_price)
numCols = list(compress(cols, numCols))[1:-1]
catCols = ["exterior_condition", "neighborhood"]

# Scale the numeric features and one-hot encode the categorical ones
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numCols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), catCols),
    ]
)
numCols
['total_livable_area',
'total_area',
'garage_spaces',
'fireplaces',
'number_of_bathrooms',
'number_of_bedrooms',
'number_stories']
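As a cross-check (a sketch, not part of the required pipeline; num_alt is a hypothetical name), pandas’ select_dtypes can recover the same numeric feature list more directly, and the result should match numCols above:

# Alternative: let pandas identify the numeric feature columns,
# excluding the two target columns (the categoricals are non-numeric and drop out)
num_alt = (
    saleDF.drop(columns=["sale_price", "log(sale_price)"])
    .select_dtypes("number")
    .columns
    .tolist()
)
print(num_alt)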
# Split the data 70/30
train_set, test_set = train_test_split(saleDF, test_size=0.3, random_state=42)

# the target labels: log of sale price
y_train = train_set["log(sale_price)"]
y_test = test_set["log(sale_price)"]

pipe = make_pipeline(
    transformer, RandomForestRegressor(random_state=42)
)

model_step = "randomforestregressor"
param_grid = {
    f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
    f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1)

# Run the search
grid.fit(train_set, y_train)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
GridSearchCV(cv=3, estimator=Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', StandardScaler(), ['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['exterior_condition', 'neighborhood'])])), ('randomforestregressor', RandomForestRegressor(random_state=42))]), param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51], 'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200]}, verbose=1)
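The winning configuration can also be read off directly from the fitted grid (a quick sketch using the standard best_params_ and best_score_ attributes of GridSearchCV):

# The best hyperparameter combination and its mean cross-validated R^2
print(grid.best_params_)
print("Best mean CV R^2:", grid.best_score_)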
print("Getting 3-fold cross validation results.")
scores = cross_val_score(
    pipe,
    train_set,
    y_train,
    cv=3,
)
print("Completed")

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Standard deviation = ", scores.std())
Getting 3-fold cross validation results.
Completed
R^2 scores = [0.56575893 0.51311316 0.53833274]
Scores mean = 0.5390682769418876
Standard deviation = 0.021498838781376265
print(f"The optimal model of our 64 candiates had a max depth of {grid.best_estimator_.steps[1][1].max_depth} and an n_estimator count of {grid.best_estimator_.steps[1][1].n_estimators}\nleading to an accuracy score of {grid.best_estimator_.score(test_set,y_test)} or {grid.best_estimator_.score(test_set,y_test)*100:.2f}% on the test set.")
print("\n\nCV Score Report ...\n")
print("R^2 scores:\n\tFold 1 = ", scores[0],"\n\tFold 2 = ", scores[1],"\n\tFold 3 = ", scores[2],)
print("Scores mean = ", scores.mean())
print("Score standard deviation = ", scores.std())
The optimal model of our 64 candidates had a max depth of 51 and an n_estimators count of 200
leading to an R^2 score of 0.5573963547661122 or 55.74% on the test set.
CV Score Report ...
R^2 scores:
Fold 1 = 0.5657589318132974
Fold 2 = 0.5131131592492879
Fold 3 = 0.5383327397630778
Scores mean = 0.5390682769418876
Score standard deviation = 0.021498838781376265
# Note: the standalone `transformer` object is never fitted, because GridSearchCV clones
# the pipeline before fitting. Pull the fitted ColumnTransformer out of the best estimator
# instead so that get_feature_names_out() works.
random_forest = grid.best_estimator_.named_steps["randomforestregressor"]
fitted_transformer = grid.best_estimator_.named_steps["columntransformer"]
features = numCols + list(fitted_transformer.named_transformers_["cat"].get_feature_names_out())

importance = pd.DataFrame(
    {"Feature": features, "Importance": random_forest.feature_importances_}
)
importance = importance.sort_values("Importance", ascending=False).iloc[:30]

importance.head(n=20)

importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
2.5 Calculate the percent error of your model predictions for each sale in the test set
Fit your best model and use it to make predictions on the test set.
Note: This should be the percent error in terms of sale price. You’ll need to convert if your model predicted the log of sales price!
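Concretely, since the model predicts log(sale_price), the prediction is converted back to dollars before computing the percent error. A minimal sketch of the calculation (pct_error is a hypothetical name; the cells below build the same quantity in two steps):

# percent error = 100 * (predicted price - actual price) / actual price,
# where predicted price = exp(predicted log price)
pct_error = 100 * (np.exp(grid.best_estimator_.predict(test_set)) - test_set["sale_price"]) / test_set["sale_price"]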
# Predict log(sale_price) for the test set
predictions = grid.best_estimator_.predict(test_set)

# Error in log space
errors = predictions - y_test

# Error in dollars: convert predictions and actuals back out of log space
dollar_errors = np.exp(predictions) - np.exp(y_test)

print(errors[0])
print(dollar_errors[0])
print()
print(test_set[0:1]["log(sale_price)"])
print()
print(predictions[0])
1.5150772918914885
74545.2282497846
16061 11.350407
Name: log(sale_price), dtype: float64
12.527490609299118
2.6 Make a data frame with percent errors and census tract info for each sale in the test set
Create a data frame that has the property geometries, census tract data, and percent errors for all of the sales in the test set.
Notes
- When using the “train_test_split()” function, the index of the test data frame includes the labels from the original sales data frame
- You can use this index to slice out the test data from the original sales data frame, which should include the census tract info and geometries
- Add a new column to this data frame holding the percent error data
- Make sure to use the percent error and not the absolute percent error
# Use the test set index to slice the test sales out of the sales data frame
test_saleDF = saleDF.loc[test_set.index]

# Attach the predictions and errors (log space, dollars, and percent)
test_saleDF["log(predictions)"] = predictions
test_saleDF["predictions"] = np.exp(predictions)
test_saleDF["error"] = dollar_errors
test_saleDF["percent_error"] = dollar_errors / test_saleDF["sale_price"] * 100

# Merge the census tract info and geometries back in
test_saleDF = sales_full.merge(test_saleDF, how="right")

test_saleDF[["sale_price", "log(sale_price)", "log(predictions)", "predictions", "error", "percent_error"]].tail()
sale_price | log(sale_price) | log(predictions) | predictions | error | percent_error | |
---|---|---|---|---|---|---|
5352 | 181000 | 12.106252 | 11.345868 | 84615.078520 | -96384.921480 | -53.251338 |
5353 | 300000 | 12.611538 | 12.959598 | 424895.325384 | 124895.325384 | 41.631775 |
5354 | 265000 | 12.487485 | 12.673232 | 319091.035201 | 54091.035201 | 20.411711 |
5355 | 380000 | 12.847927 | 12.646338 | 310624.010585 | -69375.989415 | -18.256839 |
5356 | 134000 | 11.805595 | 12.530408 | 276622.279170 | 142622.279170 | 106.434537 |
2.8 Plot a map of the median percent error by census tract
- You’ll want to group your data frame of test sales by the GEOID10 column and take the median of your percent error column.
- Merge the census tract geometries back in and use geopandas to plot.
# Median prediction, dollar error, and percent error by census tract
tractPE = tract.merge(
    test_saleDF.groupby("GEOID10").agg(
        {"predictions": "median", "error": "median", "percent_error": "median"}
    ),
    on="GEOID10",
)
tractPE.columns
Index(['OBJECTID', 'STATEFP10', 'COUNTYFP10', 'TRACTCE10', 'GEOID10', 'NAME10',
'NAMELSAD10', 'MTFCC10', 'FUNCSTAT10', 'ALAND10', 'AWATER10',
'INTPTLAT10', 'INTPTLON10', 'LOGRECNO', 'geometry', 'predictions',
'error', 'percent_error'],
dtype='object')
={"NAME10":"Census Tract #","predictions":"Predicted Sale Price ($)","error":"Prediction Error ($)","percent_error":"Percent Error"}).explore(
tractPE.rename(columns= "Percent Error",
column = "viridis",
cmap = "CartoDB positron",
tiles = ["Census Tract #","Predicted Sale Price ($)","Prediction Error ($)","Percent Error"]
tooltip )
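For a report or static document, the same map can be drawn with geopandas’ built-in plot method (a sketch; the figure size and styling here are arbitrary choices, and plt is the pyplot module imported at the top of the notebook):

# Static choropleth of median percent error by census tract
fig, ax = plt.subplots(figsize=(10, 10))
tractPE.plot(column="percent_error", cmap="viridis", legend=True, ax=ax)
ax.set_axis_off()
ax.set_title("Median percent error by census tract")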
2.9 Compare the percent errors in Qualifying Census Tracts and other tracts
Qualifying Census Tracts are a poverty designation that HUD uses to allocate housing tax credits.
- I’ve included a list of the census tract names that qualify in Philadelphia below.
- Add a new column to your data frame of test-set sales that is True/False depending on whether the tract is a QCT.
- Then, group by this new column and calculate the median percent error.
You should find that the algorithm’s accuracy is significantly worse in these low-income, qualifying census tracts.
qct = [
    '5',
    '20',
'22',
'28.01',
'30.01',
'30.02',
'31',
'32',
'33',
'36',
'37.01',
'37.02',
'39.01',
'41.01',
'41.02',
'56',
'60',
'61',
'62',
'63',
'64',
'65',
'66',
'67',
'69',
'70',
'71.01',
'71.02',
'72',
'73',
'74',
'77',
'78',
'80',
'81.01',
'81.02',
'82',
'83.01',
'83.02',
'84',
'85',
'86.01',
'86.02',
'87.01',
'87.02',
'88.01',
'88.02',
'90',
'91',
'92',
'93',
'94',
'95',
'96',
'98.01',
'100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'109',
'110',
'111',
'112',
'113',
'119',
'121',
'122.01',
'122.03',
'131',
'132',
'137',
'138',
'139',
'140',
'141',
'144',
'145',
'146',
'147',
'148',
'149',
'151.01',
'151.02',
'152',
'153',
'156',
'157',
'161',
'162',
'163',
'164',
'165',
'167.01',
'167.02',
'168',
'169.01',
'169.02',
'170',
'171',
'172.01',
'172.02',
'173',
'174',
'175',
'176.01',
'176.02',
'177.01',
'177.02',
'178',
'179',
'180.02',
'188',
'190',
'191',
'192',
'195.01',
'195.02',
'197',
'198',
'199',
'200',
'201.01',
'201.02',
'202',
'203',
'204',
'205',
'206',
'208',
'239',
'240',
'241',
'242',
'243',
'244',
'245',
'246',
'247',
'249',
'252',
'253',
'265',
'267',
'268',
'271',
'274.01',
'274.02',
'275',
'276',
'277',
'278',
'279.01',
'279.02',
'280',
'281',
'282',
'283',
'284',
'285',
'286',
'287',
'288',
'289.01',
'289.02',
'290',
'291',
'293',
'294',
'298',
'299',
'300',
'301',
'302',
'305.01',
'305.02',
'309',
'311.01',
'312',
'313',
'314.01',
'314.02',
'316',
'318',
'319',
'321',
'325',
'329',
'330',
'337.01',
'345.01',
'357.01',
'376',
'377',
'380',
'381',
'382',
'383',
'389',
'390']
"is_qct"] = tractPE["NAME10"].isin(qct)
tractPE[= tractPE.groupby("is_qct",as_index=False).agg({"predictions": "median","error": ["mean","median"],"percent_error" : ["mean","median"]})
qctGroup = qctGroup[1:]["percent_error"]["median"].squeeze()
qct_t = qctGroup[:1]["percent_error"]["median"].squeeze()
qct_f qctGroup
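To make the comparison explicit (a small sketch; it simply prints the two medians computed above rather than relying on the grouped table):

# Median percent error inside vs. outside Qualifying Census Tracts
print("Median percent error, QCT tracts:", qct_t)
print("Median percent error, non-QCT tracts:", qct_f)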