import geopandas as gpd
import holoviews as hv
import hvplot.pandas
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import requests
import missingno as msno
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Show all columns
pd.options.display.max_columns = 999
Predictive Modeling of Housing Prices in Philadelphia
Goals:
- Optimizing our hyperparameters during the modeling process using cross-validation and a grid search
- Testing the fairness of our model by calculating the intersection of the model error rate and poverty rate across neighborhoods
Part 2: Modeling Philadelphia’s Housing Prices and Algorithmic Fairness
2.1 Load data from the Office of Property Assessment
Use the requests package to query the CARTO API for single-family property assessment data in Philadelphia for properties that had their last sale during 2022.
Sources:
- OpenDataPhilly
- Metadata
# The API endpoint
= "https://phl.carto.com/api/v2/sql"
carto_url
# Only pull 2022 sales for single family residential properties
= "sale_date >= '2022-01-01' and sale_date <= '2022-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')" where
# Create the query
= f"SELECT * FROM opa_properties_public WHERE {where}"
query
# Make the request
= {"q": query, "format": "geojson", "where": where}
params = requests.get(carto_url, params=params)
response
# Make the GeoDataFrame
= gpd.GeoDataFrame.from_features(response.json(), crs="EPSG:4326") salesRaw
salesRaw.head()
geometry | cartodb_id | assessment_date | basements | beginning_point | book_and_page | building_code | building_code_description | category_code | category_code_description | census_tract | central_air | cross_reference | date_exterior_condition | depth | exempt_building | exempt_land | exterior_condition | fireplaces | frontage | fuel | garage_spaces | garage_type | general_construction | geographic_ward | homestead_exemption | house_extension | house_number | interior_condition | location | mailing_address_1 | mailing_address_2 | mailing_care_of | mailing_city_state | mailing_street | mailing_zip | market_value | market_value_date | number_of_bathrooms | number_of_bedrooms | number_of_rooms | number_stories | off_street_open | other_building | owner_1 | owner_2 | parcel_number | parcel_shape | quality_grade | recording_date | registry_number | sale_date | sale_price | separate_utilities | sewer | site_type | state_code | street_code | street_designation | street_direction | street_name | suffix | taxable_building | taxable_land | topography | total_area | total_livable_area | type_heater | unfinished | unit | utility | view_type | year_built | year_built_estimate | zip_code | zoning | pin | building_code_new | building_code_description_new | objectid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POINT (-75.13389 40.03928) | 1056 | 2022-05-24T00:00:00Z | H | 241' N OF CHEW ST | 54230133 | R30 | ROW B/GAR 2 STY MASONRY | 1 | SINGLE FAMILY | 275 | N | None | None | 95.0 | 0.0 | 0.0 | 7 | 0.0 | 15.0 | None | 1.0 | None | B | 61 | 0 | None | 5732 | 4 | 5732 N 7TH ST | WALKER MICHAEL | None | None | SICKLERVILLE NJ | 44 FARMHOUSE RD | 08081 | 133400 | None | 1.0 | 3.0 | NaN | 2.0 | 1920.0 | None | WALKER MICHAEL | None | 612234600 | E | C | 2023-10-04T00:00:00Z | 135N7 61 | 2022-08-21T00:00:00Z | 21000 | None | None | None | NJ | 87930 | ST | N | 7TH | None | 106720.0 | 26680.0 | F | 1425.0 | 1164.0 | H | None | None | None | I | 1925 | Y | 19120 | RSA5 | 1001602509 | 24 | ROW PORCH FRONT | 401090517 |
1 | POINT (-75.14337 40.00957) | 1145 | 2022-05-24T00:00:00Z | D | 415' N OF ERIE AVE | 54230032 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 198 | N | None | None | 45.0 | 0.0 | 0.0 | 4 | 0.0 | 16.0 | None | 0.0 | None | A | 43 | 0 | None | 3753 | 4 | 3753 N DELHI ST | None | None | None | DELRAY BEACH FL | 4899 NW 6TH STREET | 33445 | 73800 | None | 1.0 | 3.0 | NaN | 2.0 | 1683.0 | None | RJ SIMPLE SOLUTION LLC | None | 432345900 | E | C | 2023-10-04T00:00:00Z | 100N040379 | 2022-06-13T00:00:00Z | 35000 | None | None | None | FL | 28040 | ST | N | DELHI | None | 59040.0 | 14760.0 | F | 720.0 | 960.0 | H | None | None | None | I | 1942 | Y | 19140 | RM1 | 1001175031 | 24 | ROW PORCH FRONT | 401090494 |
2 | POINT (-75.07249 40.01381) | 1357 | 2022-05-24T00:00:00Z | D | 119'11 1/2" NE | 54228837 | H30 | SEMI/DET 2 STY MASONRY | 1 | SINGLE FAMILY | 299 | N | None | None | 76.0 | 0.0 | 0.0 | 4 | 0.0 | 20.0 | None | 0.0 | None | B | 62 | 0 | None | 5033 | 4 | 5033 DITMAN ST | CSC INGEO | None | None | PHILADELPHIA PA | 5033 DITMAN ST | 19124-2230 | 119100 | None | 1.0 | 3.0 | NaN | 2.0 | 698.0 | None | LISHANSKY MARINA | None | 622444400 | E | C | 2023-10-02T00:00:00Z | 89N17 208 | 2022-12-28T00:00:00Z | 1 | None | None | None | PA | 28660 | ST | None | DITMAN | None | 95280.0 | 23820.0 | F | 1523.0 | 1190.0 | B | None | None | None | I | 1945 | None | 19124 | RSA5 | 1001181518 | 32 | TWIN CONVENTIONAL | 401090706 |
3 | POINT (-75.12854 40.03916) | 1766 | 2022-05-24T00:00:00Z | None | 71'8" E LAWRENCE ST | 54226519 | O30 | ROW 2 STY MASONRY | 1 | SINGLE FAMILY | 274 | None | None | None | 88.0 | 0.0 | 0.0 | 4 | 0.0 | 14.0 | None | 0.0 | None | A | 61 | 0 | None | 416 | 4 | 416 W GRANGE AVE | None | None | None | PHILADELPHIA PA | 416 W GRANGE AVE | 19120-1854 | 124100 | None | 1.0 | 3.0 | NaN | 2.0 | 957.0 | None | WALLACE DANE | None | 612061100 | E | C | 2023-09-25T00:00:00Z | 122N2 150 | 2022-10-26T00:00:00Z | 1 | None | None | None | PA | 38040 | AVE | W | GRANGE | None | 99280.0 | 24820.0 | F | 1241.0 | 1104.0 | None | None | None | None | I | 1953 | Y | 19120 | RSA5 | 1001249126 | 24 | ROW PORCH FRONT | 401091115 |
4 | POINT (-75.17362 39.99887) | 2005 | 2022-05-24T00:00:00Z | D | 261'4" N OF SOMERSET | 54217081 | O50 | ROW 3 STY MASONRY | 1 | SINGLE FAMILY | 172 | N | None | None | 56.0 | 0.0 | 0.0 | 4 | 0.0 | 16.0 | None | 0.0 | None | B | 38 | 0 | None | 2834 | 4 | 2834 N 26TH ST | NEAL KIYONNA | None | None | PHILADELPHIA PA | 6007 N FRONT ST | 19120 | 92900 | None | 0.0 | 5.0 | NaN | 3.0 | 2457.0 | None | NEAL KIYONNA | None | 381152100 | E | C+ | 2023-08-28T00:00:00Z | 035N230348 | 2022-05-11T00:00:00Z | 1 | None | None | None | PA | 88300 | ST | N | 26TH | None | 74320.0 | 18580.0 | F | 896.0 | 1636.0 | H | None | None | None | I | 1940 | Y | 19132 | RSA5 | 1001643492 | 24 | ROW PORCH FRONT | 401092557 |
len(salesRaw)
24456
Clean data
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
    "geometry",
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()

# Trim zip code to only the first five digits
sales["zip_code"] = sales["zip_code"].astype(str).str.slice(0, 5)
len(sales)
23463
2.2 Load data for census tracts and neighborhoods
Load various Philadelphia-based regions that we will use in our analysis.
- Census tracts can be downloaded from: https://opendata.arcgis.com/datasets/8bc0786524a4486bb3cf0f9862ad0fbf_0.geojson
- Neighborhoods can be downloaded from: https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson
= gpd.read_file("https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson")
neigh = gpd.read_file("Census_Tracts_2010.geojson") tract
tract.explore()
2.3 Spatially join the sales data and neighborhoods/census tracts.
Perform a spatial join, such that each sale has an associated neighborhood and census tract.
Note: After performing the first spatial join, you will need to use the drop() function to remove the index_right column; otherwise an error will be raised on the second spatial join about duplicate columns.
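Before running the joins below, it is also worth a quick sanity check that all three layers share a coordinate reference system (a minimal sketch; the joins below reproject the polygon layers to the sales CRS if needed):

# Confirm the point layer and both polygon layers use the same CRS;
# if they differ, reproject the polygons to match the sales points.
print(sales.crs)
print(tract.crs)
print(neigh.crs)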
# Join each sale to its census tract, dropping index_right before the second join
join = gpd.sjoin(
    sales, tract.to_crs(sales.crs), predicate="within"
).drop(columns=["index_right"])

# Join the result to the neighborhoods
sales_full = gpd.sjoin(
    join, neigh.to_crs(join.crs), predicate="within"
).drop(columns=["index_right"])

sales_full.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | zip_code | geometry | OBJECTID | STATEFP10 | COUNTYFP10 | TRACTCE10 | GEOID10 | NAME10 | NAMELSAD10 | MTFCC10 | FUNCSTAT10 | ALAND10 | AWATER10 | INTPTLAT10 | INTPTLON10 | LOGRECNO | name | listname | mapname | shape_leng | shape_area | cartodb_id | created_at | updated_at | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21000 | 1164.0 | 1425.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 7 | 19120 | POINT (-75.13389 40.03928) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
586 | 1 | 1188.0 | 1575.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2.0 | 3 | 19120 | POINT (-75.13367 40.04039) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
1041 | 137000 | 1176.0 | 1234.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13277 40.04028) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
1567 | 259000 | 1310.0 | 1754.0 | 1.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13173 40.03868) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
2000 | 220000 | 1272.0 | 1445.0 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | 19120 | POINT (-75.13245 40.03959) | 350 | 42 | 101 | 027500 | 42101027500 | 275 | Census Tract 275 | G5020 | S | 606825 | 0 | +40.0400497 | -075.1322707 | 10588 | OLNEY | Olney | Olney | 32197.205271 | 5.030840e+07 | 8 | 2013-03-19 17:41:50.508000+00:00 | 2013-03-19 17:41:50.743000+00:00 |
2.4 Train a Random Forest on the sales data
In this step, you should follow the steps outlined in lecture to preprocess and train your model. We’ll extend our analysis to do a hyperparameter grid search to find the best model configuration. As you train your model, follow these steps:
Preprocessing Requirements
- Trim the sales data to sales with prices between $3,000 and $1 million.
- Set up a pipeline that includes both numerical columns and categorical columns.
- Include a one-hot encoded variable for the neighborhood of the sale, instead of the ZIP code. We don’t want to include multiple location-based categories, since they encode much of the same information.
Training requirements
- Use a 70/30 training/test split and predict the log of the sale price.
- Use GridSearchCV to perform a k-fold cross-validation that optimizes at least 2 hyperparameters of the RandomForestRegressor.
- After fitting your model and finding the optimal hyperparameters, evaluate the score (R-squared) on the test set (the original 30% sample withheld).
Note: You don’t need to include additional features (such as spatial distance features) or perform any extra feature engineering beyond what is required above to receive full credit. Of course, you are always welcome to experiment!
# Trim to sales priced between $3,000 and $1 million
trim_sales = sales_full.loc[
    (sales_full["sale_price"] > 3000) & (sales_full["sale_price"] < 1000000)
].copy()

# Use the neighborhood name ("mapname") as the neighborhood feature
trim_sales.rename(columns={"mapname": "neighborhood"}, inplace=True)
# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "neighborhood",
    "log(sale_price)",
]

# Add the log of the sale price, then trim to these columns and remove NaNs
trim_sales["log(sale_price)"] = np.log(trim_sales["sale_price"])
saleDF = trim_sales[cols].dropna()

saleDF.head()
sale_price | total_livable_area | total_area | garage_spaces | fireplaces | number_of_bathrooms | number_of_bedrooms | number_stories | exterior_condition | neighborhood | log(sale_price) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21000 | 1164.0 | 1425.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 7 | Olney | 9.952278 |
1041 | 137000 | 1176.0 | 1234.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | Olney | 11.827736 |
1567 | 259000 | 1310.0 | 1754.0 | 1.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | Olney | 12.464583 |
2000 | 220000 | 1272.0 | 1445.0 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 4 | Olney | 12.301383 |
2618 | 200000 | 1304.0 | 1360.0 | 1.0 | 0.0 | 1.0 | 3.0 | 2.0 | 4 | Olney | 12.206073 |
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from itertools import compress
# Flag which of the feature columns are numeric
numCols = []
txtCols = []
for col in cols:
    numCols.append(is_numeric_dtype(saleDF[col]))
    txtCols.append(not numCols[-1])

# Keep the numeric feature names, dropping sale_price and log(sale_price)
numCols = list(compress(cols, numCols))[1:-1]
catCols = ["exterior_condition", "neighborhood"]

# Scale the numeric features and one-hot encode the categorical ones
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numCols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), catCols),
    ]
)
numCols
['total_livable_area',
'total_area',
'garage_spaces',
'fireplaces',
'number_of_bathrooms',
'number_of_bedrooms',
'number_stories']
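As a cross-check (a sketch, not part of the required pipeline; num_alt is a hypothetical name), pandas’ select_dtypes can recover the same numeric feature list more directly, and the result should match numCols above:

# Alternative: let pandas identify the numeric feature columns,
# excluding the two target columns (the categoricals are non-numeric and drop out)
num_alt = (
    saleDF.drop(columns=["sale_price", "log(sale_price)"])
    .select_dtypes("number")
    .columns
    .tolist()
)
print(num_alt)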
# Split the data 70/30
train_set, test_set = train_test_split(saleDF, test_size=0.3, random_state=42)

# the target labels: log of sale price
y_train = train_set["log(sale_price)"]
y_test = test_set["log(sale_price)"]

pipe = make_pipeline(
    transformer, RandomForestRegressor(random_state=42)
)

model_step = "randomforestregressor"
param_grid = {
    f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
    f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1)

# Run the search
grid.fit(train_set, y_train)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
GridSearchCV(cv=3, estimator=Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('num', StandardScaler(), ['total_livable_area', 'total_area', 'garage_spaces', 'fireplaces', 'number_of_bathrooms', 'number_of_bedrooms', 'number_stories']), ('cat', OneHotEncoder(handle_unknown='ignore'), ['exterior_condition', 'neighborhood'])])), ('randomforestregressor', RandomForestRegressor(random_state=42))]), param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13, 21, 33, 51], 'randomforestregressor__n_estimators': [5, 10, 15, 20, 30, 50, 100, 200]}, verbose=1)
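The winning configuration can also be read off directly from the fitted grid (a quick sketch using the standard best_params_ and best_score_ attributes of GridSearchCV):

# The best hyperparameter combination and its mean cross-validated R^2
print(grid.best_params_)
print("Best mean CV R^2:", grid.best_score_)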
print("Getting 3-fold cross validation results.")
scores = cross_val_score(
    pipe,
    train_set,
    y_train,
    cv=3,
)
print("Completed")

# Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Standard deviation = ", scores.std())
Getting 3-fold cross validation results.
Completed
R^2 scores = [0.56575893 0.51311316 0.53833274]
Scores mean = 0.5390682769418876
Standard deviation = 0.021498838781376265
print(f"The optimal model of our 64 candiates had a max depth of {grid.best_estimator_.steps[1][1].max_depth} and an n_estimator count of {grid.best_estimator_.steps[1][1].n_estimators}\nleading to an accuracy score of {grid.best_estimator_.score(test_set,y_test)} or {grid.best_estimator_.score(test_set,y_test)*100:.2f}% on the test set.")
print("\n\nCV Score Report ...\n")
print("R^2 scores:\n\tFold 1 = ", scores[0],"\n\tFold 2 = ", scores[1],"\n\tFold 3 = ", scores[2],)
print("Scores mean = ", scores.mean())
print("Score standard deviation = ", scores.std())
The optimal model of our 64 candidates had a max depth of 51 and an n_estimators count of 200
leading to an R^2 score of 0.5573963547661122 or 55.74% on the test set.
CV Score Report ...
R^2 scores:
Fold 1 = 0.5657589318132974
Fold 2 = 0.5131131592492879
Fold 3 = 0.5383327397630778
Scores mean = 0.5390682769418876
Score standard deviation = 0.021498838781376265
# Note: the standalone `transformer` object is never fitted, because GridSearchCV clones
# the pipeline before fitting. Pull the fitted ColumnTransformer out of the best estimator
# instead so that get_feature_names_out() works.
random_forest = grid.best_estimator_.named_steps["randomforestregressor"]
fitted_transformer = grid.best_estimator_.named_steps["columntransformer"]
features = numCols + list(fitted_transformer.named_transformers_["cat"].get_feature_names_out())

importance = pd.DataFrame(
    {"Feature": features, "Importance": random_forest.feature_importances_}
)
importance = importance.sort_values("Importance", ascending=False).iloc[:30]

importance.head(n=20)

importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)
2.5 Calculate the percent error of your model predictions for each sale in the test set
Fit your best model and use it to make predictions on the test set.
Note: This should be the percent error in terms of sale price. You’ll need to convert if your model predicted the log of sales price!
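Concretely, since the model predicts log(sale_price), the prediction is converted back to dollars before computing the percent error. A minimal sketch of the calculation (pct_error is a hypothetical name; the cells below build the same quantity in two steps):

# percent error = 100 * (predicted price - actual price) / actual price,
# where predicted price = exp(predicted log price)
pct_error = 100 * (np.exp(grid.best_estimator_.predict(test_set)) - test_set["sale_price"]) / test_set["sale_price"]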
# Predict log(sale_price) for the test set
predictions = grid.best_estimator_.predict(test_set)

# Error in log space
errors = predictions - y_test

# Error in dollars: convert predictions and actuals back out of log space
dollar_errors = np.exp(predictions) - np.exp(y_test)

print(errors[0])
print(dollar_errors[0])
print()
print(test_set[0:1]["log(sale_price)"])
print()
print(predictions[0])
1.5150772918914885
74545.2282497846
16061 11.350407
Name: log(sale_price), dtype: float64
12.527490609299118
2.6 Make a data frame with percent errors and census tract info for each sale in the test set
Create a data frame that has the property geometries, census tract data, and percent errors for all of the sales in the test set.
Notes
- When using the “train_test_split()” function, the index of the test data frame includes the labels from the original sales data frame
- You can use this index to slice out the test data from the original sales data frame, which should include the census tract info and geometries
- Add a new column to this data frame holding the percent error data
- Make sure to use the percent error and not the absolute percent error
# Use the test set index to slice the test sales out of the sales data frame
test_saleDF = saleDF.loc[test_set.index]

# Attach the predictions and errors (log space, dollars, and percent)
test_saleDF["log(predictions)"] = predictions
test_saleDF["predictions"] = np.exp(predictions)
test_saleDF["error"] = dollar_errors
test_saleDF["percent_error"] = dollar_errors / test_saleDF["sale_price"] * 100

# Merge the census tract info and geometries back in
test_saleDF = sales_full.merge(test_saleDF, how="right")

test_saleDF[["sale_price", "log(sale_price)", "log(predictions)", "predictions", "error", "percent_error"]].tail()
sale_price | log(sale_price) | log(predictions) | predictions | error | percent_error | |
---|---|---|---|---|---|---|
5352 | 181000 | 12.106252 | 11.345868 | 84615.078520 | -96384.921480 | -53.251338 |
5353 | 300000 | 12.611538 | 12.959598 | 424895.325384 | 124895.325384 | 41.631775 |
5354 | 265000 | 12.487485 | 12.673232 | 319091.035201 | 54091.035201 | 20.411711 |
5355 | 380000 | 12.847927 | 12.646338 | 310624.010585 | -69375.989415 | -18.256839 |
5356 | 134000 | 11.805595 | 12.530408 | 276622.279170 | 142622.279170 | 106.434537 |
2.8 Plot a map of the median percent error by census tract
- You’ll want to group your data frame of test sales by the GEOID10 column and take the median of your percent error column.
- Merge the census tract geometries back in and use geopandas to plot.
# Median prediction, dollar error, and percent error by census tract
tractPE = tract.merge(
    test_saleDF.groupby("GEOID10").agg(
        {"predictions": "median", "error": "median", "percent_error": "median"}
    ),
    on="GEOID10",
)
tractPE.columns
Index(['OBJECTID', 'STATEFP10', 'COUNTYFP10', 'TRACTCE10', 'GEOID10', 'NAME10',
'NAMELSAD10', 'MTFCC10', 'FUNCSTAT10', 'ALAND10', 'AWATER10',
'INTPTLAT10', 'INTPTLON10', 'LOGRECNO', 'geometry', 'predictions',
'error', 'percent_error'],
dtype='object')
={"NAME10":"Census Tract #","predictions":"Predicted Sale Price ($)","error":"Prediction Error ($)","percent_error":"Percent Error"}).explore(
tractPE.rename(columns= "Percent Error",
column = "viridis",
cmap = "CartoDB positron",
tiles = ["Census Tract #","Predicted Sale Price ($)","Prediction Error ($)","Percent Error"]
tooltip )
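For a report or static document, the same map can be drawn with geopandas’ built-in plot method (a sketch; the figure size and styling here are arbitrary choices, and plt is the pyplot module imported at the top of the notebook):

# Static choropleth of median percent error by census tract
fig, ax = plt.subplots(figsize=(10, 10))
tractPE.plot(column="percent_error", cmap="viridis", legend=True, ax=ax)
ax.set_axis_off()
ax.set_title("Median percent error by census tract")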
2.9 Compare the percent errors in Qualifying Census Tracts and other tracts
Qualifying Census Tracts are a poverty designation that HUD uses to allocate housing tax credits.
- I’ve included a list of the census tract names that qualify in Philadelphia below.
- Add a new column to your data frame of test-set sales that is True/False depending on whether the tract is a QCT.
- Then, group by this new column and calculate the median percent error.
You should find that the algorithm’s accuracy is significantly worse in these low-income, qualifying census tracts.
qct = [
    '5',
    '20',
'22',
'28.01',
'30.01',
'30.02',
'31',
'32',
'33',
'36',
'37.01',
'37.02',
'39.01',
'41.01',
'41.02',
'56',
'60',
'61',
'62',
'63',
'64',
'65',
'66',
'67',
'69',
'70',
'71.01',
'71.02',
'72',
'73',
'74',
'77',
'78',
'80',
'81.01',
'81.02',
'82',
'83.01',
'83.02',
'84',
'85',
'86.01',
'86.02',
'87.01',
'87.02',
'88.01',
'88.02',
'90',
'91',
'92',
'93',
'94',
'95',
'96',
'98.01',
'100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'109',
'110',
'111',
'112',
'113',
'119',
'121',
'122.01',
'122.03',
'131',
'132',
'137',
'138',
'139',
'140',
'141',
'144',
'145',
'146',
'147',
'148',
'149',
'151.01',
'151.02',
'152',
'153',
'156',
'157',
'161',
'162',
'163',
'164',
'165',
'167.01',
'167.02',
'168',
'169.01',
'169.02',
'170',
'171',
'172.01',
'172.02',
'173',
'174',
'175',
'176.01',
'176.02',
'177.01',
'177.02',
'178',
'179',
'180.02',
'188',
'190',
'191',
'192',
'195.01',
'195.02',
'197',
'198',
'199',
'200',
'201.01',
'201.02',
'202',
'203',
'204',
'205',
'206',
'208',
'239',
'240',
'241',
'242',
'243',
'244',
'245',
'246',
'247',
'249',
'252',
'253',
'265',
'267',
'268',
'271',
'274.01',
'274.02',
'275',
'276',
'277',
'278',
'279.01',
'279.02',
'280',
'281',
'282',
'283',
'284',
'285',
'286',
'287',
'288',
'289.01',
'289.02',
'290',
'291',
'293',
'294',
'298',
'299',
'300',
'301',
'302',
'305.01',
'305.02',
'309',
'311.01',
'312',
'313',
'314.01',
'314.02',
'316',
'318',
'319',
'321',
'325',
'329',
'330',
'337.01',
'345.01',
'357.01',
'376',
'377',
'380',
'381',
'382',
'383',
'389',
'390']
"is_qct"] = tractPE["NAME10"].isin(qct)
tractPE[= tractPE.groupby("is_qct",as_index=False).agg({"predictions": "median","error": ["mean","median"],"percent_error" : ["mean","median"]})
qctGroup = qctGroup[1:]["percent_error"]["median"].squeeze()
qct_t = qctGroup[:1]["percent_error"]["median"].squeeze()
qct_f qctGroup
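To make the comparison explicit (a small sketch; it simply prints the two medians computed above rather than relying on the grouped table):

# Median percent error inside vs. outside Qualifying Census Tracts
print("Median percent error, QCT tracts:", qct_t)
print("Median percent error, non-QCT tracts:", qct_f)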