Predictive Modeling of Housing Prices in Philadelphia

Part 2: Modeling Philadelphia’s Housing Prices and Algorithmic Fairness

2.1 Load data from the Office of Property Assessment

Use the requests package to query the CARTO API for single-family property assessment data in Philadelphia, limited to properties whose most recent sale occurred during 2022.

Sources: - OpenDataPhilly - Metadata

import geopandas as gpd
import holoviews as hv
import hvplot.pandas
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import requests
import missingno as msno

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Show all columns
pd.options.display.max_columns = 999
# The API endpoint
carto_url = "https://phl.carto.com/api/v2/sql"

# Only pull 2022 sales for single family residential properties
where = "sale_date >= '2022-01-01' and sale_date <= '2022-12-31'"
where = where + " and category_code_description IN ('SINGLE FAMILY', 'Single Family')"
# Create the query
query = f"SELECT * FROM opa_properties_public WHERE {where}"

# Make the request (the WHERE filter is already embedded in the SQL query)
params = {"q": query, "format": "geojson"}
response = requests.get(carto_url, params=params)

# Make the GeoDataFrame
salesRaw = gpd.GeoDataFrame.from_features(response.json(), crs="EPSG:4326")
salesRaw.head()
[salesRaw.head(): the first five 2022 single-family sales, showing all of the assessment columns returned by the API, including sale_date, sale_price, market_value, total_livable_area, number_of_bedrooms, number_of_bathrooms, exterior_condition, zip_code, and geometry.]
len(salesRaw)
24456

Clean data
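
Before trimming columns and dropping missing values, it can help to check how much data is actually missing; a minimal sketch using the missingno package imported above, applied to the columns we plan to keep:

# Visualize non-null counts for the columns of interest (renders a matplotlib bar chart)
msno.bar(salesRaw[[
    "sale_price", "total_livable_area", "total_area", "garage_spaces", "fireplaces",
    "number_of_bathrooms", "number_of_bedrooms", "number_stories",
    "exterior_condition", "zip_code",
]])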

# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "zip_code",
    "geometry"
]

# Trim to these columns and remove NaNs
sales = salesRaw[cols].dropna()

# Trim zip code to only the first five digits
sales['zip_code'] = sales['zip_code'].astype(str).str.slice(0, 5)
len(sales)
23463

2.2 Load data for census tracts and neighborhoods

Load various Philadelphia-based regions that we will use in our analysis.

neigh = gpd.read_file("https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson")
tract = gpd.read_file("Census_Tracts_2010.geojson")
tract.explore()

2.3 Spatially join the sales data and neighborhoods/census tracts.

Perform a spatial join, such that each sale has an associated neighborhood and census tract.

Note: After performing the first spatial join, you will need to use the drop() function to remove the index_right column; otherwise an error will be raised on the second spatial join about duplicate columns.

# Join each sale to its census tract (reproject the tracts to the sales CRS first)
join = gpd.sjoin(
    sales, tract.to_crs(sales.crs), predicate="within"
).drop(columns=["index_right"])
# print(join.columns)

# Then join each sale to its neighborhood
sales_full = gpd.sjoin(
    join, neigh.to_crs(join.crs), predicate="within"
).drop(columns=["index_right"])
sales_full.head()
[sales_full.head(): the trimmed sales columns plus the joined census-tract fields (OBJECTID, GEOID10, NAME10, ALAND10, ...) and neighborhood fields (name, listname, mapname, shape_leng, shape_area).]

2.4 Train a Random Forest on the sales data

In this step, follow the workflow outlined in lecture to preprocess the data and train your model. We’ll extend the analysis with a hyperparameter grid search to find the best model configuration. As you train your model, follow these steps:

Preprocessing Requirements

  • Trim the sales data to sales with prices between $3,000 and $1 million.
  • Set up a pipeline that handles both numerical and categorical columns.
  • Include a one-hot encoded variable for the neighborhood of the sale, instead of the ZIP code. We don’t want to include multiple location-based categories, since they encode much of the same information.

Training requirements

  • Use a 70/30 training/test split and predict the log of the sale price.
  • Use GridSearchCV to perform a k-fold cross-validation that optimizes at least two hyperparameters of the RandomForestRegressor.
  • After fitting your model and finding the optimal hyperparameters, evaluate the score (R-squared) on the test set (the 30% sample withheld at the start).

Note: You don’t need to include additional features (such as spatial distance features) or perform any extra feature engineering beyond what is required above to receive full credit. Of course, you are always welcome to experiment!

# Trim to sales priced between $3,000 and $1 million
trim_sales = sales_full.loc[(sales_full["sale_price"] > 3000) & (sales_full["sale_price"] < 1000000)].copy()
# Use the neighborhood map name as the "neighborhood" feature
trim_sales = trim_sales.rename(columns={"mapname": "neighborhood"})

# The feature columns we want to use
cols = [
    "sale_price",
    "total_livable_area",
    "total_area",
    "garage_spaces",
    "fireplaces",
    "number_of_bathrooms",
    "number_of_bedrooms",
    "number_stories",
    "exterior_condition",
    "neighborhood",
    "log(sale_price)"
]

# Add the log of the sale price as the prediction target
trim_sales["log(sale_price)"] = np.log(trim_sales["sale_price"])

# Trim to these columns and remove NaNs
saleDF = trim_sales[cols].dropna()


saleDF.head()
sale_price total_livable_area total_area garage_spaces fireplaces number_of_bathrooms number_of_bedrooms number_stories exterior_condition neighborhood log(sale_price)
0 21000 1164.0 1425.0 1.0 0.0 1.0 3.0 2.0 7 Olney 9.952278
1041 137000 1176.0 1234.0 1.0 0.0 1.0 3.0 2.0 4 Olney 11.827736
1567 259000 1310.0 1754.0 1.0 0.0 2.0 3.0 2.0 4 Olney 12.464583
2000 220000 1272.0 1445.0 0.0 0.0 2.0 3.0 2.0 4 Olney 12.301383
2618 200000 1304.0 1360.0 1.0 0.0 1.0 3.0 2.0 4 Olney 12.206073
from pandas.api.types import is_numeric_dtype
from itertools import compress

# Flag which of our feature columns are numeric
numFlags = [is_numeric_dtype(saleDF[col]) for col in cols]

# Keep the numeric feature names, dropping sale_price (first) and log(sale_price) (last)
numCols = list(compress(cols, numFlags))[1:-1]

# The categorical features to one-hot encode
catCols = ["exterior_condition", "neighborhood"]


transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numCols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), catCols),
    ]
)
numCols
['total_livable_area',
 'total_area',
 'garage_spaces',
 'fireplaces',
 'number_of_bathrooms',
 'number_of_bedrooms',
 'number_stories']
# Split the data 70/30
train_set, test_set = train_test_split(saleDF, test_size=0.3, random_state=42)

# the target labels: log of sale price
y_train = train_set["log(sale_price)"]
y_test = test_set["log(sale_price)"]



pipe = make_pipeline(
    transformer, RandomForestRegressor(random_state=42)
                                       
)

model_step = "randomforestregressor"
param_grid = {
    f"{model_step}__n_estimators": [5, 10, 15, 20, 30, 50, 100, 200],
    f"{model_step}__max_depth": [2, 5, 7, 9, 13, 21, 33, 51],
}

# Create the grid and use 3-fold CV
grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1)

# Run the search
grid.fit(train_set, y_train)
Fitting 3 folds for each of 64 candidates, totalling 192 fits
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['total_livable_area',
                                                                          'total_area',
                                                                          'garage_spaces',
                                                                          'fireplaces',
                                                                          'number_of_bathrooms',
                                                                          'number_of_bedrooms',
                                                                          'number_stories']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['exterior_condition',
                                                                          'neighborhood'])])),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=42))]),
             param_grid={'randomforestregressor__max_depth': [2, 5, 7, 9, 13,
                                                              21, 33, 51],
                         'randomforestregressor__n_estimators': [5, 10, 15, 20,
                                                                 30, 50, 100,
                                                                 200]},
             verbose=1)
print("Getting 3-fold cross validation results.")

scores = cross_val_score(
    pipe,
    train_set,
    y_train,
    cv=3,
)

print("Completed")
# # Report
print("R^2 scores = ", scores)
print("Scores mean = ", scores.mean())
print("Standard deviation = ", scores.std())
Getting 3-fold cross validation results.
Completed
R^2 scores =  [0.56575893 0.51311316 0.53833274]
Scores mean =  0.5390682769418876
Standard deviation =  0.021498838781376265
print(f"The optimal model of our 64 candiates had a max depth of {grid.best_estimator_.steps[1][1].max_depth} and an n_estimator count of {grid.best_estimator_.steps[1][1].n_estimators}\nleading to an accuracy score of {grid.best_estimator_.score(test_set,y_test)} or {grid.best_estimator_.score(test_set,y_test)*100:.2f}% on the test set.")



print("\n\nCV Score Report ...\n")
print("R^2 scores:\n\tFold 1 = ", scores[0],"\n\tFold 2 = ", scores[1],"\n\tFold 3 = ", scores[2],)
print("Scores mean = ", scores.mean())
print("Score standard deviation = ", scores.std())
The optimal model of our 64 candidates had a max depth of 51 and an n_estimators value of 200,
leading to an R^2 score of 0.5574 (55.74%) on the test set.


CV Score Report ...

R^2 scores:
    Fold 1 =  0.5657589318132974 
    Fold 2 =  0.5131131592492879 
    Fold 3 =  0.5383327397630778
Scores mean =  0.5390682769418876
Score standard deviation =  0.021498838781376265
# The standalone `transformer` was never fitted (only the copy inside the pipeline was),
# so pull the fitted ColumnTransformer and model out of the best estimator.
random_forest = grid.best_estimator_.named_steps["randomforestregressor"]
fitted_transformer = grid.best_estimator_.named_steps["columntransformer"]

# Numeric feature names plus the one-hot encoded category names
features = numCols + list(fitted_transformer.named_transformers_["cat"].get_feature_names_out())


importance = pd.DataFrame(
    {"Feature": features, "Importance": random_forest.feature_importances_}
)

# Keep the 30 most important features
importance = importance.sort_values("Importance", ascending=False).iloc[:30]

importance.head(n=20)


importance.hvplot.barh(x="Feature", y="Importance", height=700, flip_yaxis=True)

2.5 Calculate the percent error of your model predictions for each sale in the test set

Fit your best model and use it to make predictions on the test set.

Note: This should be the percent error in terms of sale price. You’ll need to convert if your model predicted the log of sales price!

# Predict log(sale_price) for the test set with the tuned model
predictions = grid.best_estimator_.predict(test_set)

# Error in log space
errors = predictions - y_test

# Error in dollars: convert back out of log space first
price_errors = np.exp(predictions) - np.exp(y_test)

print(errors[0])
print(price_errors[0])
print()

print(test_set[0:1]["log(sale_price)"])
print()
print(predictions[0])
1.5150772918914885
74545.2282497846

16061    11.350407
Name: log(sale_price), dtype: float64

12.527490609299118
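
The note above asks for the error in terms of sale price rather than log price; as a minimal sanity-check sketch of that conversion, using the predictions and y_test arrays from the cell above (percent_error_check is just an illustrative name):

# Back-transform the log predictions and true values to dollars
predicted_price = np.exp(predictions)
actual_price = np.exp(y_test)

# Signed percent error relative to the true sale price
percent_error_check = (predicted_price - actual_price) / actual_price * 100
percent_error_check.head()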

2.6 Make a data frame with percent errors and census tract info for each sale in the test set

Create a data frame that has the property geometries, census tract data, and percent errors for all of the sales in the test set.

Notes

  • When using the train_test_split() function, the index of the test data frame retains the labels from the original sales data frame
  • You can use this index to slice out the test data from the original sales data frame, which should include the census tract info and geometries (as sketched below)
  • Add a new column to this data frame holding the percent error data
  • Make sure to use the percent error and not the absolute percent error
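A minimal sketch of the index-based slicing described in these notes, assuming the test-set index labels also exist in sales_full and reusing the price_errors series from the previous section (variable names are illustrative; the cell below builds a similar frame via a merge):

# Illustrative sketch: slice the full sales frame (tract info + geometry) by the test-set index
test_tracts = sales_full.loc[test_set.index].copy()

# Signed percent error in terms of sale price
test_tracts["percent_error"] = price_errors / test_tracts["sale_price"] * 100
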
# Slice the test sales out of saleDF using the test-set index labels
test_saleDF = saleDF.loc[test_set.index].copy()
# test_set.index

test_saleDF["log(predictions)"] = predictions
test_saleDF["predictions"] = np.exp(predictions)
test_saleDF["error"] = price_errors
test_saleDF["percent_error"] = price_errors / test_saleDF["sale_price"] * 100

# Bring back the census tract info and geometries from the full sales frame
test_saleDF = sales_full.merge(test_saleDF, how="right")


test_saleDF[["sale_price","log(sale_price)","log(predictions)","predictions","error","percent_error"]].tail()
sale_price log(sale_price) log(predictions) predictions error percent_error
5352 181000 12.106252 11.345868 84615.078520 -96384.921480 -53.251338
5353 300000 12.611538 12.959598 424895.325384 124895.325384 41.631775
5354 265000 12.487485 12.673232 319091.035201 54091.035201 20.411711
5355 380000 12.847927 12.646338 310624.010585 -69375.989415 -18.256839
5356 134000 11.805595 12.530408 276622.279170 142622.279170 106.434537

2.8 Plot a map of the median percent error by census tract

  • You’ll want to group your data frame of test sales by the GEOID10 column and take the median of your percent error column
  • Merge the census tract geometries back in and use geopandas to plot.
# Median prediction, dollar error, and percent error per tract, merged onto the tract geometries
tractPE = tract.merge(test_saleDF.groupby("GEOID10").agg({"predictions": "median", "error": "median", "percent_error": "median"}), on="GEOID10")
# sales_full.geometry
tractPE.columns
Index(['OBJECTID', 'STATEFP10', 'COUNTYFP10', 'TRACTCE10', 'GEOID10', 'NAME10',
       'NAMELSAD10', 'MTFCC10', 'FUNCSTAT10', 'ALAND10', 'AWATER10',
       'INTPTLAT10', 'INTPTLON10', 'LOGRECNO', 'geometry', 'predictions',
       'error', 'percent_error'],
      dtype='object')
tractPE.rename(columns={"NAME10":"Census Tract #","predictions":"Predicted Sale Price ($)","error":"Prediction Error ($)","percent_error":"Percent Error"}).explore(
    column = "Percent Error",
    cmap = "viridis",
    tiles = "CartoDB positron",
    tooltip = ["Census Tract #","Predicted Sale Price ($)","Prediction Error ($)","Percent Error"]
)

2.9 Compare the percent errors in Qualifying Census Tracts and other tracts

Qualifying Census Tracts (QCTs) are a poverty-based designation that HUD uses to allocate housing tax credits.

  • I’ve included a list of the census tract names that qualify in Philadelphia
  • Add a new column to your dataframe of test set sales that is True/False depending on if the tract is a QCT
  • Then, group by this new column and calculate the median percent error

You should find that the algorithm’s accuracy is significantly worse in these low-income, qualifying census tracts

qct = ['5',
 '20',
 '22',
 '28.01',
 '30.01',
 '30.02',
 '31',
 '32',
 '33',
 '36',
 '37.01',
 '37.02',
 '39.01',
 '41.01',
 '41.02',
 '56',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '69',
 '70',
 '71.01',
 '71.02',
 '72',
 '73',
 '74',
 '77',
 '78',
 '80',
 '81.01',
 '81.02',
 '82',
 '83.01',
 '83.02',
 '84',
 '85',
 '86.01',
 '86.02',
 '87.01',
 '87.02',
 '88.01',
 '88.02',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '98.01',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '119',
 '121',
 '122.01',
 '122.03',
 '131',
 '132',
 '137',
 '138',
 '139',
 '140',
 '141',
 '144',
 '145',
 '146',
 '147',
 '148',
 '149',
 '151.01',
 '151.02',
 '152',
 '153',
 '156',
 '157',
 '161',
 '162',
 '163',
 '164',
 '165',
 '167.01',
 '167.02',
 '168',
 '169.01',
 '169.02',
 '170',
 '171',
 '172.01',
 '172.02',
 '173',
 '174',
 '175',
 '176.01',
 '176.02',
 '177.01',
 '177.02',
 '178',
 '179',
 '180.02',
 '188',
 '190',
 '191',
 '192',
 '195.01',
 '195.02',
 '197',
 '198',
 '199',
 '200',
 '201.01',
 '201.02',
 '202',
 '203',
 '204',
 '205',
 '206',
 '208',
 '239',
 '240',
 '241',
 '242',
 '243',
 '244',
 '245',
 '246',
 '247',
 '249',
 '252',
 '253',
 '265',
 '267',
 '268',
 '271',
 '274.01',
 '274.02',
 '275',
 '276',
 '277',
 '278',
 '279.01',
 '279.02',
 '280',
 '281',
 '282',
 '283',
 '284',
 '285',
 '286',
 '287',
 '288',
 '289.01',
 '289.02',
 '290',
 '291',
 '293',
 '294',
 '298',
 '299',
 '300',
 '301',
 '302',
 '305.01',
 '305.02',
 '309',
 '311.01',
 '312',
 '313',
 '314.01',
 '314.02',
 '316',
 '318',
 '319',
 '321',
 '325',
 '329',
 '330',
 '337.01',
 '345.01',
 '357.01',
 '376',
 '377',
 '380',
 '381',
 '382',
 '383',
 '389',
 '390']
tractPE["is_qct"] = tractPE["NAME10"].isin(qct)
qctGroup = tractPE.groupby("is_qct",as_index=False).agg({"predictions": "median","error": ["mean","median"],"percent_error" : ["mean","median"]})
qct_t = qctGroup[1:]["percent_error"]["median"].squeeze()
qct_f = qctGroup[:1]["percent_error"]["median"].squeeze()
qctGroup
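
To make the QCT comparison explicit, a short follow-up sketch that prints the two medians computed above (qct_t for Qualifying Census Tracts, qct_f for all other tracts):

# Compare median percent error inside vs. outside Qualifying Census Tracts
print(f"Median percent error in QCTs:         {qct_t:.1f}%")
print(f"Median percent error in other tracts: {qct_f:.1f}%")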