Median house prices for California districts derived from the 1990 census.
This is the dataset used in the second chapter of Aurélien Géron's recent book 'Hands-On Machine Learning with Scikit-Learn and TensorFlow'. It serves as an excellent introduction to implementing machine learning algorithms because it requires rudimentary data cleaning, has an easily understandable list of variables, and sits at an optimal size between being too toyish and too cumbersome.
The data contains information from the 1990 California census. So although it may not help you with predicting current housing prices like the Zillow Zestimate dataset, it does provide an accessible introductory dataset for teaching people about the basics of machine learning.
The data pertains to the houses found in a given California district and some summary stats about them based on the 1990 census data. Be warned: the data are not cleaned, so some preprocessing steps are required! The columns are as follows; their names are fairly self-explanatory:
longitude
latitude
housing_median_age
total_rooms
total_bedrooms
population
households
median_income
median_house_value
ocean_proximity
This data was initially featured in the following paper: Pace, R. Kelley, and Ronald Barry. "Sparse spatial autoregressions." Statistics & Probability Letters 33.3 (1997): 291-297.
and I encountered it in 'Hands-On Machine Learning with Scikit-Learn and TensorFlow' by Aurélien Géron, who wrote: "This dataset is a modified version of the California Housing dataset available from: Luís Torgo's page (University of Porto)."
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("housing.csv")
data.head()
data.info()
data.describe()
data.hist(bins=50, figsize=(20,15))
Add a new categorical feature, income_cat, so that the dataset can be split with stratified sampling on income.
This new feature will be deleted afterwards.
data['income_cat'] = np.ceil(data['median_income']/1.5)
data['income_cat'].where(data['income_cat']<5, 5, inplace=True)
plt.hist(data.income_cat)
data.income_cat.value_counts()/len(data.income_cat)
Split the data into train and test sets based on the new feature:
Method 1: using the package train_test_split without the new feature
Method 2: using the package StratifiedShuffleSplit with the new feature
from sklearn.model_selection import train_test_split
data4train,data4test = train_test_split(data, test_size=0.2, random_state=42)
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state = 42)
for train_index, test_index in split.split(data, data['income_cat']):
train_set = data.loc[train_index]
test_set = data.loc[test_index]
Compare the two split methods:
def table4income_cat(dataset,df,label):
df[label]=pd.Series(dataset['income_cat'].value_counts()/len(dataset['income_cat']))
return df
df = pd.DataFrame()
df = table4income_cat(data,df,'All_set')
df = table4income_cat(train_set,df,'train_set_Shuff')
df = table4income_cat(test_set,df,'test_set_Shuff')
df = table4income_cat(data4train,df,'train_set_split')
df = table4income_cat(data4test,df,'test_set_split')
df
The table above shows that the StratifiedShuffleSplit method works a little better: the income-category proportions in its train and test sets are closer to those of the whole dataset.
After splitting the data into train and test sets, we delete the newly added feature.
# delete the new feature
for set_ in (data, data4train, data4test, train_set, test_set):
set_.drop("income_cat",axis=1,inplace=True)
data4train.columns
housing4train = train_set.copy()
housing4train.head()
housing4train.plot(kind='scatter', x='longitude', y='latitude', alpha=0.3,
s=housing4train['population']/100, label='population', # set symbol size on population
c=housing4train['median_house_value'], # set symbol color on house value
cmap=plt.get_cmap('jet'),
colorbar=True,
figsize=(10,7))
plt.legend()
import folium
from geopy.geocoders import Nominatim
def get_latlon(address):
    # newer geopy versions require a user_agent string (the name here is arbitrary)
    geolocator = Nominatim(user_agent="housing_notebook")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude, longitude
# latitude, longitude = get_latlon("1070 E Arques Ave, Sunnyvale, CA")
# California_map = folium.Map(location=[latitude, longitude], zoom_start=12)
# for lat, lng, house_value, population in zip(data['latitude'], data['longitude'], data['median_house_value'],
# data['population']):
# labels = 'House_Value:{}, Population: {}'.format(house_value,population)
# label = folium.Popup(labels,parse_html=True)
# folium.CircleMarker(
# [lat, lng],
# radius=5,
# color='blue',
# popup=label,
# fill=True,
# #fill_color='#3186cc',
# fill_opacity=0.7,
# parse_html=False).add_to(California_map)
# California_map
Check the correlations between pairs of features:
# note: ocean_proximity is non-numeric; recent pandas versions need corr(numeric_only=True)
sns.heatmap(housing4train.corr(),annot=True)
corr_matrix = housing4train.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)
corr_matrix
df = corr_matrix.tail(1).T
df
df.sort_values(by='median_house_value',inplace=True)
df
features = list(df[abs(df['median_house_value'])>0.1].T.columns)
features
The largest correlations with the median house value are found for the median income, total_rooms, and housing median age.
from pandas.plotting import scatter_matrix
#scatter_matrix(data[features],figsize=(12,8))
Interactive plot
from plotly.graph_objs import Scatter,Layout
import plotly
import plotly.offline as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
trace0 = go.Scatter(x=data['median_income'], y=data['median_house_value'], mode = 'markers', name = 'value vs income')
#py.iplot([trace0])
The total number of rooms in a district tells little about house prices unless the population is also taken into account; a more relevant quantity is the number of people per household. Therefore, we introduce a new attribute, population_per_household.
housing4train['population_per_household'] = housing4train['population']/housing4train['households']
In addition, the number of rooms per household and the number of bedrooms per room are introduced.
housing4train['bedrooms_per_room'] = housing4train['total_bedrooms']/housing4train['total_rooms']
housing4train['rooms_per_household'] = housing4train['total_rooms']/housing4train['households']
corr_matrix = housing4train.corr()
df = pd.DataFrame(corr_matrix['median_house_value'])
df.sort_values(by='median_house_value',inplace=True)
df.drop(index='median_house_value',inplace=True)
df.plot(kind='bar')
features = list(df[abs(df['median_house_value'])>0.15].T.columns)
features
X_train = data4train.drop("median_house_value",axis=1)
y_train = data4train["median_house_value"].copy()
X_train.shape,y_train.shape
X_test = data4test.drop("median_house_value",axis=1)
y_test = data4test["median_house_value"].copy()
#missing data
def report_missing_data(dataset):
    # count and percentage of missing values per feature
    total = dataset.isnull().sum().sort_values(ascending=False)
    percent = total/len(dataset)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    missing_data.plot(kind='bar',y='Total',figsize=(10,6),fontsize=20)
    print(missing_data)
report_missing_data(X_test)
report_missing_data(X_train)
X_train = train_set.drop("median_house_value",axis=1)
y_train = train_set["median_house_value"].copy()
X_test = test_set.drop("median_house_value",axis=1)
y_test = test_set["median_house_value"].copy()
report_missing_data(X_train)
There are three common ways to take care of missing values ("NaN"): drop the affected rows, drop the whole attribute, or fill in the missing values, e.g. with the median. The median option is used here, as sketched below.
X_train['total_bedrooms'].isnull().sum()
missing_feature = pd.DataFrame(X_train.isnull().sum().sort_values(ascending=False)).index[0]
missing_feature
Only total_bedrooms contains missing values.
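For reference, here is a small sketch of the three options on total_bedrooms (kept commented out; only the third one, the median fill, is actually applied below):
# Option 1: drop the rows with missing values
# X_train.dropna(subset=['total_bedrooms'])
# Option 2: drop the whole attribute
# X_train.drop('total_bedrooms', axis=1)
# Option 3: fill the missing values with the median (used below)
# X_train['total_bedrooms'].fillna(X_train['total_bedrooms'].median())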
median=X_train[missing_feature].median()
X_train[missing_feature]=X_train[missing_feature].replace(np.nan,median)
X_train[missing_feature].isnull().sum()
median=X_test[missing_feature].median()
X_test[missing_feature]=X_test[missing_feature].replace(np.nan,median)
X_test[missing_feature].isnull().sum()
X_train['ocean_proximity'].unique()
housing_num = X_train.drop("ocean_proximity",axis=1)
num_attribs = list(housing_num)
num_attribs
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler
# Imputer was removed in newer scikit-learn (sklearn.impute.SimpleImputer replaces it)
# and is not needed here because the missing values were already filled above.
# from sklearn.preprocessing import Imputer
from sklearn.pipeline import FeatureUnion
# custom transformers below: a CategoricalEncoder (one-hot encoding of ocean_proximity)
# and a DataFrameSelector that selects columns and converts them to a NumPy array
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
class CategoricalEncoder(BaseEstimator, TransformerMixin):
def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
handle_unknown='error'):
self.encoding = encoding
self.categories = categories
self.dtype = dtype
self.handle_unknown = handle_unknown
def fit(self, X, y=None):
if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
template = ("encoding should be either 'onehot', 'onehot-dense' "
"or 'ordinal', got %s")
raise ValueError(template % self.handle_unknown)
if self.handle_unknown not in ['error', 'ignore']:
template = ("handle_unknown should be either 'error' or "
"'ignore', got %s")
raise ValueError(template % self.handle_unknown)
if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
raise ValueError("handle_unknown='ignore' is not supported for"
" encoding='ordinal'")
X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
n_samples, n_features = X.shape
self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
for i in range(n_features):
le = self._label_encoders_[i]
Xi = X[:, i]
if self.categories == 'auto':
le.fit(Xi)
else:
valid_mask = np.in1d(Xi, self.categories[i])
if not np.all(valid_mask):
if self.handle_unknown == 'error':
diff = np.unique(Xi[~valid_mask])
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
le.classes_ = np.array(np.sort(self.categories[i]))
self.categories_ = [le.classes_ for le in self._label_encoders_]
return self
def transform(self, X):
X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
n_samples, n_features = X.shape
X_int = np.zeros_like(X, dtype=np.int)
X_mask = np.ones_like(X, dtype=np.bool)
for i in range(n_features):
valid_mask = np.in1d(X[:, i], self.categories_[i])
if not np.all(valid_mask):
if self.handle_unknown == 'error':
diff = np.unique(X[~valid_mask, i])
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
X_mask[:, i] = valid_mask
X[:, i][~valid_mask] = self.categories_[i][0]
X_int[:, i] = self._label_encoders_[i].transform(X[:, i])
if self.encoding == 'ordinal':
return X_int.astype(self.dtype, copy=False)
mask = X_mask.ravel()
n_values = [cats.shape[0] for cats in self.categories_]
n_values = np.array([0] + n_values)
indices = np.cumsum(n_values)
column_indices = (X_int + indices[:-1]).ravel()[mask]
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
n_features)[mask]
data = np.ones(n_samples * n_features)[mask]
out = sparse.csc_matrix((data, (row_indices, column_indices)),
shape=(n_samples, indices[-1]),
dtype=self.dtype).tocsr()
if self.encoding == 'onehot-dense':
return out.toarray()
else:
return out
class DataFrameSelector(BaseEstimator,TransformerMixin):
def __init__(self,feature_names):
self.feature_names = feature_names
def fit(self,X,y=None):
return self
def transform(self,X):
return X[self.feature_names].values
# build pipelines
cat_attribs = ['ocean_proximity']
num_attribs = list(housing_num)
num_pipeline = Pipeline([
('selector',DataFrameSelector(num_attribs)),
('std_scaler',StandardScaler()),
])
# build categorical pipeline
cat_pipeline = Pipeline([
('selector',DataFrameSelector(cat_attribs)),
('cat_encoder',CategoricalEncoder(encoding='onehot-dense')),
])
# concatenate all the transforms using "FeatureUnion"
pipelines = FeatureUnion(transformer_list=
[
('num_pipeline',num_pipeline),
('cat_pipeline',cat_pipeline),
])
X_train_prepared = pipelines.fit_transform(X_train)
X_train_prepared.shape
type(X_train_prepared)
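As a side note, in scikit-learn 0.20 and later the DataFrameSelector/CategoricalEncoder/FeatureUnion combination above can be replaced by ColumnTransformer together with SimpleImputer and OneHotEncoder. A minimal sketch, commented out so the pipeline above remains the one actually used, assuming the same num_attribs and cat_attribs lists:
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder
# num_pipeline_alt = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('std_scaler', StandardScaler()),
# ])
# full_pipeline = ColumnTransformer([
#     ('num', num_pipeline_alt, num_attribs),
#     ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
# ])
# X_train_prepared_alt = full_pipeline.fit_transform(X_train)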
from sklearn.metrics import mean_absolute_error,mean_squared_error
def model_performace(model,X,y):
    # fit on the training set and report the training RMSE
    model.fit(X,y)
    pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y,pred))
    print("Training RMSE:",rmse)
    return model,pred
def plot_pred_true(ypred,ytrue):
df = pd.DataFrame([ypred,ytrue]).T
df.columns=['pred','true']
plt.scatter(df['pred'],df['true'])
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr_model,ypred = model_performace(lr,X_train_prepared, y_train)
plot_pred_true(ypred,y_train)
from sklearn import svm
clf = svm.SVR(kernel='poly',degree=3)
svr_model,ypred = model_performace(clf,X_train_prepared,y_train)
plot_pred_true(ypred,y_train)
# svm_rmse = 1
# for C in range(2000,100000,20000):
# for gamma in np.logspace(-8, -6, 3):
# clf = svm.SVR(kernel='rbf', degree=3, C=C, gamma=gamma).fit(X_train_prepared,y_train)
# #y_test_pred = clf.predict(X_test)
# #rmse = np.sqrt(mean_squared_error(y_test_pred,y_test))
# y_pred = clf.predict(X_train_prepared)
# rmse = np.sqrt(mean_squared_error(y_pred,y_train))
# print("C:{}, gamma:{} rmse:{}".format(C,gamma, rmse))
# if(svm_rmse > rmse):
# svm_best = clf
# svm_rmse = rmse
# print("The best score with rmse={}".format(svm_rmse))
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree_model,ypred= model_performace(tree,X_train_prepared, y_train)
plot_pred_true(ypred,y_train)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree,X_train_prepared,y_train, scoring="neg_mean_squared_error",cv=10)
tree_rmse_score=np.sqrt(-scores)
def display_scores(scores):
print("scores:",scores)
print("Mean:",scores.mean())
print("Standard Deviation:",scores.std())
display_scores(tree_rmse_score)
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
forest_model,y_pred = model_performace(forest,X_train_prepared,y_train)
plot_pred_true(y_pred,y_train)
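The same 10-fold cross-validation can be run for the random forest, which gives a fairer comparison with the decision tree than the training-set fit above; a short sketch reusing cross_val_score and display_scores:
# cross-validated RMSE for the random forest
forest_scores = cross_val_score(forest, X_train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)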
Transform the test data in the same way (using transform, not fit_transform, so the scaling statistics come from the training set):
X_test_prepared = pipelines.transform(X_test)
models = [lr_model,svr_model,tree_model,forest_model]
for model in models:
test_pred = model.predict(X_test_prepared)
plot_pred_true(test_pred,y_test)
plt.show()
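Besides the visual comparison, the test RMSE of each model can be printed; a small sketch assuming the fitted models and X_test_prepared from above:
# numeric test-set comparison of the four models
for name, model in zip(['Linear', 'SVR', 'Tree', 'Forest'], models):
    test_pred = model.predict(X_test_prepared)
    rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    print("{} test RMSE: {:.0f}".format(name, rmse))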
import joblib  # in older scikit-learn: from sklearn.externals import joblib
# joblib.dump(model, "my_model.pkl")
# my_model_load = joblib.load("my_model.pkl")
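In practice one would persist both the fitted preprocessing pipeline and the model, then reuse them on new data; a sketch with hypothetical file names, kept commented out:
# joblib.dump(pipelines, "my_pipeline.pkl")
# joblib.dump(forest_model, "my_model.pkl")
# pipeline_loaded = joblib.load("my_pipeline.pkl")
# model_loaded = joblib.load("my_model.pkl")
# model_loaded.predict(pipeline_loaded.transform(X_test))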