import pandas as pd
import numpy as np

import altair as alt
import seaborn as sns

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)

from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.svm import SVR

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler
)
# Feature selection
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, median_absolute_error
from mlxtend.feature_selection import SequentialFeatureSelector

from xgboost import XGBRegressor


# Load the raw listings export; the path is relative to the working directory.
listing_df = pd.read_csv('listings.csv')


# Print each column's non-null count and dtype to see which fields need cleaning.
listing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18553 entries, 0 to 18552
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            18553 non-null  int64  
 1   listing_url                                   18553 non-null  object 
 2   scrape_id                                     18553 non-null  int64  
 3   last_scraped                                  18553 non-null  object 
 4   name                                          18552 non-null  object 
 5   description                                   17865 non-null  object 
 6   neighborhood_overview                         11581 non-null  object 
 7   picture_url                                   18553 non-null  object 
 8   host_id                                       18553 non-null  int64  
 9   host_url                                      18553 non-null  object 
 10  host_name                                     18544 non-null  object 
 11  host_since                                    18544 non-null  object 
 12  host_location                                 18534 non-null  object 
 13  host_about                                    10335 non-null  object 
 14  host_response_time                            10167 non-null  object 
 15  host_response_rate                            10167 non-null  object 
 16  host_acceptance_rate                          12392 non-null  object 
 17  host_is_superhost                             18544 non-null  object 
 18  host_thumbnail_url                            18544 non-null  object 
 19  host_picture_url                              18544 non-null  object 
 20  host_neighbourhood                            14409 non-null  object 
 21  host_listings_count                           18544 non-null  float64
 22  host_total_listings_count                     18544 non-null  float64
 23  host_verifications                            18553 non-null  object 
 24  host_has_profile_pic                          18544 non-null  object 
 25  host_identity_verified                        18544 non-null  object 
 26  neighbourhood                                 11581 non-null  object 
 27  neighbourhood_cleansed                        18553 non-null  object 
 28  neighbourhood_group_cleansed                  0 non-null      float64
 29  latitude                                      18553 non-null  float64
 30  longitude                                     18553 non-null  float64
 31  property_type                                 18553 non-null  object 
 32  room_type                                     18553 non-null  object 
 33  accommodates                                  18553 non-null  int64  
 34  bathrooms                                     0 non-null      float64
 35  bathrooms_text                                18531 non-null  object 
 36  bedrooms                                      17156 non-null  float64
 37  beds                                          18325 non-null  float64
 38  amenities                                     18553 non-null  object 
 39  price                                         18553 non-null  object 
 40  minimum_nights                                18553 non-null  int64  
 41  maximum_nights                                18553 non-null  int64  
 42  minimum_minimum_nights                        18553 non-null  int64  
 43  maximum_minimum_nights                        18553 non-null  int64  
 44  minimum_maximum_nights                        18553 non-null  int64  
 45  maximum_maximum_nights                        18553 non-null  int64  
 46  minimum_nights_avg_ntm                        18553 non-null  float64
 47  maximum_nights_avg_ntm                        18553 non-null  float64
 48  calendar_updated                              0 non-null      float64
 49  has_availability                              18553 non-null  object 
 50  availability_30                               18553 non-null  int64  
 51  availability_60                               18553 non-null  int64  
 52  availability_90                               18553 non-null  int64  
 53  availability_365                              18553 non-null  int64  
 54  calendar_last_scraped                         18553 non-null  object 
 55  number_of_reviews                             18553 non-null  int64  
 56  number_of_reviews_ltm                         18553 non-null  int64  
 57  number_of_reviews_l30d                        18553 non-null  int64  
 58  first_review                                  14481 non-null  object 
 59  last_review                                   14481 non-null  object 
 60  review_scores_rating                          14223 non-null  float64
 61  review_scores_accuracy                        14190 non-null  float64
 62  review_scores_cleanliness                     14190 non-null  float64
 63  review_scores_checkin                         14188 non-null  float64
 64  review_scores_communication                   14192 non-null  float64
 65  review_scores_location                        14185 non-null  float64
 66  review_scores_value                           14186 non-null  float64
 67  license                                       0 non-null      float64
 68  instant_bookable                              18553 non-null  object 
 69  calculated_host_listings_count                18553 non-null  int64  
 70  calculated_host_listings_count_entire_homes   18553 non-null  int64  
 71  calculated_host_listings_count_private_rooms  18553 non-null  int64  
 72  calculated_host_listings_count_shared_rooms   18553 non-null  int64  
 73  reviews_per_month                             14481 non-null  float64
dtypes: float64(20), int64(21), object(33)
memory usage: 10.5+ MB


# Columns carried forward for modelling: location, property details, stay
# limits, review scores, host listing counts, plus the `price` target.
selected_columns = [
    'id',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms_text',
    'bedrooms',
    'beds',
    'amenities',
    'price',
    'minimum_nights',
    'maximum_nights',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'reviews_per_month',
]
new_list_df = listing_df.loc[:, selected_columns]

# Re-check non-null counts and dtypes on the reduced frame.
new_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18553 entries, 0 to 18552
Data columns (total 33 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            18553 non-null  int64  
 1   neighbourhood_cleansed                        18553 non-null  object 
 2   latitude                                      18553 non-null  float64
 3   longitude                                     18553 non-null  float64
 4   property_type                                 18553 non-null  object 
 5   room_type                                     18553 non-null  object 
 6   accommodates                                  18553 non-null  int64  
 7   bathrooms_text                                18531 non-null  object 
 8   bedrooms                                      17156 non-null  float64
 9   beds                                          18325 non-null  float64
 10  amenities                                     18553 non-null  object 
 11  price                                         18553 non-null  object 
 12  minimum_nights                                18553 non-null  int64  
 13  maximum_nights                                18553 non-null  int64  
 14  minimum_minimum_nights                        18553 non-null  int64  
 15  maximum_minimum_nights                        18553 non-null  int64  
 16  minimum_maximum_nights                        18553 non-null  int64  
 17  maximum_maximum_nights                        18553 non-null  int64  
 18  minimum_nights_avg_ntm                        18553 non-null  float64
 19  maximum_nights_avg_ntm                        18553 non-null  float64
 20  review_scores_rating                          14223 non-null  float64
 21  review_scores_accuracy                        14190 non-null  float64
 22  review_scores_cleanliness                     14190 non-null  float64
 23  review_scores_checkin                         14188 non-null  float64
 24  review_scores_communication                   14192 non-null  float64
 25  review_scores_location                        14185 non-null  float64
 26  review_scores_value                           14186 non-null  float64
 27  instant_bookable                              18553 non-null  object 
 28  calculated_host_listings_count                18553 non-null  int64  
 29  calculated_host_listings_count_entire_homes   18553 non-null  int64  
 30  calculated_host_listings_count_private_rooms  18553 non-null  int64  
 31  calculated_host_listings_count_shared_rooms   18553 non-null  int64  
 32  reviews_per_month                             14481 non-null  float64
dtypes: float64(14), int64(12), object(7)
memory usage: 4.7+ MB


# Convert the `price` strings to floats by stripping the '$' sign and the ','
# separators. The builtin `float` replaces `np.float`, an alias that was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
# `assign` returns a new frame instead of writing into the `.loc` selection.
new_list_df = new_list_df.assign(
    price=new_list_df['price']
    .apply(lambda raw: raw.replace('$', '').replace(',', ''))
    .astype(float)
)
# Fixed-seed sample of 2000 listings used for the exploratory charts.
sample = new_list_df.sample(n=2000, random_state=2021).dropna(subset=['price'])


# Histogram of listing prices in the sample, binned into at most 50 bins.
price_axis = alt.X('price', bin=alt.Bin(maxbins=50))
alt.Chart(sample).mark_bar().encode(x=price_axis, y='count()')


# Pairwise Spearman correlations between the numeric columns (without `id`),
# reshaped to long form: one row per (level_0, level_1) pair with its `corr`.
numeric_sample = sample.select_dtypes(include='number').drop(columns=['id'])
corr_df = (
    numeric_sample
    .corr(method='spearman')
    .stack()
    .reset_index(name='corr')
)

# Flag every cell that lies in the `price` row or the `price` column.
corr_df['highlight'] = (corr_df['level_0'] == 'price') | (corr_df['level_1'] == 'price')

# Colours for the overlay layer: fully transparent for False, black for True.
range_ = ['#ffffff00', 'black']

# Base layer: the correlation heatmap itself.
heatmap = alt.Chart(corr_df).mark_rect().encode(
    alt.X('level_0', title=''),
    alt.Y('level_1', title=''),
    color=alt.Color(
        'corr',
        scale=alt.Scale(domain=(-1, 1), scheme='blueorange'),
        title='Correlation',
    ),
).properties(height=600, width=600)

# Overlay layer: a faint tint over the cells flagged above.
price_overlay = alt.Chart(corr_df).mark_rect(opacity=0.1).encode(
    alt.X('level_0', title=''),
    alt.Y('level_1', title=''),
    color=alt.Color(
        'highlight',
        scale=alt.Scale(scheme='set1', range=range_),
        title='',
        legend=None,
    ),
).properties(height=600, width=600)

heatmap + price_overlay


# Boolean amenity flags: each new column is True when its label occurs as a
# substring of the listing's raw `amenities` text.
amenity_labels = {
    'wifi': 'Wifi',
    'parking': 'Free parking on premises',
    'heating': 'Heating',
    'ac': 'Air conditioning',
    'fireplace': 'Indoor fireplace',
}
new_list_df = new_list_df.assign(**{
    # `label=label` binds the current label at definition time.
    flag_name: new_list_df['amenities'].apply(lambda text, label=label: label in text)
    for flag_name, label in amenity_labels.items()
})

# Numeric columns: imputed and scaled by the numeric pipeline.
num_feats = [
    'latitude', 'longitude', 'accommodates',
    'bedrooms', 'beds', 'minimum_nights',
    'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
]

# Nominal categorical columns: one-hot encoded.
ohe_feats = [
    'property_type', 'neighbourhood_cleansed',
]

# Ordered categorical columns: encoded with the explicit orderings below.
ord_feats = [
    'room_type', 'bathrooms_text',
]

# Category order used to integer-encode `room_type`.
room_ordinal = [
    'Entire home/apt',
    'Hotel room',
    'Private room',
    'Shared room',
]

# Category order used to integer-encode `bathrooms_text`, ascending by
# bathroom count, with the shared variant listed before the non-shared one
# from 1 bath upward. The original list placed '4.5 shared baths' before
# '4 shared baths' / '4 baths' and '6.5 baths' before '6 baths', which made
# the encoding non-monotonic in the number of bathrooms.
# NOTE(review): a `bathrooms_text` value missing from this list would not be
# encodable with an explicit category list - confirm it covers the data.
bathroom_ordinal = [
    '0 baths',
    '0 shared baths',
    'Shared half-bath',
    'Half-bath',
    'Private half-bath',
    '1 shared bath',
    '1 bath',
    '1 private bath',
    '1.5 shared baths',
    '1.5 baths',
    '2 shared baths',
    '2 baths',
    '2.5 shared baths',
    '2.5 baths',
    '3 shared baths',
    '3 baths',
    '3.5 shared baths',
    '3.5 baths',
    '4 shared baths',
    '4 baths',
    '4.5 shared baths',
    '4.5 baths',
    '5 baths',
    '5.5 baths',
    '6 baths',
    '6.5 baths',
    '7.5 baths',
    '8 baths',
]

# Two-valued columns: `instant_bookable` plus the derived amenity flags.
binary_feats = [
    'instant_bookable', 'wifi', 'parking', 'heating', 'ac', 'fireplace',
]

# Columns excluded from the model inputs.
drop_feats = [
    'id', 'amenities',
]


# Hold out 20% of the listings as the test set; the seed fixes the split.
train_df, test_df = train_test_split(
    new_list_df,
    test_size=0.2,
    random_state=2021,
)
train_df.head()


# Separate the `price` target from the feature columns in each partition.
X_train = train_df.drop(columns=['price'])
y_train = train_df['price']
X_test = test_df.drop(columns=['price'])
y_test = test_df['price']


def _ordinal_pipe(ordered_categories):
    """Fill gaps with the most frequent value, then integer-encode in the given order."""
    return make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OrdinalEncoder(categories=[ordered_categories], dtype=int),
    )


# Numeric columns: median imputation followed by standardisation.
num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Nominal columns: missing values become their own 'missing' category, and
# categories not seen during fitting are ignored at transform time.
ohe_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Two-valued columns: most-frequent imputation, then a single indicator column
# per binary feature.
bin_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary'),
)

# Ordered categoricals, each encoded with its own explicit category order.
room_pipe = _ordinal_pipe(room_ordinal)
bathroom_pipe = _ordinal_pipe(bathroom_ordinal)

# Route each group of columns to its transformer.
preproc = ColumnTransformer(transformers=[
    ('drop', 'drop', drop_feats),
    ('numeric', num_pipe, num_feats),
    ('onehot', ohe_pipe, ohe_feats),
    ('binary', bin_pipe, binary_feats),
    ('room', room_pipe, ['room_type']),
    ('bathroom', bathroom_pipe, ['bathrooms_text']),
])

def mape(true, pred):
    """Return the mean absolute percentage error of ``pred`` against ``true``.

    Both inputs are converted to float NumPy arrays first, so plain lists and
    pandas objects are accepted and values are compared by position rather
    than by index label.

    Args:
        true: Array-like of observed target values.
        pred: Array-like of predicted values, the same length as ``true``.

    Returns:
        ``100 * mean(|pred - true| / |true|)``. Zeros in ``true`` are not
        special-cased, so they produce ``inf`` or ``nan`` in the result.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return 100. * np.mean(np.abs((pred - true) / true))

# MAPE is an error measure, so lower is better. greater_is_better=False makes
# scikit-learn negate it so that "higher score wins" still holds during model
# selection; the default (True) would have favoured the worst model.
mape_scorer = make_scorer(mape, greater_is_better=False)


# Preprocessing -> cross-validated recursive feature elimination driven by a
# Ridge model -> gradient-boosted regressor. The step names are the lowercase
# class names, which the `<step>__<param>` tuning keys rely on.
pipeline = Pipeline(steps=[
    ('columntransformer', preproc),
    ('rfecv', RFECV(Ridge())),
    ('xgbregressor', XGBRegressor()),
])
# Baseline cross-validation with default hyperparameters.
cv_scores = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring='neg_median_absolute_error',
    return_train_score=True,
)
# One column per fold; rows are fit/score times and test/train scores.
pd.DataFrame(cv_scores).T


# Hyperparameter candidates, keyed as `<pipeline step>__<parameter>`.
param_space = {
    "xgbregressor__learning_rate": np.linspace(0.001, 0.5, 20),
    "xgbregressor__gamma": [0, 1, 5],
    "xgbregressor__colsample_bytree": np.linspace(0.3, 0.8, 20),
    "xgbregressor__subsample": np.linspace(0.8, 1, 5),
    # `step` >= 1 is a number of features to remove per iteration and must be
    # an integer. np.linspace(1, 6, 5) produced floats (2.25, 3.5, 4.75),
    # which recent scikit-learn parameter validation rejects.
    "rfecv__step": list(range(1, 7)),
}

# Randomised search over 10 sampled configurations, scored by negated median
# absolute error. random_state makes the sampled candidates reproducible,
# matching the fixed seed used elsewhere in this analysis.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_space,
    n_iter=10,
    n_jobs=-1,
    random_state=2021,
    return_train_score=True,
    scoring='neg_median_absolute_error',
)
search.fit(X_train, y_train)

RandomizedSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('drop',
                                                                               'drop',
                                                                               ['id',
                                                                                'amenities']),
                                                                              ('numeric',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['latitude',
                                                                                'longitude',
                                                                                'accommodates',
                                                                                'bedrooms',
                                                                                'beds',
                                                                                'minimum_nights',
                                                                                'maximum_nights',
                                                                                'minimum_m...
                                        'xgbregressor__learning_rate': array([0.001     , 0.02726316, 0.05352632, 0.07978947, 0.10605263,
       0.13231579, 0.15857895, 0.18484211, 0.21110526, 0.23736842,
       0.26363158, 0.28989474, 0.31615789, 0.34242105, 0.36868421,
       0.39494737, 0.42121053, 0.44747368, 0.47373684, 0.5       ]),
                                        'xgbregressor__subsample': array([0.8 , 0.85, 0.9 , 0.95, 1.  ])},
                   return_train_score=True,
                   scoring='neg_median_absolute_error')


# Mean cross-validated score of the best configuration found by the search
# (a negated median absolute error, so values closer to zero are better).
search.best_score_

-28.2950008392334


# Median absolute error of the best pipeline found by the search on the
# held-out test set.
test_predictions = search.best_estimator_.predict(X_test)
median_absolute_error(y_test, test_predictions)

28.154464721679688

	0	1	2	3	4
fit_time	75.274904	69.811452	62.809099	82.908820	62.802000
score_time	0.086000	0.084006	0.082494	0.086003	0.089000
test_score	-28.426819	-28.503296	-28.497826	-27.374752	-27.879246
train_score	-23.387978	-23.565842	-23.052769	-22.015890	-21.723267

Price recommendations for Airbnb listings

Motivation

The Data

Imports

Reading in the Data

EDA

Data types and Non-null counts

Target Distribution

Correlation Matrix

Feature Engineering

Splitting the data

Model Tuning

Scoring On Test Data

Wrap up and improvements

	id	neighbourhood_cleansed	latitude	longitude	property_type	room_type	accommodates	bathrooms_text	bedrooms	beds	...	calculated_host_listings_count	calculated_host_listings_count_entire_homes	calculated_host_listings_count_private_rooms	reviews_per_month	wifi	parking	heating	ac	fireplace
11410	33878516	Willowdale East	43.75572	-79.40667	Private room in condominium	Private room	1	1 private bath	1.0	1.0	...	1	0	1	NaN	True	False	True	True	False
8818	27252861	Little Portugal	43.64199	-79.42393	Private room in condominium	Private room	1	1 private bath	1.0	1.0	...	1	0	1	NaN	True	False	True	True	False
5931	20036854	Newtonbrook East	43.78410	-79.40514	Private room in house	Private room	2	1 shared bath	1.0	0.0	...	13	1	12	0.51	True	True	True	True	False
10377	31168857	High Park North	43.65810	-79.47048	Private room in house	Private room	2	1 shared bath	1.0	1.0	...	1	0	1	0.22	True	False	True	True	False
3074	12585239	Waterfront Communities-The Island	43.64397	-79.39153	Entire condominium	Entire home/apt	4	1 bath	1.0	3.0	...	64	61	3	2.71	True	True	True	True	False