Analysis in Python
Loading packages
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
Loading data and doing some cleaning
# Data manipulation code below here
# Load the first 50,000 rows of the training data.
data = pd.read_csv('train.csv', nrows=50000)

# Keep only trips whose pickup and dropoff coordinates fall strictly inside
# the bounds below and whose fare is strictly positive. This removes rows
# with extreme outlier coordinates and rows with zero or negative fares.
# The explicit .copy() detaches the filtered frame from the one read from
# disk, so the columns added to `data` further down do not raise pandas'
# SettingWithCopyWarning.
data = data.query(
    'pickup_latitude > 40.7 and pickup_latitude < 40.8 and '
    'dropoff_latitude > 40.7 and dropoff_latitude < 40.8 and '
    'pickup_longitude > -74 and pickup_longitude < -73.9 and '
    'dropoff_longitude > -74 and dropoff_longitude < -73.9 and '
    'fare_amount > 0'
).copy()

# Target: the fare amount.
y = data.fare_amount

# Predictors: the raw pickup and dropoff coordinates only.
base_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
X = data[base_features]

# Hold out a validation split, then fit the first random forest on the
# training part. random_state is fixed so the split and the fit are repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(train_X, train_y)
print("Data sample:")
## Data sample:
data.head()
##                              key  ...  passenger_count
## 2   2011-08-18 00:35:00.00000049  ...                2
## 3    2012-04-21 04:30:42.0000001  ...                1
## 4  2010-03-09 07:51:00.000000135  ...                1
## 6    2012-11-20 20:35:00.0000001  ...                1
## 7   2012-01-04 17:22:00.00000081  ...                1
## 
## [5 rows x 8 columns]
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
# Partial dependence of the first model's predictions on pickup longitude,
# computed over the validation rows.
feat_name = 'pickup_longitude'
pdp_dist = pdp.pdp_isolate(
    model=first_model,
    dataset=val_X,
    model_features=base_features,
    feature=feat_name,
)
pdp.pdp_plot(pdp_dist, feat_name)
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='pickup_longitude'>})
plt.show()
We see a roughly U-shaped plot, which suggests that being picked up near the center of the longitude range lowers predicted fares on average, because it means shorter trips (on average).
# Draw one partial dependence plot per base feature, using the first model
# and the validation rows.
for feat_name in base_features:
    pdp_dist = pdp.pdp_isolate(
        model=first_model,
        dataset=val_X,
        model_features=base_features,
        feature=feat_name,
    )
    pdp.pdp_plot(pdp_dist, feat_name)
    plt.show()
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='pickup_longitude'>})
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='pickup_latitude'>})
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='dropoff_longitude'>})
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='dropoff_latitude'>})
## 
## Traceback (most recent call last):
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\backends\backend_qt5.py", line 480, in _draw_idle
##     self.draw()
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\backends\backend_agg.py", line 407, in draw
##     self.figure.draw(self.renderer)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\artist.py", line 41, in draw_wrapper
##     return draw(artist, renderer, *args, **kwargs)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\figure.py", line 1863, in draw
##     mimage._draw_list_compositing_images(
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\image.py", line 131, in _draw_list_compositing_images
##     a.draw(renderer)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\artist.py", line 41, in draw_wrapper
##     return draw(artist, renderer, *args, **kwargs)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\cbook\deprecation.py", line 411, in wrapper
##     return func(*inner_args, **inner_kwargs)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\axes\_base.py", line 2707, in draw
##     self._update_title_position(renderer)
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\axes\_base.py", line 2636, in _update_title_position
##     if (ax.xaxis.get_ticks_position() in ['top', 'unknown']
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\axis.py", line 2210, in get_ticks_position
##     self._get_ticks_position()]
##   File "C:\Users\user\ANACON~1\lib\site-packages\matplotlib\axis.py", line 1896, in _get_ticks_position
##     minor = self.minorTicks[0]
## IndexError: list index out of range
Creating a 2D plot for the features pickup_longitude and dropoff_longitude
# Two-feature partial dependence for the first model: pickup longitude
# against dropoff longitude, computed over the validation rows.
fnames = ['pickup_longitude', 'dropoff_longitude']
longitudes_partial_plot = pdp.pdp_interact(
    model=first_model,
    dataset=val_X,
    model_features=base_features,
    features=fnames,
)
pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot,
                      feature_names=fnames, plot_type='contour')
## (<Figure size 750x950 with 3 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_inter_ax': <AxesSubplot:xlabel='pickup_longitude', ylabel='dropoff_longitude'>})
plt.show()
We expect the contours to run along the diagonal. We see that prices increase as we move toward the upper-right corner of the plot.
Direct measures of distance using absolute coordinate differences
# PDP for pickup_longitude from the first model, i.e. without the absolute
# difference features. It is included here so it can be compared with the
# new PDP created below.
feat_name = 'pickup_longitude'
pdp_dist_original = pdp.pdp_isolate(
    model=first_model,
    dataset=val_X,
    model_features=base_features,
    feature=feat_name,
)
pdp.pdp_plot(pdp_dist_original, feat_name)
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='pickup_longitude'>})
plt.show()
# Create new features: the absolute change in longitude and in latitude
# between pickup and dropoff. Each assignment must be its own statement;
# if it is appended to the comment line it never runs, and the column
# lookup on features_2 below fails.
data['abs_lon_change'] = (data.dropoff_longitude - data.pickup_longitude).abs()
data['abs_lat_change'] = (data.dropoff_latitude - data.pickup_latitude).abs()

# Predictors for the second model: the four raw coordinates plus the two
# absolute-change features.
features_2 = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'abs_lat_change',
    'abs_lon_change',
]
X = data[features_2]

# Split and fit with the same random_state and forest settings as the first model.
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)
second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)

# Partial dependence of the second model's predictions on pickup longitude.
feat_name = 'pickup_longitude'
pdp_dist = pdp.pdp_isolate(
    model=second_model,
    dataset=new_val_X,
    model_features=features_2,
    feature=feat_name,
)
pdp.pdp_plot(pdp_dist, feat_name)
## (<Figure size 1500x950 with 2 Axes>, {'title_ax': <AxesSubplot:>, 'pdp_ax': <AxesSubplot:xlabel='pickup_longitude'>})
plt.show()
We see that, after controlling for the absolute distance traveled, the pickup longitude has a very small impact on predictions.