A classic example of using tree-based regressors (a single decision tree and an AdaBoost ensemble of trees) to rank features by importance.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle

def plot_feature_importances(feature_importances, title, feature_names):
    # Normalize the importance values
    feature_importances = 100.0 * (feature_importances / max(feature_importances))

    # Sort the values in decreasing order of importance
    index_sorted = np.flipud(np.argsort(feature_importances))

    # Arrange the X ticks
    pos = np.arange(index_sorted.shape[0]) + 0.5

    # Plot the bar graph
    plt.figure()
    plt.bar(pos, feature_importances[index_sorted], align='center')
    plt.xticks(pos, feature_names[index_sorted])
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()

# Load housing data (note: load_boston was removed in scikit-learn 1.2)
housing_data = datasets.load_boston()

# Shuffle the data
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)

# Split the data 80/20 (80% for training, 20% for testing)
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Fit a decision tree regression model
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(X_train, y_train)

# Fit a decision tree regression model boosted with AdaBoost
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                 n_estimators=400, random_state=7)
ab_regressor.fit(X_train, y_train)

# Evaluate performance of the decision tree regressor
y_pred_dt = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_dt)
evs = explained_variance_score(y_test, y_pred_dt)
print("\n#### Decision Tree performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Evaluate performance of the AdaBoost regressor
y_pred_ab = ab_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_ab)
evs = explained_variance_score(y_test, y_pred_ab)
print("\n#### AdaBoost performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Plot relative feature importances for both models
plot_feature_importances(dt_regressor.feature_importances_,
                         'Decision Tree regressor', housing_data.feature_names)
plot_feature_importances(ab_regressor.feature_importances_,
                         'AdaBoost regressor', housing_data.feature_names)

Output:

#### Decision Tree performance ####
Mean squared error = 14.79
Explained variance score = 0.82

#### AdaBoost performance ####
Mean squared error = 7.64
Explained variance score = 0.91
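Since load_boston was removed in scikit-learn 1.2, and a random forest is mentioned but never actually trained in the listing above, here is a minimal sketch of the same feature-ranking idea with RandomForestRegressor on the California housing dataset. The hyperparameters (n_estimators=400, random_state=7) simply mirror the AdaBoost settings above and are assumptions, not tuned values.

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the California housing data (downloads on first use;
# modern replacement for the removed load_boston dataset)
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.2, random_state=7)

# Fit a random forest regressor (assumed hyperparameters, not tuned)
rf_regressor = RandomForestRegressor(n_estimators=400, random_state=7)
rf_regressor.fit(X_train, y_train)

# Print the features ranked by impurity-based importance
for name, score in sorted(zip(housing.feature_names,
                              rf_regressor.feature_importances_),
                          key=lambda pair: pair[1], reverse=True):
    print(name, "=", round(score, 3))

Because a random forest averages impurity-based importances across many trees, the resulting ranking is typically more stable than that of a single decision tree, though it shares the same bias toward high-cardinality features.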