import numpy as np
import matplotlib.pyplot as plt
def plot_feature_importance(importances, names):
    """Plot feature importances as a descending bar chart with a cumulative curve.

    Parameters
    ----------
    importances : array-like
        One numeric importance score per feature.
    names : array-like
        Feature labels, in the same order as ``importances``.

    The figure is drawn with pyplot and displayed via ``plt.show()``;
    nothing is returned.
    """
    plt.style.use('fivethirtyeight')

    # Coerce both inputs so the fancy indexing below also works for plain
    # lists and pandas Index objects.
    importances = np.asarray(importances)
    feat_names = np.asarray(names)

    # Feature positions ordered from largest to smallest importance.
    indices = np.argsort(importances)[::-1]
    positions = range(len(indices))

    plt.figure(figsize=(12, 8))
    plt.title("Feature importances")
    plt.bar(positions, importances[indices], color='lightblue', align="center")
    # Running total of the sorted importances, drawn as a step line.
    plt.step(positions, np.cumsum(importances[indices]), where='mid', label='Cumulative')
    plt.xticks(positions, feat_names[indices], rotation='vertical', fontsize=14)
    # One unit of padding on each side so the outer bars are not clipped.
    plt.xlim([-1, len(indices)])
    plt.show()
Visualize feature importances
Define the function plot_feature_importance
Get example data (German Credit)
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the German Credit Data
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
column_names = ['existing_checking', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings',
                'employment', 'installment_rate', 'personal_status', 'other_debtors', 'residence_since',
                'property', 'age', 'other_installment_plans', 'housing', 'existing_credits', 'job', 'people_liable',
                'telephone', 'foreign_worker', 'class']
# The file is read as space-delimited with no header row; column names are
# supplied explicitly.
data = pd.read_csv(data_url, delimiter=' ', names=column_names)

# Preprocess the data
X = data.drop('class', axis=1)
# Recode the label value 2 to 0 once, before splitting, so both splits
# carry the recoded target without masked assignment on the split Series.
y = data['class'].replace({2: 0})
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 existing_checking 1000 non-null object
1 duration 1000 non-null int64
2 credit_history 1000 non-null object
3 purpose 1000 non-null object
4 credit_amount 1000 non-null int64
5 savings 1000 non-null object
6 employment 1000 non-null object
7 installment_rate 1000 non-null int64
8 personal_status 1000 non-null object
9 other_debtors 1000 non-null object
10 residence_since 1000 non-null int64
11 property 1000 non-null object
12 age 1000 non-null int64
13 other_installment_plans 1000 non-null object
14 housing 1000 non-null object
15 existing_credits 1000 non-null int64
16 job 1000 non-null object
17 people_liable 1000 non-null int64
18 telephone 1000 non-null object
19 foreign_worker 1000 non-null object
20 class 1000 non-null int64
dtypes: int64(8), object(13)
memory usage: 164.2+ KB
from sklearn.preprocessing import StandardScaler
# Standardize features
scaler = StandardScaler()
# Fit the scaler on the training split only, then reuse the fitted scaler
# to transform the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Visualize feature importances using a Lasso regression
from sklearn.linear_model import LassoCV
# Use LassoCV for feature selection
model_lasso = LassoCV(alphas=[0.1, 1, 0.001, 0.0005]).fit(X_train_scaled, y_train)

# Visualize feature importances using the function; the absolute value of
# each fitted coefficient is used as that feature's importance score.
plot_feature_importance(np.abs(model_lasso.coef_), X_train.columns)
Visualize feature importances using a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
# Use RandomForestClassifier for feature importance
clf = RandomForestClassifier(n_estimators=10, random_state=123)
clf.fit(X_train_scaled, y_train)

# Column labels come from the unscaled training frame; the scores come
# from the fitted classifier.
names = X_train.columns
importances = clf.feature_importances_

# Call the plotting function
plot_feature_importance(importances, names)
Visualize feature importances using a LightGBM classifier
import lightgbm as lgb
# Use LightGBM classifier
clf = lgb.LGBMClassifier(n_estimators=100, random_state=123)
clf.fit(X_train_scaled, y_train)

# Get feature importances
importances = clf.feature_importances_
names = X_train.columns

# Call the plotting function
plot_feature_importance(importances, names)
Visualize feature importances using an XGBoost classifier
import xgboost as xgb
# Use XGBoost classifier
clf = xgb.XGBClassifier(n_estimators=100, random_state=123)
clf.fit(X_train_scaled, y_train)

# Get feature importances
importances = clf.feature_importances_
names = X_train.columns

# Call the plotting function
plot_feature_importance(importances, names)