Applicability of Ridge regression in regression problems

I ran into a problem related to Ridge regression.

As you know, Ridge regression is used when the feature matrix is ill-conditioned. That is exactly my case: the determinant of the correlation matrix of my features is of order $10^{-18}$, so multicollinearity is evident. The data set itself has only 8 quantitative features.

My question is: why does Ridge regression give a worse result than ordinary linear regression for every value of the regularization parameter lambda (`lmbd`)?

What leads to this result?

Author: Иван Васильев, 2018-03-13

1 answer

In my experiments, KernelRidge gave the best results:

Program output:

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 171 out of 171 | elapsed:    8.8s finished
Fitting 3 folds for each of 57 candidates, totalling 171 fits
Best score:     0.9810896320851934
Best parameters:

{'regr': KernelRidge(alpha=0.001, coef0=1, degree=3, gamma=0.1, kernel='rbf',
      kernel_params=None),
 'regr__alpha': 0.001,
 'regr__gamma': 0.1,
 'regr__kernel': 'rbf',
 'scale': StandardScaler(copy=True, with_mean=True, with_std=True)}
Best score per estimator:

              estimator  best_score
0           KernelRidge    0.981090
1      LinearRegression    0.899578
2  MultiOutputRegressor    0.979180
3                 Ridge    0.899609



Full code:

import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns

def get_data(path):
    """Load the Excel workbook at ``path`` and split it into two frames.

    Returns a ``(features, targets)`` tuple: the columns whose names start
    with ``X`` followed by a digit, and the columns whose names start with
    ``Y`` followed by a digit.
    """
    frame = pd.read_excel(path)
    features = frame.filter(regex=r'^X\d+')
    targets = frame.filter(regex=r'^Y\d+')
    return features, targets

def plot_results(Y_test, Y_pred):
    """Plot actual versus predicted values for the ``Y1`` and ``Y2`` targets.

    Parameters
    ----------
    Y_test : pandas.DataFrame
        Held-out targets; must contain the columns ``Y1`` and ``Y2``.
    Y_pred : 2-D array
        Predictions; column 0 is paired with ``Y1`` and column 1 with ``Y2``.

    One ``seaborn.lmplot`` figure is drawn per target, with the actual and
    the predicted series distinguished by hue.
    """
    for position, target in enumerate(('Y1', 'Y2')):
        # lmplot needs long-form data: one row per (row index, series label)
        # pair with the number in a single 'value' column.  stack() followed
        # by reset_index(name='value') produces exactly the level_0 / level_1
        # / value columns that the rename below expects.
        long_frame = (Y_test[[target]]
               .assign(**{'Pred_' + target: Y_pred[:, position]})
               .stack()
               .reset_index(name='value')
               .rename(columns={'level_0':'idx', 'level_1':'Label'}))
        # NOTE(review): newer seaborn releases call this argument `height`
        # instead of `size` -- confirm against the installed version.
        sns.lmplot(data=long_frame, x='idx', y='value', hue='Label', size=6)

def main(path):
    """Grid-search several regressors on the data at ``path`` and report.

    Steps:
      1. build a ``StandardScaler`` -> regressor pipeline;
      2. search four estimator families with 3-fold cross-validation;
      3. dump the fitted search object to ``grid.pkl``;
      4. print the best score, the best parameters and the best score
         reached by each estimator family;
      5. plot predictions for the held-out 25 % of the rows.

    Parameters
    ----------
    path : str
        Location of the Excel workbook handed to ``get_data``.
    """
    pipe = Pipeline([
        ('scale', StandardScaler()),
        ('regr', LinearRegression()),
    ])

    # One dict per estimator family, so each family is only combined with
    # its own hyper-parameters: 1 + 5 + 2*5*5 + 1 = 57 candidates.
    param_grid = [
        {
            'scale': [StandardScaler()],
            'regr': [LinearRegression()],
        },
        {
            'scale': [StandardScaler()],
            'regr': [Ridge()],
            'regr__alpha': np.logspace(-3, 1, 5),
        },
        {
            'scale': [StandardScaler()],
            'regr': [KernelRidge()],
            'regr__kernel': ['rbf','linear'],
            'regr__alpha': np.logspace(-3, 1, 5),
            'regr__gamma': np.logspace(-2, 2, 5),
        },
        {
            'scale': [StandardScaler()],
            'regr': [MultiOutputRegressor(RandomForestRegressor(max_depth=15))],
        },
    ]

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

    X, Y = get_data(path)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
    # The search must be fitted before best_score_, cv_results_ or predict()
    # can be used below.
    grid.fit(X_train, Y_train)
    joblib.dump(grid, 'grid.pkl')

    # Reduce each candidate's estimator repr to its class name (everything
    # from the first "(" onwards is dropped) and pair it with that
    # candidate's mean cross-validated score.
    res = pd.DataFrame(
            [[re.sub(r'\(.*', '', str(p['regr']), flags=re.S), s]
             for p, s in zip(grid.cv_results_['params'],
                             grid.cv_results_['mean_test_score'])],
            columns=['estimator', 'best_score'])

    print('*' * 70)
    print('Best score:\t\t{}'.format(grid.best_score_))
    print('*' * 70)
    print('Best parameters:\n')
    pprint(grid.best_params_)
    print('*' * 70)
    print('Best score per estimator:\n')
    print(res.groupby('estimator', as_index=False)['best_score'].max())
    print('*' * 70)

    plot_results(Y_test, grid.predict(X_test))

if __name__ == "__main__":
    # Workbook is looked up relative to the current working directory.
    path = r'ENB2012_data.xlsx'
    main(path)
Author: MaxU, 2018-03-14 00:38:01