Skip to content

Commit

Permalink
VIF fix, diagnostic plots edits, pairwise correlation threshold fix
Browse files Browse the repository at this point in the history
- Fixed pairwise correlations threshold for multicollinearity assumption testing (0.3 -> 0.7)
- Fixed partial regression plots fixed figure size
- Added titles to diagnostic plots
- Fixed the VIF to match with SPSS output by adding the constant to X
  • Loading branch information
teanijarv committed Mar 9, 2023
1 parent fd4e131 commit 55d5afc
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 56 deletions.
1 change: 0 additions & 1 deletion AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ Development Lead

Contributors
------------

8 changes: 8 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,11 @@ History
------------------

* First release on PyPI.

0.1.4 (2023-03-9)
------------------

* Fixed pairwise correlations threshold for multicollinearity assumption testing (0.3 -> 0.7)
* Fixed partial regression plots fixed figure size
* Added titles to diagnostic plots
* Fixed the VIF to match with SPSS output by adding the constant to X
35 changes: 21 additions & 14 deletions HLR/diagnostic_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@

import statsmodels.api as sm
import statsmodels.stats.outliers_influence as sm_diagnostics
from statsmodels.tools.tools import add_constant
import statsmodels.stats as sm_stats
import matplotlib.pyplot as plt
import scipy as scipy
import pandas as pd
import seaborn as sns
import os, sys

def regression_diagnostics(model, result, y, X, saveto='results', showfig=False, verbose=True):
def regression_diagnostics(model, result, y, X, X_names, saveto='results', showfig=False, verbose=True):
"""Runs formal diagnostic tests for linear regression and creates plots for
further inspection. Outputs a summary text file listing the failed
diagnostic tests and a list of assumptions that require further inspection.
Expand Down Expand Up @@ -198,7 +199,7 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,
pairwise_corr = X.corr()
pairwise_corr = pairwise_corr[pairwise_corr != 1] # make diagonals=nan
# If pairwise correlations < 0.7, the test is passed and vice versa
high_pairwise_corr = pairwise_corr[pairwise_corr >= 0.3]
high_pairwise_corr = pairwise_corr[pairwise_corr >= 0.7]
if high_pairwise_corr.isnull().all().all():
diagnostics['high_pairwise_correlations_passed'] = 'Yes'
else:
Expand All @@ -210,14 +211,15 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,
assumptionTests['high_pairwise_correlations_passed'] = 'Multicollinearity'
formalNames['high_pairwise_correlations_passed'] = 'High Pairwise correlations'

### (7) Variance Inflation Factors (VIF) < 10
### (7) Variance Inflation Factors (VIF) < 5

# If there are multiple predictors (IVs), find VIFs between IVs
if len(X.columns) > 1:
X_withconst = add_constant(X)
vif = pd.DataFrame()
vif['VIF'] = [sm_stats.outliers_influence.variance_inflation_factor(
X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
X_withconst.values, i) for i in range(X_withconst.shape[1])]
vif['features'] = X_withconst.columns

# If no predictors have VIF > 5, the test is passed
if ((vif['VIF'] < 5).all()):
Expand All @@ -234,7 +236,7 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,
# Link test to assumption
assumptionTests['VIF_passed'] = 'Multicollinearity'
formalNames['VIF_passed'] = 'High Variance Inflation Factor'

###### ASSUMPTION V - OUTLIERS

### (8) Extreme Strandardised Residuals
Expand Down Expand Up @@ -374,10 +376,10 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,

sns.set_theme(style="whitegrid")
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
suptitle = fig.suptitle('Diagnostic Plots for {} - {}'.format(step, X_names), y=0.92)

### PLOT 1 - STUDENTISED RESIDUALS VS FITTED VALUES
# Used to inspect linearity and homoscedasticity

# Get values for the plot 1
student_resid = influence_df['student_resid']
fitted_vals = model.fittedvalues
Expand All @@ -397,13 +399,11 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,

### PLOT 2 - NORMAL QQ PLOT OF RESIDUALS
# Used to inspect normality

sm.qqplot(ax=axs[0][1], data=model.resid, fit=True, line='45')
axs[0][1].set_title('Normal QQ Plot of Residuals')

### PLOT 3 - INFLUENCE PLOT WITH COOK'S DISTANCE
# Used to inspect influence

sm.graphics.influence_plot(model, ax=axs[1][0], criterion="cooks")
axs[1][0].set_title('Influence plot')
axs[1][0].set_xlabel('H leverage')
Expand All @@ -419,22 +419,29 @@ def regression_diagnostics(model, result, y, X, saveto='results', showfig=False,

### SAVE ALL THE PLOTS ABOVE AS A SUBPLOT
figName = saveto + '/' + step + '_diagnostictests_plots.png'
plt.savefig(figName, dpi=300)
plt.savefig(figName, dpi=300, bbox_extra_artists=(suptitle,), bbox_inches="tight")
if showfig==True:
plt.show()
else:
plt.close()

### PLOT 5 - PARTIAL REGRESSION PLOTS
# Used to inspect linearity

# Partial regression plots
fig_partRegress = plt.figure(figsize=(12, 8))
if len(X.columns) == 1:
fig_partRegress = plt.figure(figsize=(15, 5))
elif len(X.columns) <= 3:
fig_partRegress = plt.figure(figsize=(15, 10))
elif len(X.columns) <= 5:
fig_partRegress = plt.figure(figsize=(15, 15))
elif len(X.columns) > 5:
fig_partRegress = plt.figure(figsize=(15, 20))
sys.stdout = open(os.devnull, 'w') # disable print for following function
fig_partRegress = sm.graphics.plot_partregress_grid(model, fig=fig_partRegress)
sys.stdout = sys.__stdout__
suptitle = fig_partRegress.suptitle('Partial Regression Plots for {} - {}'.format(step, X_names), y=1)
fig_partRegress.tight_layout()
figName = saveto + '/' + step + '_partial_regression_plots.png'
fig_partRegress.savefig(figName, dpi=300)
fig_partRegress.savefig(figName, dpi=300, bbox_extra_artists=(suptitle,), bbox_inches="tight")
if showfig==True:
plt.show()
else:
Expand Down
4 changes: 2 additions & 2 deletions HLR/hierarchical_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ def hierarchical_regression(X, X_names, y, diagnostics=True, save_folder='result
# Run diagnostic tests for assumptions
if diagnostics == True:
assumptionsToCheck = diagnostic_tests.regression_diagnostics(
currentStepModel, currentStepResults, y, currentX, saveto=saveto,
showfig=showfig, verbose=verbose)
currentStepModel, currentStepResults, y, currentX, X_names[ix],
saveto=saveto, showfig=showfig, verbose=verbose)
currentStepResults.append(assumptionsToCheck)

# Add the new results to the results list, same with the model object
Expand Down
81 changes: 43 additions & 38 deletions example/usage.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/teanijarv/HLR',
version='0.1.3',
version='0.1.4',
zip_safe=False,
)

0 comments on commit 55d5afc

Please sign in to comment.