
Commit

model output edits with SPSS verification
- figures plotted and saved separately
- added standardised beta coefficients
- added partial and semi-partial correlations (unique variance)
- fixed f-change degrees of freedom calculation
- fixed f-change p-value calculation (see the sketch below)
JulesMitchell committed Apr 5, 2023
1 parent 11a42d7 commit 9e9c420
Showing 3 changed files with 122 additions and 41 deletions.
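
Both f-change fixes amount to using the number of predictors entered at a step, rather than a hard-coded 1, as the numerator degrees of freedom. A minimal sketch of the corrected computation (illustrative names; not code from this commit):

import scipy.stats

def f_change_stats(r_sq_new, r_sq_old, k_added, df_resid_new):
    # F-change = (R-squared change / k_added) / ((1 - new R-squared) / residual df)
    f = ((r_sq_new - r_sq_old) / k_added) / ((1 - r_sq_new) / df_resid_new)
    # p-value from the F survival function, with k_added (not 1) numerator df
    p = scipy.stats.f.sf(f, k_added, df_resid_new)
    return f, p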
4 changes: 3 additions & 1 deletion AUTHORS.rst
@@ -6,7 +6,9 @@ Development Lead
----------------

* Toomas Erik Anijärv <[email protected]>
- * Rory Boyle <[email protected]
+ * Rory Boyle <[email protected]>

Contributors
------------

* Jules Mitchell <[email protected]>
68 changes: 41 additions & 27 deletions HLR/diagnostic_tests.py
@@ -91,7 +91,7 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf

# Calculate the Durbin-Watson statistic
diagnostics['durbin_watson_stat'] = sm_stats.stattools.durbin_watson(
-        model.resid, axis=0)
+    model.resid, axis=0)
# If the statistic is between 1.5-2.5, the test is passed
if diagnostics['durbin_watson_stat'] >= 1.5 and diagnostics[
'durbin_watson_stat'] <= 2.5:
@@ -373,10 +373,7 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
high_pairwise_corr.to_csv(pairwiseCorrName)

###### MAKE AND SAVE PLOTS

-sns.set_theme(style="whitegrid")
-fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
-suptitle = fig.suptitle('Diagnostic Plots for {} - {}'.format(step, X_names), y=0.92)
+# 29/03/2023 - 2 x 2 figure subplot removed and figures now saved individually

### PLOT 1 - STUDENTISED RESIDUALS VS FITTED VALUES
# Used to inspect linearity and homoscedasticity
@@ -388,44 +385,61 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
axis=1)
# Plot with a LOWESS (Locally Weighted Scatterplot Smoothing) line
# A relatively straight LOWESS line indicates a linear model is reasonable
-sns.residplot(ax=axs[0][0], data=df_residfitted,
+fig, ax = plt.subplots()
+sns.residplot(ax=ax, data=df_residfitted,
x='fitted_vals', y='student_resid', lowess=True,
scatter_kws={'alpha': 0.8},
line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
-axs[0][0].set(ylim=(-3.5, 3.5))
-axs[0][0].set_title('Residuals vs Fitted')
-axs[0][0].set_xlabel('Fitted values')
-axs[0][0].set_ylabel('Studentised Residuals')
+ax.set(ylim=(-3.5, 3.5))
+ax.set_title('Residuals vs Fitted')
+ax.set_xlabel('Fitted values')
+ax.set_ylabel('Studentised Residuals')

+figName = saveto + '/' + step + '_studresid_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 2 - NORMAL QQ PLOT OF RESIDUALS
# Used to inspect normality
-sm.qqplot(ax=axs[0][1], data=model.resid, fit=True, line='45')
-axs[0][1].set_title('Normal QQ Plot of Residuals')
+fig, ax = plt.subplots()
+sm.qqplot(ax=ax, data=model.resid, fit=True, line='45')
+ax.set_title('Normal QQ Plot of Residuals')
+figName = saveto + '/' + step + '_QQ_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 3 - INFLUENCE PLOT WITH COOK'S DISTANCE
# Used to inspect influence
-sm.graphics.influence_plot(model, ax=axs[1][0], criterion="cooks")
-axs[1][0].set_title('Influence plot')
-axs[1][0].set_xlabel('H leverage')
-axs[1][0].set_ylabel('Studentised Residuals')
+fig, ax = plt.subplots()
+sm.graphics.influence_plot(model, ax=ax, criterion="cooks")
+ax.set_title('Influence plot')
+ax.set_xlabel('H leverage')
+ax.set_ylabel('Studentised Residuals')
+figName = saveto + '/' + step + '_influence_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 4 - BOX PLOT OF STANDARDISED RESIDUALS
# Used to inspect outliers (residuals)
-outlier_fig = sns.boxplot(ax=axs[1][1], y=influence_df['standard_resid'])
-outlier_fig = sns.swarmplot(ax=axs[1][1], y=influence_df['standard_resid'], color="red")
-outlier_fig.axes.set(ylim=(-3.5, 3.5))
-outlier_fig.axes.set_title('Boxplot of Standardised Residuals')
+fig, ax = plt.subplots()
+outlier_fig = sns.boxplot(ax=ax, y=influence_df['standard_resid'])
+outlier_fig = sns.swarmplot(ax=ax, y=influence_df['standard_resid'], color="red")
+outlier_fig.axes.set(title='Boxplot of Standardised Residuals', ylabel='Standardized Residuals', ylim=(-3.5, 3.5))
residBoxplot = outlier_fig.get_figure() # get figure to save

-### SAVE ALL THE PLOTS ABOVE AS A SUBPLOT
-figName = saveto + '/' + step + '_diagnostictests_plots.png'
-plt.savefig(figName, dpi=300, bbox_extra_artists=(suptitle,), bbox_inches="tight")
+figName = saveto + '/' + step + '_box_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")
plt.show()

+### PLOT 5 - HISTOGRAM OF STANDARDISED RESIDUALS
+plot = sns.histplot(data=influence_df, x='standard_resid', kde=True, bins=16) # you may select the no. of bins
+plot.set(title='Histogram of Standardised Residuals', xlabel='Standardized Residuals', ylabel='Frequency', xlim=(-3.5, 3.5))

+figName = saveto + '/' + step + '_histogram_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

if showfig==True:
plt.show()
else:
plt.close()

-### PLOT 5 - PARTIAL REGRESSION PLOTS
+### PLOT 6 - PARTIAL REGRESSION PLOTS
# Used to inspect linearity
if len(X.columns) == 1:
fig_partRegress = plt.figure(figsize=(15, 5))
@@ -450,4 +464,4 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
# Return list of assumptions that require further inspection (i.e. failed
# at least 1 diagnostic test)
assumptionsToCheck = list(set(violated))
return assumptionsToCheck
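
For reference, the one-figure-per-file pattern adopted throughout the diff above looks like this in isolation; a minimal self-contained sketch with synthetic residuals (file name and data are illustrative, not code from the commit):

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

resid = np.random.default_rng(0).standard_normal(100)  # stand-in for standardised residuals

fig, ax = plt.subplots()  # one dedicated figure per diagnostic plot
sns.histplot(x=resid, kde=True, bins=16, ax=ax)
ax.set(title='Histogram of Standardised Residuals', xlabel='Standardized Residuals', ylabel='Frequency')
fig.savefig('step_histogram_plots.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # release the figure once saved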
91 changes: 78 additions & 13 deletions HLR/hierarchical_regression.py
@@ -9,6 +9,8 @@
import statsmodels.api as sm
import scipy as scipy
import pandas as pd
+import numpy as np
+import pingouin as pg
import os

from HLR import diagnostic_tests
@@ -35,7 +37,7 @@ def linear_reg(X, X_names, y):
"""
# Run linear regression & add column of ones to X to serve as intercept
model = sm.OLS(y, sm.add_constant(X)).fit()

# Extract results from statsmodels.OLS object
results = [X_names, model.nobs, model.df_resid, model.df_model,
model.rsquared, model.fvalue, model.f_pvalue, model.ssr,
Expand All @@ -53,11 +55,66 @@ def linear_reg(X, X_names, y):
p_values = {}
for ix, coeff in enumerate(model.params):
coeffs[namesCopy[ix]] = coeff
p_values[namesCopy[ix]] = model.pvalues[ix]

+# 26/03/2023 (Jules) - Standardised beta coefficients calculated and appended
+# Std of each predictor in the step calculated; std of the outcome repeated to match the number of predictors
+# Conversion factor created (std x / std y), applied to each coefficient and zipped back to a dictionary of predictors

+std_x = pd.Series(np.std(X)).to_list() # 25/03/23 list of predictor standard deviations
+std_y = pd.Series(np.std(y)).to_list() # 25/03/23 standard deviation of the outcome
+std_y = std_y*len(std_x)
+conversion = [x / y for x, y in zip(std_x, std_y)]

+coeffs_list = coeffs.copy()
+coeffs_list.pop('Constant')
+coeffs_keys, coeffs_values = zip(*coeffs_list.items())
+beta_conversion = [b * c for b, c in zip(coeffs_values, conversion)]
+std_coeffs = dict(zip(coeffs_keys, beta_conversion)) # 25/03/23 dictionary of standardised betas

+# 05/04/2023 - Calculate the partial and semi-partial (part in SPSS) correlations for each X variable with y, holding the other X variables as covariates
+# Initialize arrays to store partial and semi-partial correlation values
+x_cols = [X.columns.tolist()]
+semi_partial_correlations = np.zeros(len(x_cols[0]))
+partial_correlations = np.zeros(len(x_cols[0]))

+for i in range(len(x_cols[0])):
+    # Create a mask to exclude the current variable from the covariates
+    mask = [True] * len(x_cols[0])
+    mask[i] = False

+    # Concatenate the X and y dataframes
+    Xy_temp = pd.concat([X, y], axis=1)

+    # Extract the current variable (x) from x_cols
+    xx = x_cols[0][i]

+    # Extract the remaining variables (covariates) from x_cols
+    covariates = [x_cols[0][j] for j in range(len(x_cols[0])) if mask[j]]

+    # Calculate partial and semi-partial values and assign to the arrays
+    semi_partial_corr = pg.partial_corr(data=Xy_temp, x=xx, y=y.columns[0], x_covar=covariates)
+    partial_corr = pg.partial_corr(data=Xy_temp, x=xx, y=y.columns[0], covar=covariates)
+    semi_partial_correlations[i] = semi_partial_corr['r']
+    partial_correlations[i] = partial_corr['r']

+# Create dicts zipping variable names to values
+dict_partial = dict(zip(coeffs_keys, partial_correlations)) # dictionary of partial correlations
+dict_semi_partial = dict(zip(coeffs_keys, semi_partial_correlations)) # dictionary of semi-partial correlations

+# Square semi-partial values to get the unique variance accounted for
+unique_var = (semi_partial_correlations*semi_partial_correlations)*100
+dict_unique_var = dict(zip(coeffs_keys, unique_var)) # dictionary of unique variance (%)

# Append to results
results.append(coeffs)
results.append(p_values)
+results.append(std_coeffs)
+results.append(dict_partial)
+results.append(dict_semi_partial)
+results.append(dict_unique_var)

return results, model
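
As a sanity check on the standardised-beta conversion above, the same values can be recomputed directly from a fitted statsmodels result; a self-contained sketch with synthetic data (all names illustrative, not part of the commit):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(42)
X_demo = pd.DataFrame(rng.standard_normal((100, 2)), columns=['x1', 'x2'])
y_demo = 2 * X_demo['x1'] + rng.standard_normal(100)

fit = sm.OLS(y_demo, sm.add_constant(X_demo)).fit()
# Standardised beta = b * (std of predictor / std of outcome); ddof=0 matches np.std
std_betas = {c: fit.params[c] * X_demo[c].std(ddof=0) / y_demo.std(ddof=0) for c in X_demo.columns}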

def calculate_change_stats(model_stats):
@@ -95,22 +152,27 @@ def calculate_change_stats(model_stats):

# calculate f change - formula from here:
f_change = []
+f_change_pval = []
for step in range(0, num_steps-1):
# numerator of f change formula
+# 24/03/2023 Predictor change variable created
+predictor_change = len(model_stats.iloc[step+1]['Predictors']) - len(model_stats.iloc[step]['Predictors'])

# (r_sq change / number of predictors added)
-f_change_numerator = r_sq_change[step] / (len(model_stats.iloc[step+1]['Predictors'])
-                                          - len(model_stats.iloc[step]['Predictors']))
+f_change_numerator = r_sq_change[step] / predictor_change
# denominator of f change formula
# (1 - step2 r_sq) / (num obs - number of predictors - 1)
f_change_denominator = ((1 - model_stats.iloc[step+1]['R-squared']) /
model_stats.iloc[step+1]['DF (residuals)'])
# compute f change
f_change.append(f_change_numerator / f_change_denominator)

-# calculate pvalue of f change
-f_change_pval = [scipy.stats.f.sf(f_change[step], 1,
-                 model_stats.iloc[step+1]['DF (residuals)'])
-                 for step in range(0, num_steps-1)]
+# calculate pvalue of f change
+# 24/03/2023 Difference in predictors per step used as df1 (originally hard-coded as 1)
+# 05/04/2023 p-value for f change fixed (computation moved into the for loop)
+f_change_pval_temp = scipy.stats.f.sf(f_change[step], predictor_change,
+                     model_stats.iloc[step+1]['DF (residuals)'])
+f_change_pval.append(f_change_pval_temp)

return [r_sq_change, f_change, f_change_pval]

@@ -198,17 +260,20 @@ def hierarchical_regression(X, X_names, y, diagnostics=True, save_folder='result
reg_models.append(['Step ' + str(ix+1), currentStepModel])

# Add the results to model_stats dataframe and name the columns
+# 26/03/2023 (Jules) - Std beta coefs added
model_stats = pd.DataFrame(results)
if diagnostics == True:
model_stats.columns = ['Step', 'Predictors', 'N (observations)', 'DF (residuals)',
'DF (model)', 'R-squared', 'F-value', 'P-value (F)', 'SSE', 'SSTO',
'MSE (model)', 'MSE (residuals)', 'MSE (total)', 'Beta coefs',
-'P-values (beta coefs)', 'Failed assumptions (check!)']
+'P-values (beta coefs)', 'Std Beta coefs', 'Partial Correlation',
+'Semi-partial Correlation', 'Unique Variance %', 'Failed assumptions (check!)']
else:
model_stats.columns = ['Step', 'Predictors', 'N (observations)', 'DF (residuals)',
'DF (model)', 'R-squared', 'F-value', 'P-value (F)', 'SSE', 'SSTO',
'MSE (model)', 'MSE (residuals)', 'MSE (total)', 'Beta coefs',
-'P-values (beta coefs)']
+'P-values (beta coefs)', 'Std Beta coefs', 'Partial Correlation',
+'Semi-partial Correlation', 'Unique Variance %']

# Create change results by calculating r-sq change, f change, p-value of f change between steps
change_results = calculate_change_stats(model_stats)
@@ -224,4 +289,4 @@ def hierarchical_regression(X, X_names, y, diagnostics=True, save_folder='result
# Merge model_stats and change_stats, creating the final model results dataframe
model_stats = pd.merge(model_stats, change_stats, on='Step', how='outer')

return model_stats, reg_models
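
The covar versus x_covar distinction in the pg.partial_corr calls above is what separates the two statistics; a minimal self-contained sketch with synthetic data (illustrative names, not code from the commit):

import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(7)
df = pd.DataFrame(rng.standard_normal((100, 3)), columns=['x1', 'x2', 'y'])

# covar partials x2 out of both x1 and y -> partial correlation
partial_r = pg.partial_corr(data=df, x='x1', y='y', covar=['x2'])['r'].iloc[0]
# x_covar partials x2 out of x1 only -> semi-partial ('part' in SPSS)
semi_partial_r = pg.partial_corr(data=df, x='x1', y='y', x_covar=['x2'])['r'].iloc[0]

unique_variance_pct = semi_partial_r ** 2 * 100  # % of y's variance unique to x1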
