
Commit

model output edits with SPSS verification
- figures plotted and saved separately
- added standardised beta coefficients
- added partial and semi-partial correlations (unique variance)
- fixed f-change degrees of freedom calculation
- fixed f-change p-value calculation (see the sketch below)
JulesMitchell committed Apr 5, 2023
1 parent 11a42d7 commit 9e9c420
Showing 3 changed files with 122 additions and 41 deletions.
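
Both f-change fixes amount to using the number of predictors entered at a step, rather than a hard-coded 1, as the numerator degrees of freedom. A minimal sketch of the corrected computation (illustrative names; not code from this commit):

import scipy.stats

def f_change_stats(r_sq_new, r_sq_old, k_added, df_resid_new):
    # F-change = (R-squared change / k_added) / ((1 - new R-squared) / residual df)
    f = ((r_sq_new - r_sq_old) / k_added) / ((1 - r_sq_new) / df_resid_new)
    # p-value from the F survival function, with k_added (not 1) numerator df
    p = scipy.stats.f.sf(f, k_added, df_resid_new)
    return f, p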
4 changes: 3 additions & 1 deletion AUTHORS.rst
@@ -6,7 +6,9 @@ Development Lead
----------------

* Toomas Erik Anijärv <[email protected]>
- * Rory Boyle <[email protected]
+ * Rory Boyle <[email protected]>

Contributors
------------

* Jules Mitchell <[email protected]>
68 changes: 41 additions & 27 deletions HLR/diagnostic_tests.py
@@ -91,7 +91,7 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf

# Calculate the Durbin-Watson statistic
diagnostics['durbin_watson_stat'] = sm_stats.stattools.durbin_watson(
-        model.resid, axis=0)
+    model.resid, axis=0)
# If the statistic is between 1.5-2.5, the test is passed
if diagnostics['durbin_watson_stat'] >= 1.5 and diagnostics[
'durbin_watson_stat'] <= 2.5:
@@ -373,10 +373,7 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
high_pairwise_corr.to_csv(pairwiseCorrName)

###### MAKE AND SAVE PLOTS

-sns.set_theme(style="whitegrid")
-fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
-suptitle = fig.suptitle('Diagnostic Plots for {} - {}'.format(step, X_names), y=0.92)
+# 29/03/2023 - 2 x 2 figure subplot removed and figures now saved individually

### PLOT 1 - STUDENTISED RESIDUALS VS FITTED VALUES
# Used to inspect linearity and homoscedasticity
@@ -388,44 +385,61 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
axis=1)
# Plot with a LOWESS (Locally Weighted Scatterplot Smoothing) line
# A relatively straight LOWESS line indicates a linear model is reasonable
-sns.residplot(ax=axs[0][0], data=df_residfitted,
+fig, ax = plt.subplots()
+sns.residplot(ax=ax, data=df_residfitted,
x='fitted_vals', y='student_resid', lowess=True,
scatter_kws={'alpha': 0.8},
line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
-axs[0][0].set(ylim=(-3.5, 3.5))
-axs[0][0].set_title('Residuals vs Fitted')
-axs[0][0].set_xlabel('Fitted values')
-axs[0][0].set_ylabel('Studentised Residuals')
+ax.set(ylim=(-3.5, 3.5))
+ax.set_title('Residuals vs Fitted')
+ax.set_xlabel('Fitted values')
+ax.set_ylabel('Studentised Residuals')

+figName = saveto + '/' + step + '_studresid_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 2 - NORMAL QQ PLOT OF RESIDUALS
# Used to inspect normality
-sm.qqplot(ax=axs[0][1], data=model.resid, fit=True, line='45')
-axs[0][1].set_title('Normal QQ Plot of Residuals')
+fig, ax = plt.subplots()
+sm.qqplot(ax=ax, data=model.resid, fit=True, line='45')
+ax.set_title('Normal QQ Plot of Residuals')
+figName = saveto + '/' + step + '_QQ_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 3 - INFLUENCE PLOT WITH COOK'S DISTANCE
# Used to inspect influence
-sm.graphics.influence_plot(model, ax=axs[1][0], criterion="cooks")
-axs[1][0].set_title('Influence plot')
-axs[1][0].set_xlabel('H leverage')
-axs[1][0].set_ylabel('Studentised Residuals')
+fig, ax = plt.subplots()
+sm.graphics.influence_plot(model, ax=ax, criterion="cooks")
+ax.set_title('Influence plot')
+ax.set_xlabel('H leverage')
+ax.set_ylabel('Studentised Residuals')
+figName = saveto + '/' + step + '_influence_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

### PLOT 4 - BOX PLOT OF STANDARDISED RESIDUALS
# Used to inspect outliers (residuals)
-outlier_fig = sns.boxplot(ax=axs[1][1], y=influence_df['standard_resid'])
-outlier_fig = sns.swarmplot(ax=axs[1][1], y=influence_df['standard_resid'], color="red")
-outlier_fig.axes.set(ylim=(-3.5, 3.5))
-outlier_fig.axes.set_title('Boxplot of Standardised Residuals')
+fig, ax = plt.subplots()
+outlier_fig = sns.boxplot(ax=ax, y=influence_df['standard_resid'])
+outlier_fig = sns.swarmplot(ax=ax, y=influence_df['standard_resid'], color="red")
+outlier_fig.axes.set(title='Boxplot of Standardised Residuals', ylabel='Standardized Residuals', ylim=(-3.5, 3.5))
residBoxplot = outlier_fig.get_figure() # get figure to save

-### SAVE ALL THE PLOTS ABOVE AS A SUBPLOT
-figName = saveto + '/' + step + '_diagnostictests_plots.png'
-plt.savefig(figName, dpi=300, bbox_extra_artists=(suptitle,), bbox_inches="tight")
+figName = saveto + '/' + step + '_box_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")
plt.show()

+### PLOT 5 - HISTOGRAM OF STANDARDISED RESIDUALS
+plot = sns.histplot(data=influence_df, x='standard_resid', kde=True, bins=16) # you may select the no. of bins
+plot.set(title='Histogram of Standardised Residuals', xlabel='Standardized Residuals', ylabel='Frequency', xlim=(-3.5, 3.5))

+figName = saveto + '/' + step + '_histogram_plots.png'
+plt.savefig(figName, dpi=300, bbox_inches="tight")

if showfig==True:
plt.show()
else:
plt.close()

-### PLOT 5 - PARTIAL REGRESSION PLOTS
+### PLOT 6 - PARTIAL REGRESSION PLOTS
# Used to inspect linearity
if len(X.columns) == 1:
fig_partRegress = plt.figure(figsize=(15, 5))
@@ -450,4 +464,4 @@ def regression_diagnostics(model, result, y, X, X_names, saveto='results', showf
# Return list of assumptions that require further inspection (i.e. failed
# at least 1 diagnostic test)
assumptionsToCheck = list(set(violated))
return assumptionsToCheck
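
For reference, the one-figure-per-file pattern adopted throughout the diff above looks like this in isolation; a minimal self-contained sketch with synthetic residuals (file name and data are illustrative, not code from the commit):

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

resid = np.random.default_rng(0).standard_normal(100)  # stand-in for standardised residuals

fig, ax = plt.subplots()  # one dedicated figure per diagnostic plot
sns.histplot(x=resid, kde=True, bins=16, ax=ax)
ax.set(title='Histogram of Standardised Residuals', xlabel='Standardized Residuals', ylabel='Frequency')
fig.savefig('step_histogram_plots.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # release the figure once saved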
91 changes: 78 additions & 13 deletions HLR/hierarchical_regression.py
@@ -9,6 +9,8 @@
import statsmodels.api as sm
import scipy as scipy
import pandas as pd
+import numpy as np
+import pingouin as pg
import os

from HLR import diagnostic_tests
@@ -35,7 +37,7 @@ def linear_reg(X, X_names, y):
"""
# Run linear regression & add column of ones to X to serve as intercept
model = sm.OLS(y, sm.add_constant(X)).fit()

# Extract results from statsmodels.OLS object
results = [X_names, model.nobs, model.df_resid, model.df_model,
model.rsquared, model.fvalue, model.f_pvalue, model.ssr,
Expand All @@ -53,11 +55,66 @@ def linear_reg(X, X_names, y):
p_values = {}
for ix, coeff in enumerate(model.params):
coeffs[namesCopy[ix]] = coeff
p_values[namesCopy[ix]] = model.pvalues[ix]

+# 26/03/2023 (Jules) - Standardised beta coefficients calculated and appended
+# Std of each predictor in the step calculated; std of the outcome repeated to match the number of predictors
+# Conversion factor created (std x / std y), applied to each coefficient and zipped back to a dictionary of predictors

+std_x = pd.Series(np.std(X)).to_list() # 25/03/23 list of predictor standard deviations
+std_y = pd.Series(np.std(y)).to_list() # 25/03/23 standard deviation of the outcome
+std_y = std_y*len(std_x)
+conversion = [x / y for x, y in zip(std_x, std_y)]

+coeffs_list = coeffs.copy()
+coeffs_list.pop('Constant')
+coeffs_keys, coeffs_values = zip(*coeffs_list.items())
+beta_conversion = [b * c for b, c in zip(coeffs_values, conversion)]
+std_coeffs = dict(zip(coeffs_keys, beta_conversion)) # 25/03/23 dictionary of standardised betas

+# 05/04/2023 - Calculate the partial and semi-partial (part in SPSS) correlations for each X variable with y, holding the other X variables as covariates
+# Initialize arrays to store partial and semi-partial correlation values
+x_cols = [X.columns.tolist()]
+semi_partial_correlations = np.zeros(len(x_cols[0]))
+partial_correlations = np.zeros(len(x_cols[0]))

+for i in range(len(x_cols[0])):
+    # Create a mask to exclude the current variable from the covariates
+    mask = [True] * len(x_cols[0])
+    mask[i] = False

+    # Concatenate the X and y dataframes
+    Xy_temp = pd.concat([X, y], axis=1)

+    # Extract the current variable (x) from x_cols
+    xx = x_cols[0][i]

+    # Extract the remaining variables (covariates) from x_cols
+    covariates = [x_cols[0][j] for j in range(len(x_cols[0])) if mask[j]]

+    # Calculate partial and semi-partial values and assign to the arrays
+    semi_partial_corr = pg.partial_corr(data=Xy_temp, x=xx, y=y.columns[0], x_covar=covariates)
+    partial_corr = pg.partial_corr(data=Xy_temp, x=xx, y=y.columns[0], covar=covariates)
+    semi_partial_correlations[i] = semi_partial_corr['r']
+    partial_correlations[i] = partial_corr['r']

+# Create dicts zipping variable names to values
+dict_partial = dict(zip(coeffs_keys, partial_correlations)) # dictionary of partial correlations
+dict_semi_partial = dict(zip(coeffs_keys, semi_partial_correlations)) # dictionary of semi-partial correlations

+# Square semi-partial values to get the unique variance accounted for
+unique_var = (semi_partial_correlations*semi_partial_correlations)*100
+dict_unique_var = dict(zip(coeffs_keys, unique_var)) # dictionary of unique variance (%)

# Append to results
results.append(coeffs)
results.append(p_values)
+results.append(std_coeffs)
+results.append(dict_partial)
+results.append(dict_semi_partial)
+results.append(dict_unique_var)

return results, model
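
As a sanity check on the standardised-beta conversion above, the same values can be recomputed directly from a fitted statsmodels result; a self-contained sketch with synthetic data (all names illustrative, not part of the commit):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(42)
X_demo = pd.DataFrame(rng.standard_normal((100, 2)), columns=['x1', 'x2'])
y_demo = 2 * X_demo['x1'] + rng.standard_normal(100)

fit = sm.OLS(y_demo, sm.add_constant(X_demo)).fit()
# Standardised beta = b * (std of predictor / std of outcome); ddof=0 matches np.std
std_betas = {c: fit.params[c] * X_demo[c].std(ddof=0) / y_demo.std(ddof=0) for c in X_demo.columns}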

def calculate_change_stats(model_stats):
@@ -95,22 +152,27 @@ def calculate_change_stats(model_stats):

# calculate f change - formula from here:
f_change = []
+f_change_pval = []
for step in range(0, num_steps-1):
# numerator of f change formula
+# 24/03/2023 Predictor change variable created
+predictor_change = len(model_stats.iloc[step+1]['Predictors']) - len(model_stats.iloc[step]['Predictors'])

# (r_sq change / number of predictors added)
-f_change_numerator = r_sq_change[step] / (len(model_stats.iloc[step+1]['Predictors'])
-                                          - len(model_stats.iloc[step]['Predictors']))
+f_change_numerator = r_sq_change[step] / predictor_change
# denominator of f change formula
# (1 - step2 r_sq) / (num obs - number of predictors - 1)
f_change_denominator = ((1 - model_stats.iloc[step+1]['R-squared']) /
model_stats.iloc[step+1]['DF (residuals)'])
# compute f change
f_change.append(f_change_numerator / f_change_denominator)

-# calculate pvalue of f change
-f_change_pval = [scipy.stats.f.sf(f_change[step], 1,
-                 model_stats.iloc[step+1]['DF (residuals)'])
-                 for step in range(0, num_steps-1)]
+# calculate pvalue of f change
+# 24/03/2023 Difference in predictors per step used as df1 (originally hard-coded as 1)
+# 05/04/2023 p-value for f change fixed (computation moved into the for loop)
+f_change_pval_temp = scipy.stats.f.sf(f_change[step], predictor_change,
+                     model_stats.iloc[step+1]['DF (residuals)'])
+f_change_pval.append(f_change_pval_temp)

return [r_sq_change, f_change, f_change_pval]

@@ -198,17 +260,20 @@ def hierarchical_regression(X, X_names, y, diagnostics=True, save_folder='result
reg_models.append(['Step ' + str(ix+1), currentStepModel])

# Add the results to model_stats dataframe and name the columns
+# 26/03/2023 (Jules) - Std beta coefs added
model_stats = pd.DataFrame(results)
if diagnostics == True:
model_stats.columns = ['Step', 'Predictors', 'N (observations)', 'DF (residuals)',
'DF (model)', 'R-squared', 'F-value', 'P-value (F)', 'SSE', 'SSTO',
'MSE (model)', 'MSE (residuals)', 'MSE (total)', 'Beta coefs',
-'P-values (beta coefs)', 'Failed assumptions (check!)']
+'P-values (beta coefs)', 'Std Beta coefs', 'Partial Correlation',
+'Semi-partial Correlation', 'Unique Variance %', 'Failed assumptions (check!)']
else:
model_stats.columns = ['Step', 'Predictors', 'N (observations)', 'DF (residuals)',
'DF (model)', 'R-squared', 'F-value', 'P-value (F)', 'SSE', 'SSTO',
'MSE (model)', 'MSE (residuals)', 'MSE (total)', 'Beta coefs',
-'P-values (beta coefs)']
+'P-values (beta coefs)', 'Std Beta coefs', 'Partial Correlation',
+'Semi-partial Correlation', 'Unique Variance %']

# Create change results by calculating r-sq change, f change, p-value of f change between steps
change_results = calculate_change_stats(model_stats)
@@ -224,4 +289,4 @@ def hierarchical_regression(X, X_names, y, diagnostics=True, save_folder='result
# Merge model_stats and change_stats, creating the final model results dataframe
model_stats = pd.merge(model_stats, change_stats, on='Step', how='outer')

return model_stats, reg_models
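
The covar versus x_covar distinction in the pg.partial_corr calls above is what separates the two statistics; a minimal self-contained sketch with synthetic data (illustrative names, not code from the commit):

import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(7)
df = pd.DataFrame(rng.standard_normal((100, 3)), columns=['x1', 'x2', 'y'])

# covar partials x2 out of both x1 and y -> partial correlation
partial_r = pg.partial_corr(data=df, x='x1', y='y', covar=['x2'])['r'].iloc[0]
# x_covar partials x2 out of x1 only -> semi-partial ('part' in SPSS)
semi_partial_r = pg.partial_corr(data=df, x='x1', y='y', x_covar=['x2'])['r'].iloc[0]

unique_variance_pct = semi_partial_r ** 2 * 100  # % of y's variance unique to x1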
