import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.tools
def run_single_sim(ability_weighting,num_samples=10000,
performance_noise_level=0.0,
measuring_noise_level=0.0,
low_threshold=2,
high_threshold=1000,
make_graphs=False):
#Generate true ability and measurement noise in ability
true_ability = np.random.randn(num_samples)
measuring_noise = measuring_noise_level*np.random.randn(num_samples)
#Non-normalised measured ability
measured_ability = (1-measuring_noise_level)*true_ability+measuring_noise
#Normalise measured ability to have variance 1
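#For independent standard normals Z1 and Z2, Var[(1-m)*Z1 + m*Z2] = (1-m)^2 + m^2, so dividing by its square root restores unit variance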
ability_normalising_factor = np.sqrt(measuring_noise_level**2+(1-measuring_noise_level)**2)
measured_ability = measured_ability/ability_normalising_factor
#Generate array of other factors which contribute to performance
other_factors = np.random.randn(num_samples)
#Generate array of noise for final performance
performance_noise = performance_noise_level*np.random.randn(num_samples)
#Compute the composite of measured ability and other factors
composite = ability_weighting*measured_ability+(1-ability_weighting)*other_factors
#Normalise the composite to have variance 1
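#(as above, Var[w*Z1 + (1-w)*Z2] = w^2 + (1-w)^2 for independent unit-variance Z1 and Z2)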
normalising_factor = np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
composite = composite/normalising_factor
#Compute expected performance by combining true ability and other factors
expected_performance = np.sqrt(2)*(0.5*true_ability+0.5*other_factors) #The sqrt(2) rescales the sum, which has variance 0.5, to unit variance
#Compute realised performance by adding noise to expected performance
realised_performance = (1-performance_noise_level)*expected_performance+performance_noise
#Normalise realised_performance to have variance 1
realised_performance_normalising_factor = np.sqrt(performance_noise_level**2+(1-performance_noise_level)**2)
realised_performance = realised_performance/realised_performance_normalising_factor
#Filter on our composite to keep only the samples whose score lies between the two thresholds (the "hired" subset)
selection_mask = (composite > low_threshold) & (composite < high_threshold)
measured_ability_filtered = measured_ability[selection_mask]
realised_performance_filtered = realised_performance[selection_mask]
#Regression on full data
full_ols = sm.OLS(realised_performance,sm.add_constant(measured_ability))
full_res = full_ols.fit()
full_coeff = full_res.params
full_r = np.sign(full_coeff[1])*np.sqrt(full_res.rsquared)
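#For a single-regressor OLS with an intercept, sign(beta)*sqrt(R^2) equals the Pearson correlation between regressor and response; the same construction is used for the filtered regression below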
#Regression on filtered data
filtered_ols = sm.OLS(realised_performance_filtered,sm.add_constant(measured_ability_filtered))
filtered_res = filtered_ols.fit()
filtered_coeff = filtered_res.params
filtered_r = np.sign(filtered_coeff[1])*np.sqrt(filtered_res.rsquared)
#Code to generate graphs
if make_graphs:
fig,ax = plt.subplots()
full_best_fit = [full_coeff[0]+i*full_coeff[1] for i in range(-5,6)]
filtered_best_fit = [filtered_coeff[0]+i*filtered_coeff[1] for i in range(-5,6)]
ax.scatter(measured_ability,realised_performance)
ax.plot(range(-5,6),full_best_fit,color="m",label="Best fit whole data")
ax.scatter(measured_ability_filtered,realised_performance_filtered,color="r")
ax.plot(range(-5,6),filtered_best_fit,color="g",label="Best fit hired subset")
if ability_weighting != 1:
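#With both noise levels at zero, measured_ability equals true_ability and realised_performance = (true_ability+other_factors)/sqrt(2),
#so the selection cutoff composite = threshold becomes the straight line performance = threshold/(sqrt(2)*b) - ((a-b)/(sqrt(2)*b))*measured_ability,
#where a and b (defined just below) are the normalised weights on ability and other factors. With noise the plotted bounds are only approximate.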
a = ability_weighting/np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
b = (1-ability_weighting)/np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
filtering_gradient = -1*(a-b)/(np.sqrt(2)*b)
filtering_const_low = low_threshold/(np.sqrt(2)*b)
filtering_const_high = high_threshold/(np.sqrt(2)*b)
filtering_low = [filtering_const_low+i*filtering_gradient for i in range(-5,6)]
filtering_high = [filtering_const_high+i*filtering_gradient for i in range(-5,6)]
ax.plot(range(-5,6),filtering_low,color="k",label="Filtering bounds",linestyle="--")
ax.plot(range(-5,6),filtering_high,color="k",linestyle="--")
else:
ax.axvline(low_threshold,color="k",label="Filtering bounds",linestyle="--")
ax.axvline(high_threshold,color="k",linestyle="--")
ax.set_xlim([-5.5,5.5])
ax.set_ylim([-5.5,5.5])
ax.set_xlabel("Test Performance (X)")
ax.set_ylabel("Job performance (X+Y normalised)")
ax.set_title(f"Scatter plot for weighting proportion on ability = {ability_weighting :.2f}")
ax.legend()
fig.set_size_inches(10,7)
fig.set_facecolor("w")
plt.show()
#Return metrics of interest for the trial we just ran
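#Structure: ((coefficients, p-values, r) for the full data, the same triple for the filtered data, mean realised performance of the filtered subset)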
return ((full_coeff,full_res.pvalues,full_r),
(filtered_coeff,filtered_res.pvalues,filtered_r),
np.mean(realised_performance_filtered))
def full_single_analysis(ability_weighting,num_iter=2500,num_samples=10000,
performance_noise_level=0.0,
measuring_noise_level=0.0,
low_threshold=2,
high_threshold=1000):
#Lists which will contain information about the effect sizes/correlations/p-values for all the runs
full_coeffs = []
full_p_vals = []
full_r_vals = []
filtered_coeffs = []
filtered_p_vals = []
filtered_r_vals = []
filtered_performance = []
#Iterate for num_iter times
for i in range(num_iter):
#Make a graph for the first iteration only
make_graphs = (i == 0)
sample_number = num_samples
#Run the trial
trial = run_single_sim(ability_weighting,
num_samples=sample_number,
performance_noise_level=performance_noise_level,
measuring_noise_level=measuring_noise_level,
low_threshold=low_threshold,
high_threshold=high_threshold,
make_graphs = make_graphs)
#Add the results of this trial to the respective lists
full_coeffs.append(trial[0][0][1])
full_p_vals.append(trial[0][1][1])
full_r_vals.append(trial[0][2])
filtered_coeffs.append(trial[1][0][1])
filtered_p_vals.append(trial[1][1][1])
filtered_r_vals.append(trial[1][2])
filtered_performance.append(trial[2])
#Calculate the proportion of trials where the filtered-data beta was statistically significantly greater than 0 (p < 0.05 and positive coefficient)
filtered_p_vals_small = np.array(filtered_p_vals)[(np.array(filtered_p_vals) < 0.05) & (np.array(filtered_coeffs) > 0)]
p_vals_ratio = len(filtered_p_vals_small)/num_iter
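#(i.e. an estimate of how often a selection round of this size would still detect a significantly positive test-performance relationship within the hired subset)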
#Code to plot a histogram of the correlations/effect sizes for each of the trials. I don't use this code in my post, so it's commented out
# coeffs_min = min(min(full_coeffs),min(filtered_coeffs))
# coeffs_max = max(max(full_coeffs),max(filtered_coeffs))
# fig,ax = plt.subplots()
# ax.hist(full_coeffs,alpha=0.4,bins=[0.01*i for i in range(int(100*coeffs_min),int(100*coeffs_max+1))])
# ax.hist(filtered_coeffs,alpha=0.4,bins=[0.01*i for i in range(int(100*coeffs_min),int(100*coeffs_max+1))])
# ax.axvline(0,color="k",label="Zero")
# ax.axvline(np.mean(full_coeffs),color="r",label="Full data mean")
# ax.axvline(np.mean(filtered_coeffs),color="g",label="Filtered data mean")
# ax.set_title(f"Beta Value for weighting {height_weighting}")
# ax.legend()
# fig.set_size_inches(10,7)
# fig.set_facecolor("w")
# plt.show()
# #plt.savefig(f"correlation_graphs/correlations_no_measuring_noise/betas_{height_weighting}.png")
# #plt.close(fig)
# r_vals_min = min(min(full_r_vals),min(filtered_r_vals))
# r_vals_max = max(max(full_r_vals),max(filtered_r_vals))
# fig,ax = plt.subplots()
# ax.hist(full_r_vals,alpha=0.4,bins=[0.01*i for i in range(int(100*r_vals_min),int(100*r_vals_max+1))])
# ax.hist(filtered_r_vals,alpha=0.4,bins=[0.01*i for i in range(int(100*r_vals_min),int(100*r_vals_max+1))])
# ax.axvline(0,color="k",label="Zero")
# ax.axvline(np.mean(full_r_vals),color="r",label="Full data mean")
# ax.axvline(np.mean(filtered_r_vals),color="g",label="Filtered data mean")
# ax.set_title(f"Correlation for weighting {height_weighting}")
# ax.legend()
# fig.set_size_inches(10,7)
# fig.set_facecolor("w")
# plt.show()
# #plt.savefig(f"correlation_graphs/correlations_no_measuring_noise/correlation_{height_weighting}.png")
# #plt.close(fig)
#Return mean values over all the trials for our metrics of interest
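#Structure: ((mean full beta, mean full r), (mean filtered beta, mean filtered r), mean filtered performance, proportion of significant positive betas)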
return ((np.mean(full_coeffs),np.mean(full_r_vals)),
(np.mean(filtered_coeffs),np.mean(filtered_r_vals)),
np.mean(filtered_performance),p_vals_ratio)
#%%
#Lists which will contain the mean values for each possible weighting
full_coeff_means = []
full_r_val_means = []
filtered_coeff_means = []
filtered_r_val_means = []
filtered_perf_means = []
p_val_cross_percs = []
#The weightings, ranging over [0.00, 0.01, ..., 1.00]
weighting_percs = [0.01*i for i in range(0,101)]
#Iterate over the weightings
for weight in weighting_percs:
#Calculate the results for each weighting
results = full_single_analysis(weight,low_threshold=2,high_threshold=1000)
#Add these mean results to the respective lists
full_coeff_means.append(results[0][0])
full_r_val_means.append(results[0][1])
filtered_coeff_means.append(results[1][0])
filtered_r_val_means.append(results[1][1])
filtered_perf_means.append(results[2])
p_val_cross_percs.append(results[3])
# print(weight)
# print(results)
# print()
#%%
#Plot betas, correlations, the increase in performance due to filtering, and p-values
plt.plot(weighting_percs,filtered_coeff_means)
plt.axhline(0,color="k")
plt.xticks([0.1*i for i in range(11)])
plt.title("Betas")
plt.grid()
plt.show()
plt.plot(weighting_percs,filtered_r_val_means)
plt.axhline(0,color="k")
plt.xticks([0.1*i for i in range(11)])
plt.title("Correlations")
plt.grid()
plt.show()
plt.plot(weighting_percs,filtered_perf_means)
plt.xticks([0.1*i for i in range(11)])
plt.title("Selected subset performance above baseline")
plt.grid()
plt.show()
plt.plot(weighting_percs,p_val_cross_percs)
plt.xticks([0.1*i for i in range(11)])
plt.grid()
plt.title("p_vals")
plt.show()
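#%%
#A minimal sketch (not used in the analysis above) of how one might rerun a single
#weighting with non-zero noise levels; the parameter values here are illustrative
#assumptions rather than settings from the original post.
#noisy_results = full_single_analysis(0.5,
#                                     num_iter=100,
#                                     performance_noise_level=0.3,
#                                     measuring_noise_level=0.2,
#                                     low_threshold=2,
#                                     high_threshold=1000)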
#%%
"""
Copyright 2024 Hadi Khan
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""