import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.tools
def run_single_sim(ability_weighting,num_samples=10000,
performance_noise_level=0.0,
measuring_noise_level=0.0,
low_threshold=2,
high_threshold=1000,
make_graphs=False):
#Generate true ability and measurement noise in ability
true_ability = np.random.randn(num_samples)
measuring_noise = measuring_noise_level*np.random.randn(num_samples)
#Non-normalised measured ability
measured_ability = (1-measuring_noise_level)*true_ability+measuring_noise
#Normalise measured ability to have variance 1
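#For independent standard normals Z1 and Z2, Var[(1-m)*Z1 + m*Z2] = (1-m)^2 + m^2, so dividing by its square root restores unit variance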
ability_normalising_factor = np.sqrt(measuring_noise_level**2+(1-measuring_noise_level)**2)
measured_ability = measured_ability/ability_normalising_factor
#Generate array of other factors which contribute to performance
other_factors = np.random.randn(num_samples)
#Generate array of noise for final performance
performance_noise = performance_noise_level*np.random.randn(num_samples)
#Compute the composite of measured ability and other factors
composite = ability_weighting*measured_ability+(1-ability_weighting)*other_factors
#Normalise the composite to have variance 1
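#(as above, Var[w*Z1 + (1-w)*Z2] = w^2 + (1-w)^2 for independent unit-variance Z1 and Z2)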
normalising_factor = np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
composite = composite/normalising_factor
#Compute expected performance by combining true ability and other factors
expected_performance = np.sqrt(2)*(0.5*true_ability+0.5*other_factors) #The sqrt(2) rescales the sum, which has variance 0.5, to unit variance
#Compute realised performance by adding noise to expected performance
realised_performance = (1-performance_noise_level)*expected_performance+performance_noise
#Normalise realised_performance to have variance 1
realised_performance_normalising_factor = np.sqrt(performance_noise_level**2+(1-performance_noise_level)**2)
realised_performance = realised_performance/realised_performance_normalising_factor
#Filter on our composite to keep only the samples whose score lies between the two thresholds (the "hired" subset)
selection_mask = (composite > low_threshold) & (composite < high_threshold)
measured_ability_filtered = measured_ability[selection_mask]
realised_performance_filtered = realised_performance[selection_mask]
#Regression on full data
full_ols = sm.OLS(realised_performance,sm.add_constant(measured_ability))
full_res = full_ols.fit()
full_coeff = full_res.params
full_r = np.sign(full_coeff[1])*np.sqrt(full_res.rsquared)
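#For a single-regressor OLS with an intercept, sign(beta)*sqrt(R^2) equals the Pearson correlation between regressor and response; the same construction is used for the filtered regression below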
#Regression on filtered data
filtered_ols = sm.OLS(realised_performance_filtered,sm.add_constant(measured_ability_filtered))
filtered_res = filtered_ols.fit()
filtered_coeff = filtered_res.params
filtered_r = np.sign(filtered_coeff[1])*np.sqrt(filtered_res.rsquared)
#Code to generate graphs
if make_graphs:
fig,ax = plt.subplots()
full_best_fit = [full_coeff[0]+i*full_coeff[1] for i in range(-5,6)]
filtered_best_fit = [filtered_coeff[0]+i*filtered_coeff[1] for i in range(-5,6)]
ax.scatter(measured_ability,realised_performance)
ax.plot(range(-5,6),full_best_fit,color="m",label="Best fit whole data")
ax.scatter(measured_ability_filtered,realised_performance_filtered,color="r")
ax.plot(range(-5,6),filtered_best_fit,color="g",label="Best fit hired subset")
if ability_weighting != 1:
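#With both noise levels at zero, measured_ability equals true_ability and realised_performance = (true_ability+other_factors)/sqrt(2),
#so the selection cutoff composite = threshold becomes the straight line performance = threshold/(sqrt(2)*b) - ((a-b)/(sqrt(2)*b))*measured_ability,
#where a and b (defined just below) are the normalised weights on ability and other factors. With noise the plotted bounds are only approximate.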
a = ability_weighting/np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
b = (1-ability_weighting)/np.sqrt(ability_weighting**2+(1-ability_weighting)**2)
filtering_gradient = -1*(a-b)/(np.sqrt(2)*b)
filtering_const_low = low_threshold/(np.sqrt(2)*b)
filtering_const_high = high_threshold/(np.sqrt(2)*b)
filtering_low = [filtering_const_low+i*filtering_gradient for i in range(-5,6)]
filtering_high = [filtering_const_high+i*filtering_gradient for i in range(-5,6)]
ax.plot(range(-5,6),filtering_low,color="k",label="Filtering bounds",linestyle="--")
ax.plot(range(-5,6),filtering_high,color="k",linestyle="--")
else:
ax.axvline(low_threshold,color="k",label="Filtering bounds",linestyle="--")
ax.axvline(high_threshold,color="k",linestyle="--")
ax.set_xlim([-5.5,5.5])
ax.set_ylim([-5.5,5.5])
ax.set_xlabel("Test Performance (X)")
ax.set_ylabel("Job performance (X+Y normalised)")
ax.set_title(f"Scatter plot for weighting proportion on ability = {ability_weighting :.2f}")
ax.legend()
fig.set_size_inches(10,7)
fig.set_facecolor("w")
plt.show()
#Return metrics of interest for the trial we just ran
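#Structure: ((coefficients, p-values, r) for the full data, the same triple for the filtered data, mean realised performance of the filtered subset)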
return ((full_coeff,full_res.pvalues,full_r),
(filtered_coeff,filtered_res.pvalues,filtered_r),
np.mean(realised_performance_filtered))
def full_single_analysis(ability_weighting,num_iter=2500,num_samples=10000,
performance_noise_level=0.0,
measuring_noise_level=0.0,
low_threshold=2,
high_threshold=1000):
#Lists which will contain information about the effect sizes/correlations/p-values for all the runs
full_coeffs = []
full_p_vals = []
full_r_vals = []
filtered_coeffs = []
filtered_p_vals = []
filtered_r_vals = []
filtered_performance = []
#Iterate for num_iter times
for i in range(num_iter):
#Make a graph for the first iteration only
make_graphs = (i == 0)
sample_number = num_samples
#Run the trial
trial = run_single_sim(ability_weighting,
num_samples=sample_number,
performance_noise_level=performance_noise_level,
measuring_noise_level=measuring_noise_level,
low_threshold=low_threshold,
high_threshold=high_threshold,
make_graphs = make_graphs)
#Add the results of this trial to the respective lists
full_coeffs.append(trial[0][0][1])
full_p_vals.append(trial[0][1][1])
full_r_vals.append(trial[0][2])
filtered_coeffs.append(trial[1][0][1])
filtered_p_vals.append(trial[1][1][1])
filtered_r_vals.append(trial[1][2])
filtered_performance.append(trial[2])
#Calculate the proportion of trials where the filtered-data beta was statistically significantly greater than 0 (p < 0.05 and positive coefficient)
filtered_p_vals_small = np.array(filtered_p_vals)[(np.array(filtered_p_vals) < 0.05) & (np.array(filtered_coeffs) > 0)]
p_vals_ratio = len(filtered_p_vals_small)/num_iter
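#(i.e. an estimate of how often a selection round of this size would still detect a significantly positive test-performance relationship within the hired subset)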
#Code to plot a histogram of the correlations/effect sizes for each of the trials. I don't use this code in my post, so it's commented out
# coeffs_min = min(min(full_coeffs),min(filtered_coeffs))
# coeffs_max = max(max(full_coeffs),max(filtered_coeffs))
# fig,ax = plt.subplots()
# ax.hist(full_coeffs,alpha=0.4,bins=[0.01*i for i in range(int(100*coeffs_min),int(100*coeffs_max+1))])
# ax.hist(filtered_coeffs,alpha=0.4,bins=[0.01*i for i in range(int(100*coeffs_min),int(100*coeffs_max+1))])
# ax.axvline(0,color="k",label="Zero")
# ax.axvline(np.mean(full_coeffs),color="r",label="Full data mean")
# ax.axvline(np.mean(filtered_coeffs),color="g",label="Filtered data mean")
# ax.set_title(f"Beta Value for weighting {height_weighting}")
# ax.legend()
# fig.set_size_inches(10,7)
# fig.set_facecolor("w")
# plt.show()
# #plt.savefig(f"correlation_graphs/correlations_no_measuring_noise/betas_{height_weighting}.png")
# #plt.close(fig)
# r_vals_min = min(min(full_r_vals),min(filtered_r_vals))
# r_vals_max = max(max(full_r_vals),max(filtered_r_vals))
# fig,ax = plt.subplots()
# ax.hist(full_r_vals,alpha=0.4,bins=[0.01*i for i in range(int(100*r_vals_min),int(100*r_vals_max+1))])
# ax.hist(filtered_r_vals,alpha=0.4,bins=[0.01*i for i in range(int(100*r_vals_min),int(100*r_vals_max+1))])
# ax.axvline(0,color="k",label="Zero")
# ax.axvline(np.mean(full_r_vals),color="r",label="Full data mean")
# ax.axvline(np.mean(filtered_r_vals),color="g",label="Filtered data mean")
# ax.set_title(f"Correlation for weighting {height_weighting}")
# ax.legend()
# fig.set_size_inches(10,7)
# fig.set_facecolor("w")
# plt.show()
# #plt.savefig(f"correlation_graphs/correlations_no_measuring_noise/correlation_{height_weighting}.png")
# #plt.close(fig)
#Return mean values over all the trials for our metrics of interest
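#Structure: ((mean full beta, mean full r), (mean filtered beta, mean filtered r), mean filtered performance, proportion of significant positive betas)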
return ((np.mean(full_coeffs),np.mean(full_r_vals)),
(np.mean(filtered_coeffs),np.mean(filtered_r_vals)),
np.mean(filtered_performance),p_vals_ratio)
#%%
#Lists which will contain the mean values for each possible weighting
full_coeff_means = []
full_r_val_means = []
filtered_coeff_means = []
filtered_r_val_means = []
filtered_perf_means = []
p_val_cross_percs = []
#The weightings, ranging over [0.00, 0.01, ..., 1.00]
weighting_percs = [0.01*i for i in range(0,101)]
#Iterate over the weightings
for weight in weighting_percs:
#Calculate the results for each weighting
results = full_single_analysis(weight,low_threshold=2,high_threshold=1000)
#Add these mean results to the respective lists
full_coeff_means.append(results[0][0])
full_r_val_means.append(results[0][1])
filtered_coeff_means.append(results[1][0])
filtered_r_val_means.append(results[1][1])
filtered_perf_means.append(results[2])
p_val_cross_percs.append(results[3])
# print(weight)
# print(results)
# print()
#%%
#Plot betas, correlations, the increase in performance due to filtering, and p-values
plt.plot(weighting_percs,filtered_coeff_means)
plt.axhline(0,color="k")
plt.xticks([0.1*i for i in range(11)])
plt.title("Betas")
plt.grid()
plt.show()
plt.plot(weighting_percs,filtered_r_val_means)
plt.axhline(0,color="k")
plt.xticks([0.1*i for i in range(11)])
plt.title("Correlations")
plt.grid()
plt.show()
plt.plot(weighting_percs,filtered_perf_means)
plt.xticks([0.1*i for i in range(11)])
plt.title("Selected subset performance above baseline")
plt.grid()
plt.show()
plt.plot(weighting_percs,p_val_cross_percs)
plt.xticks([0.1*i for i in range(11)])
plt.grid()
plt.title("p_vals")
plt.show()
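#%%
#A minimal sketch (not used in the analysis above) of how one might rerun a single
#weighting with non-zero noise levels; the parameter values here are illustrative
#assumptions rather than settings from the original post.
#noisy_results = full_single_analysis(0.5,
#                                     num_iter=100,
#                                     performance_noise_level=0.3,
#                                     measuring_noise_level=0.2,
#                                     low_threshold=2,
#                                     high_threshold=1000)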
#%%
"""
Copyright 2024 Hadi Khan
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""