# In class-examples, February 25, 2020

In :
from scipy.stats import norm

In :
norm.cdf(1)

Out:
0.8413447460685429
In :
norm.cdf(2)

Out:
0.9772498680518208
In :
norm.cdf(-2)

Out:
0.022750131948179195
In :
norm.cdf(-1)

Out:
0.15865525393145707
In :
norm.cdf(1) - norm.cdf(-1)

Out:
0.6826894921370859
In :
import pandas as pd

In :
df = pd.read_csv("classdata/medicine.csv")

In :
df.head()

Out:
ID Dept Gender Clin Cert Prate Exper Rank Sal94 Sal95
0 1 1 1 0 0 7.4 9 3 77836 84612
1 2 1 1 0 0 6.7 10 2 69994 78497
2 3 1 1 0 0 8.1 6 1 62872 67756
3 4 1 1 1 1 5.1 27 3 155196 173220
4 5 1 1 0 0 7.0 10 3 89268 96099

Some gowder code to hand-roll a hypothesis test. (Note: never do this in practice. This is actually incorrect; for one thing, it isn't the right way to calculate the standard deviation for a difference in means. It's a toy example of how the concept of a z-score and hypothesis test works. For a correct, non-toy, calculation, see e.g. these lecture notes from a yale stats class.)

In :
import numpy as np

In :
print(np.std(df["Sal94"]))

80315.36232768581

In :
print(np.std(df["Sal94"], ddof=1))

80469.66672032783

In :
def standard_deviation(column):
mean = np.mean(column)
n = len(column)
sumnums = 0
for x in column:
sumnums += (x - mean)**2
return np.sqrt(sumnums / (n - 1))

In :
standard_deviation(df["Sal94"])

Out:
80469.66672032783
In :
mean_of_men = np.mean(df[df.Gender==1].Sal94)

In :
mean_of_women = np.mean(df[df.Gender==0].Sal94)

In :
mean_of_men - mean_of_women

Out:
58467.48770541692
In :
def average(column):
return sum(column) / len(column)

print(average(df.Sal94))
print(np.mean(df.Sal94))

153593.3448275862
153593.3448275862

In :
def number_stds_from_zero(mean_difference, std):
return mean_difference / std

number_stds_from_zero(mean_of_men - mean_of_women, standard_deviation(df["Sal94"]))

Out:
0.7265779776200709
In :
def two_tailed_p_value(numstds):
return (1 - norm.cdf(numstds)) + norm.cdf(numstds * -1)

In :
two_tailed_p_value(0.7265779776200709)

Out:
0.46748452301038806

comparison: a t-test, which is at least a bit closet to what you should actually run:

In :
from scipy.stats import ttest_ind
t, p = ttest_ind(df[df.Gender==1].Sal94, df[df.Gender==0].Sal94)
print(p)

2.7515254228164083e-09

In :
print(t)

6.160842760627066

In [ ]: