# In class-examples, February 25, 2020

In [1]:
from scipy.stats import norm

In [2]:
norm.cdf(1)

Out[2]:
0.8413447460685429
In [3]:
norm.cdf(2)

Out[3]:
0.9772498680518208
In [4]:
norm.cdf(-2)

Out[4]:
0.022750131948179195
In [5]:
norm.cdf(-1)

Out[5]:
0.15865525393145707
In [6]:
norm.cdf(1) - norm.cdf(-1)

Out[6]:
0.6826894921370859
In [7]:
import pandas as pd

In [8]:
df = pd.read_csv("classdata/medicine.csv")

In [9]:
df.head()

Out[9]:
ID Dept Gender Clin Cert Prate Exper Rank Sal94 Sal95
0 1 1 1 0 0 7.4 9 3 77836 84612
1 2 1 1 0 0 6.7 10 2 69994 78497
2 3 1 1 0 0 8.1 6 1 62872 67756
3 4 1 1 1 1 5.1 27 3 155196 173220
4 5 1 1 0 0 7.0 10 3 89268 96099

Some Gowder code to hand-roll a hypothesis test. (Note: never do this in practice. This is actually incorrect; for one thing, it isn't the right way to calculate the standard deviation for a difference in means. It's a toy example of how the concept of a z-score and hypothesis test works. For a correct, non-toy, calculation, see e.g. these lecture notes from a Yale stats class.)

In [10]:
import numpy as np

In [11]:
print(np.std(df["Sal94"]))

80315.36232768581

In [12]:
print(np.std(df["Sal94"], ddof=1))

80469.66672032783

In [13]:
def standard_deviation(column):
    """Sample standard deviation of a sequence of numbers, computed by hand.

    Uses the n - 1 (Bessel-corrected) denominator, so it matches
    np.std(column, ddof=1) as demonstrated in the cells above.
    """
    center = np.mean(column)
    count = len(column)
    squared_deviations = sum((value - center) ** 2 for value in column)
    return np.sqrt(squared_deviations / (count - 1))

In [14]:
standard_deviation(df["Sal94"])

Out[14]:
80469.66672032783
In [15]:
mean_of_men = np.mean(df[df.Gender==1].Sal94)

In [16]:
mean_of_women = np.mean(df[df.Gender==0].Sal94)

In [17]:
mean_of_men - mean_of_women

Out[17]:
58467.48770541692
In [18]:
def average(column):
    """Arithmetic mean of a sequence: total divided by count.

    Hand-rolled equivalent of np.mean, as the print comparison below shows.
    """
    total = sum(column)
    return total / len(column)

print(average(df.Sal94))
print(np.mean(df.Sal94))

153593.3448275862
153593.3448275862

In [19]:
def number_stds_from_zero(mean_difference, std):
    """z-score: how many standard deviations the observed difference lies from zero."""
    z_score = mean_difference / std
    return z_score

number_stds_from_zero(mean_of_men - mean_of_women, standard_deviation(df["Sal94"]))

Out[19]:
0.7265779776200709
In [20]:
def two_tailed_p_value(numstds):
    """Two-tailed p-value for a z-score under the standard normal.

    Sums the probability mass in the upper tail beyond +numstds and the
    lower tail beyond -numstds.
    """
    upper_tail = 1 - norm.cdf(numstds)
    lower_tail = norm.cdf(-numstds)
    return upper_tail + lower_tail

In [21]:
two_tailed_p_value(0.7265779776200709)

Out[21]:
0.46748452301038806

Comparison: a t-test, which is at least a bit closer to what you should actually run:

In [22]:
from scipy.stats import ttest_ind
t, p = ttest_ind(df[df.Gender==1].Sal94, df[df.Gender==0].Sal94)
print(p)

2.7515254228164083e-09

In [23]:
print(t)

6.160842760627066

In [ ]: