In [1]:
from scipy.stats import norm
In [2]:
# P(Z <= 1) for a standard normal variable Z (about 0.841).
norm.cdf(1)
Out[2]:
In [3]:
# P(Z <= 2) for a standard normal variable Z (about 0.977).
norm.cdf(2)
Out[3]:
In [4]:
# P(Z <= -2), about 0.023: the lower tail, which by symmetry equals 1 - norm.cdf(2).
norm.cdf(-2)
Out[4]:
In [5]:
# P(Z <= -1), about 0.159: the lower tail one standard deviation below the mean.
norm.cdf(-1)
Out[5]:
In [6]:
# Probability that a standard normal variable falls within one standard
# deviation of its mean, P(-1 <= Z <= 1), about 0.683.
norm.cdf(1) - norm.cdf(-1)
Out[6]:
In [7]:
import pandas as pd
In [8]:
# Load the class dataset; the path is relative to the notebook's working
# directory. Later cells read its Gender and Sal94 columns.
df = pd.read_csv("classdata/medicine.csv")
In [9]:
# Preview the first rows (head() defaults to five) to check the file parsed as expected.
df.head()
Out[9]:
Some gowder code to hand-roll a hypothesis test. (Note: never do this in practice. This is actually incorrect: for one thing, it isn't the right way to calculate the standard deviation for a difference in means. The difference should be divided by the standard error of the difference, $\sqrt{s_1^2/n_1 + s_2^2/n_2}$, not by the standard deviation of the whole sample. It's a toy example of how the concept of a z-score and a hypothesis test works. For a correct, non-toy calculation, see e.g. these lecture notes from a Yale stats class.)
In [10]:
import numpy as np
In [11]:
# np.std defaults to ddof=0, so this divides by n (population standard deviation).
print(np.std(df["Sal94"]))
In [12]:
# ddof=1 divides by n - 1 instead (sample standard deviation).
print(np.std(df["Sal94"], ddof=1))
In [13]:
def standard_deviation(column, ddof=1):
    """Hand-rolled standard deviation of a one-dimensional collection of numbers.

    Computes sqrt(sum((x - mean)**2) / (n - ddof)), the same quantity as
    ``np.std(column, ddof=ddof)``.

    Parameters
    ----------
    column : sequence, numpy array or pandas Series of numbers
        The observations.
    ddof : int, default 1
        Delta degrees of freedom. The default of 1 gives the sample standard
        deviation (divide by n - 1); pass 0 for the population version.

    Returns
    -------
    numpy.float64
        The standard deviation, or NaN when there are not more than ``ddof``
        observations (the divisor would be zero or negative).
    """
    values = np.asarray(column, dtype=float)
    n = values.size
    # With n <= ddof the divisor is <= 0; the plain formula would divide by
    # zero or, for empty input, silently return -0.0. Report NaN instead.
    if n <= ddof:
        return np.float64("nan")
    deviations = values - values.mean()
    return np.sqrt(np.sum(deviations ** 2) / (n - ddof))
In [14]:
# Hand-rolled sample standard deviation of Sal94; compare with the
# np.std(..., ddof=1) value printed in the earlier cell.
standard_deviation(df["Sal94"])
Out[14]:
In [15]:
# Mean Sal94 over the rows where Gender == 1 (the group this notebook labels "men").
mean_of_men = df.loc[df["Gender"] == 1, "Sal94"].mean()
In [16]:
# Mean Sal94 over the rows where Gender == 0 (the group this notebook labels "women").
mean_of_women = df.loc[df["Gender"] == 0, "Sal94"].mean()
In [17]:
# Raw difference in mean Sal94 between the Gender == 1 and Gender == 0 groups.
mean_of_men - mean_of_women
Out[17]:
In [18]:
def average(column):
    """Arithmetic mean by hand: the total of the values divided by how many there are."""
    total = sum(column)
    count = len(column)
    return total / count
# Sanity check: the hand-rolled average and np.mean should print the same
# value for Sal94 (up to floating-point rounding).
print(average(df.Sal94))
print(np.mean(df.Sal94))
In [19]:
def number_stds_from_zero(mean_difference, std):
    """Express ``mean_difference`` in units of ``std`` (a z-score-style ratio).

    Parameters
    ----------
    mean_difference : number
        The observed difference to be scaled.
    std : number
        The spread used as the unit of measurement.

    Returns
    -------
    number
        ``mean_difference`` divided by ``std``.
    """
    z_score = mean_difference / std
    return z_score
# Toy z-score: the gap between the two group means, measured in units of the
# standard deviation of the whole Sal94 column (not the standard error of the
# difference in means).
number_stds_from_zero(mean_of_men - mean_of_women, standard_deviation(df["Sal94"]))
Out[19]:
In [20]:
def two_tailed_p_value(numstds):
    """Two-tailed p-value for a z-score under the standard normal distribution.

    Returns the probability of observing a value at least ``|numstds|``
    standard deviations away from zero in either direction.

    Parameters
    ----------
    numstds : number or array-like of numbers
        The z-score(s); the sign is ignored.

    Returns
    -------
    float or numpy.ndarray
        A probability in [0, 1].
    """
    # Take the absolute value so a negative z-score gives the same answer as
    # its positive mirror; without it the two terms add up to more than 1.
    # norm.sf(z) is 1 - norm.cdf(z), computed without cancellation in the tail.
    return 2 * norm.sf(abs(numstds))
In [21]:
# Use the z-score computed from the data rather than a pasted literal, so this
# cell stays correct if the data or the helper functions change. (The literal
# that was here, 0.7265779776200709, looks like it was copied from the output
# of the preceding z-score cell -- TODO confirm.)
two_tailed_p_value(
    number_stds_from_zero(mean_of_men - mean_of_women, standard_deviation(df["Sal94"]))
)
Out[21]:
Comparison: a t-test, which is at least a bit closer to what you should actually run:
In [22]:
from scipy.stats import ttest_ind
# Independent two-sample t-test on Sal94: Gender == 1 rows against Gender == 0 rows.
t, p = ttest_ind(
    df.loc[df["Gender"] == 1, "Sal94"],
    df.loc[df["Gender"] == 0, "Sal94"],
)
print(p)
In [23]:
# The t statistic from the same ttest_ind call whose p-value was printed in the previous cell.
print(t)
In [ ]: