In [1]:

from scipy.stats import norm

In [2]:

norm.cdf(1)

Out[2]:

0.8413447460685429

In [3]:

norm.cdf(2)

Out[3]:

0.9772498680518208

In [4]:

norm.cdf(-2)

Out[4]:

0.022750131948179195

In [5]:

norm.cdf(-1)

Out[5]:

0.15865525393145707

In [6]:

norm.cdf(1) - norm.cdf(-1)

Out[6]:

0.6826894921370859

In [7]:

import pandas as pd

In [8]:

df = pd.read_csv("classdata/medicine.csv")

In [9]:

df.head()

Out[9]:

	ID	Dept	Gender	Clin	Cert	Prate	Exper	Rank	Sal94	Sal95
0	1	1	1	0	0	7.4	9	3	77836	84612
1	2	1	1	0	0	6.7	10	2	69994	78497
2	3	1	1	0	0	8.1	6	1	62872	67756
3	4	1	1	1	1	5.1	27	3	155196	173220
4	5	1	1	0	0	7.0	10	3	89268	96099

Some gowder code to hand-roll a hypothesis test. (Note: never do this in practice. This is actually incorrect; for one thing, it isn't the right way to calculate the standard deviation for a difference in means. It's a toy example of how the concept of a z-score and hypothesis test works. For a correct, non-toy, calculation, see e.g. these lecture notes from a yale stats class.)

In [10]:

import numpy as np

In [11]:

print(np.std(df["Sal94"]))

80315.36232768581

In [12]:

print(np.std(df["Sal94"], ddof=1))

80469.66672032783

In [13]:

def standard_deviation(column):
    mean = np.mean(column)
    n = len(column)
    sumnums = 0
    for x in column:
        sumnums += (x - mean)**2
    return np.sqrt(sumnums / (n - 1))

In [14]:

standard_deviation(df["Sal94"])

Out[14]:

80469.66672032783

In [15]:

mean_of_men = np.mean(df[df.Gender==1].Sal94)

In [16]:

mean_of_women = np.mean(df[df.Gender==0].Sal94)

In [17]:

mean_of_men - mean_of_women

Out[17]:

58467.48770541692

In [18]:

def average(column):
    return sum(column) / len(column)

print(average(df.Sal94))
print(np.mean(df.Sal94))

153593.3448275862
153593.3448275862

In [19]:

def number_stds_from_zero(mean_difference, std):
    return mean_difference / std

number_stds_from_zero(mean_of_men - mean_of_women, standard_deviation(df["Sal94"]))

Out[19]:

0.7265779776200709

In [20]:

def two_tailed_p_value(numstds):
    return (1 - norm.cdf(numstds)) + norm.cdf(numstds * -1)

In [21]:

two_tailed_p_value(0.7265779776200709)

Out[21]:

0.46748452301038806

comparison: a t-test, which is at least a bit closet to what you should actually run:

In [22]:

from scipy.stats import ttest_ind
t, p = ttest_ind(df[df.Gender==1].Sal94, df[df.Gender==0].Sal94)
print(p)

2.7515254228164083e-09

In [23]:

print(t)

6.160842760627066

In [ ]:

Sociological Gobbledygook

In class-examples, February 25, 2020

links