Module `stikpetP.other.poho_binomial`

Expand source code

import pandas as pd
from ..tests.test_binomial_os import ts_binomial_os

def ph_binomial(data, expCount=None, twoSidedMethod='eqdist', posthoc = "bonferroni"):
    '''
    Pairwise Binomial Test for Post-Hoc Analysis
    --------------------------------------------
    
    This function will perform a one-sample binomial test for each possible pair in the data. It makes use of the ts_binomial_os() function.
    
    Parameters
    ----------
    data : list or pandas series
    expCount : pandas dataframe, optional 
        categories and expected counts
    twoSidedMethod : string, optional
        method to use for determining two-sided p-value of binomial test. See ts_binomial_os()
    posthoc : string, optional
        the correction to use, currently only "bonferroni" available
    
    Returns
    -------
    res : pandas dataframe with:
    
    * *category 1*, the label of the first category
    * *category 2*, the label of the second category
    * *n1*, the sample size of the first category
    * *n2*, the sample size of the second category 
    * *obs. prop. cat. 1*, the proportion in the sample of the first category
    * *exp. prop. cat. 1*, the expected proportion for the first category
    * *p-value*, the unadjusted significance
    * *adj. p-value*, the adjusted significance
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    Examples
    --------
    >>> pd.set_option('display.width',1000)
    >>> pd.set_option('display.max_columns', 1000)
    
    Example 1: pandas series    
    >>> df1 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/GSS2012a.csv', sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> ex1 = df1['mar1']
    >>> ph_binomial(ex1)
          category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
    0        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   1.052263e-55
    1        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   7.829174e-78
    2        MARRIED        WIDOWED  972  181           0.843018                0.5  1.407217e-131  1.407217e-130
    3        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  1.267980e-195
    4  NEVER MARRIED       DIVORCED  395  314           0.557123                0.5   3.001933e-03   3.001933e-02
    5  NEVER MARRIED        WIDOWED  395  181           0.685764                0.5   1.352112e-19   1.352112e-18
    6  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   7.075688e-51
    7       DIVORCED        WIDOWED  314  181           0.634343                0.5   3.295753e-09   3.295753e-08
    8       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   1.472395e-33
    9        WIDOWED      SEPARATED  181   79           0.696154                0.5   2.223544e-10   2.223544e-09
    
    Example 2: pandas series with various settings
    >>> ex2 = df1['mar1']
    >>> eCounts = pd.DataFrame({'category' : ["MARRIED", "DIVORCED", "NEVER MARRIED", "SEPARATED"], 'count' : [5,5,5,5]})
    >>> ph_binomial(ex2, expCount=eCounts)
          category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
    0        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   4.697504e-78
    1        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   6.313581e-56
    2        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  7.607878e-196
    3       DIVORCED  NEVER MARRIED  314  395           0.442877                0.5   3.001933e-03   1.801160e-02
    4       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   8.834367e-34
    5  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   4.245413e-51

    Example 3: a list
    >>> ex3 = ["MARRIED", "DIVORCED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "NEVER MARRIED", "MARRIED", "MARRIED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "MARRIED"]
    >>> ph_binomial(ex3)
          category 1     category 2  n1  n2  obs. prop. cat. 1  exp. prop. cat. 1   p-value  adj. p-value
    0       DIVORCED        MARRIED   7   6           0.538462                0.5  1.000000             1
    1       DIVORCED  NEVER MARRIED   7   4           0.636364                0.5  0.774414             1
    2       DIVORCED      SEPARATED   7   2           0.777778                0.5  0.343750             1
    3        MARRIED  NEVER MARRIED   6   4           0.600000                0.5  0.753906             1
    4        MARRIED      SEPARATED   6   2           0.750000                0.5  0.289062             1
    5  NEVER MARRIED      SEPARATED   4   2           0.666667                0.5  0.687500             1

    '''
    if type(data) is list:
        data = pd.Series(data)
        
    freq = data.value_counts()
    
    if expCount is None:
        #assume all to be equal
        n = sum(freq)
        k = len(freq)
        categories = list(freq.index)
        expC = [n/k] * k
        
    else:
        #check if categories match
        nE = 0
        n = 0
        for i in range(0, len(expCount)):
            nE = nE + expCount.iloc[i,1]
            n = n + freq[expCount.iloc[i,0]]
        
        expC = []
        for i in range(0,len(expCount)):
            expC.append(expCount.iloc[i, 1]/nE*n)
            
        k = len(expC)
        categories = list(expCount.iloc[:,0])
    

    res = pd.DataFrame(columns=["category 1", "category 2", "n1", "n2", "obs. prop. cat. 1", "exp. prop. cat. 1", "p-value", "adj. p-value"])

    adjFactor = k * (k - 1)/ 2
    for i in range(0, k-1):
        for j in range(i+1, k):
            n1 = freq[categories[i]]
            n2 = freq[categories[j]]
            obP1 = n1/(n1 + n2)
            exP1 = expC[i]/(expC[i]+expC[j])

            codes = [categories[i], categories[j]]
            sig = ts_binomial_os(data, codes=codes, p0 = exP1)["p-value (2-sided)"][0]
            
            if posthoc == "bonferroni":
                adjSig = sig*adjFactor
            if adjSig > 1:
                adjSig = 1
            res.loc[len(res)] = [categories[i], categories[j], n1, n2, obP1, exP1, sig, adjSig]

    return res

Functions

def ph_binomial(data, expCount=None, twoSidedMethod='eqdist', posthoc='bonferroni')

Pairwise Binomial Test for Post-Hoc Analysis

This function will perform a one-sample binomial test for each possible pair in the data. It makes use of the ts_binomial_os() function.

Parameters

data : list or pandas series
expCount : pandas dataframe, optional: categories and expected counts
twoSidedMethod : string, optional: method to use for determining two-sided p-value of binomial test. See ts_binomial_os()
posthoc : string, optional: the correction to use, currently only "bonferroni" available

Returns

res : pandas dataframe with:

category 1, the label of the first category
category 2, the label of the second category
n1, the sample size of the first category
n2, the sample size of the second category
obs. prop. cat. 1, the proportion in the sample of the first category
exp. prop. cat. 1, the expected proportion for the first category
p-value, the unadjusted significance
adj. p-value, the adjusted significance

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Examples

>>> pd.set_option('display.width',1000)
>>> pd.set_option('display.max_columns', 1000)

Example 1: pandas series

>>> df1 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/GSS2012a.csv', sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
>>> ex1 = df1['mar1']
>>> ph_binomial(ex1)
      category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
0        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   1.052263e-55
1        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   7.829174e-78
2        MARRIED        WIDOWED  972  181           0.843018                0.5  1.407217e-131  1.407217e-130
3        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  1.267980e-195
4  NEVER MARRIED       DIVORCED  395  314           0.557123                0.5   3.001933e-03   3.001933e-02
5  NEVER MARRIED        WIDOWED  395  181           0.685764                0.5   1.352112e-19   1.352112e-18
6  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   7.075688e-51
7       DIVORCED        WIDOWED  314  181           0.634343                0.5   3.295753e-09   3.295753e-08
8       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   1.472395e-33
9        WIDOWED      SEPARATED  181   79           0.696154                0.5   2.223544e-10   2.223544e-09

Example 2: pandas series with various settings

>>> ex2 = df1['mar1']
>>> eCounts = pd.DataFrame({'category' : ["MARRIED", "DIVORCED", "NEVER MARRIED", "SEPARATED"], 'count' : [5,5,5,5]})
>>> ph_binomial(ex2, expCount=eCounts)
      category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
0        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   4.697504e-78
1        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   6.313581e-56
2        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  7.607878e-196
3       DIVORCED  NEVER MARRIED  314  395           0.442877                0.5   3.001933e-03   1.801160e-02
4       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   8.834367e-34
5  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   4.245413e-51

Example 3: a list

>>> ex3 = ["MARRIED", "DIVORCED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "NEVER MARRIED", "MARRIED", "MARRIED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "MARRIED"]
>>> ph_binomial(ex3)
      category 1     category 2  n1  n2  obs. prop. cat. 1  exp. prop. cat. 1   p-value  adj. p-value
0       DIVORCED        MARRIED   7   6           0.538462                0.5  1.000000             1
1       DIVORCED  NEVER MARRIED   7   4           0.636364                0.5  0.774414             1
2       DIVORCED      SEPARATED   7   2           0.777778                0.5  0.343750             1
3        MARRIED  NEVER MARRIED   6   4           0.600000                0.5  0.753906             1
4        MARRIED      SEPARATED   6   2           0.750000                0.5  0.289062             1
5  NEVER MARRIED      SEPARATED   4   2           0.666667                0.5  0.687500             1

Expand source code

def ph_binomial(data, expCount=None, twoSidedMethod='eqdist', posthoc = "bonferroni"):
    '''
    Pairwise Binomial Test for Post-Hoc Analysis
    --------------------------------------------
    
    This function will perform a one-sample binomial test for each possible pair in the data. It makes use of the ts_binomial_os() function.
    
    Parameters
    ----------
    data : list or pandas series
    expCount : pandas dataframe, optional 
        categories and expected counts
    twoSidedMethod : string, optional
        method to use for determining two-sided p-value of binomial test. See ts_binomial_os()
    posthoc : string, optional
        the correction to use, currently only "bonferroni" available
    
    Returns
    -------
    res : pandas dataframe with:
    
    * *category 1*, the label of the first category
    * *category 2*, the label of the second category
    * *n1*, the sample size of the first category
    * *n2*, the sample size of the second category 
    * *obs. prop. cat. 1*, the proportion in the sample of the first category
    * *exp. prop. cat. 1*, the expected proportion for the first category
    * *p-value*, the unadjusted significance
    * *adj. p-value*, the adjusted significance
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    Examples
    --------
    >>> pd.set_option('display.width',1000)
    >>> pd.set_option('display.max_columns', 1000)
    
    Example 1: pandas series    
    >>> df1 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/GSS2012a.csv', sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> ex1 = df1['mar1']
    >>> ph_binomial(ex1)
          category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
    0        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   1.052263e-55
    1        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   7.829174e-78
    2        MARRIED        WIDOWED  972  181           0.843018                0.5  1.407217e-131  1.407217e-130
    3        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  1.267980e-195
    4  NEVER MARRIED       DIVORCED  395  314           0.557123                0.5   3.001933e-03   3.001933e-02
    5  NEVER MARRIED        WIDOWED  395  181           0.685764                0.5   1.352112e-19   1.352112e-18
    6  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   7.075688e-51
    7       DIVORCED        WIDOWED  314  181           0.634343                0.5   3.295753e-09   3.295753e-08
    8       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   1.472395e-33
    9        WIDOWED      SEPARATED  181   79           0.696154                0.5   2.223544e-10   2.223544e-09
    
    Example 2: pandas series with various settings
    >>> ex2 = df1['mar1']
    >>> eCounts = pd.DataFrame({'category' : ["MARRIED", "DIVORCED", "NEVER MARRIED", "SEPARATED"], 'count' : [5,5,5,5]})
    >>> ph_binomial(ex2, expCount=eCounts)
          category 1     category 2   n1   n2  obs. prop. cat. 1  exp. prop. cat. 1        p-value   adj. p-value
    0        MARRIED       DIVORCED  972  314           0.755832                0.5   7.829174e-79   4.697504e-78
    1        MARRIED  NEVER MARRIED  972  395           0.711046                0.5   1.052263e-56   6.313581e-56
    2        MARRIED      SEPARATED  972   79           0.924833                0.5  1.267980e-196  7.607878e-196
    3       DIVORCED  NEVER MARRIED  314  395           0.442877                0.5   3.001933e-03   1.801160e-02
    4       DIVORCED      SEPARATED  314   79           0.798982                0.5   1.472395e-34   8.834367e-34
    5  NEVER MARRIED      SEPARATED  395   79           0.833333                0.5   7.075688e-52   4.245413e-51

    Example 3: a list
    >>> ex3 = ["MARRIED", "DIVORCED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "NEVER MARRIED", "MARRIED", "MARRIED", "MARRIED", "SEPARATED", "DIVORCED", "NEVER MARRIED", "NEVER MARRIED", "DIVORCED", "DIVORCED", "MARRIED"]
    >>> ph_binomial(ex3)
          category 1     category 2  n1  n2  obs. prop. cat. 1  exp. prop. cat. 1   p-value  adj. p-value
    0       DIVORCED        MARRIED   7   6           0.538462                0.5  1.000000             1
    1       DIVORCED  NEVER MARRIED   7   4           0.636364                0.5  0.774414             1
    2       DIVORCED      SEPARATED   7   2           0.777778                0.5  0.343750             1
    3        MARRIED  NEVER MARRIED   6   4           0.600000                0.5  0.753906             1
    4        MARRIED      SEPARATED   6   2           0.750000                0.5  0.289062             1
    5  NEVER MARRIED      SEPARATED   4   2           0.666667                0.5  0.687500             1

    '''
    if type(data) is list:
        data = pd.Series(data)
        
    freq = data.value_counts()
    
    if expCount is None:
        #assume all to be equal
        n = sum(freq)
        k = len(freq)
        categories = list(freq.index)
        expC = [n/k] * k
        
    else:
        #check if categories match
        nE = 0
        n = 0
        for i in range(0, len(expCount)):
            nE = nE + expCount.iloc[i,1]
            n = n + freq[expCount.iloc[i,0]]
        
        expC = []
        for i in range(0,len(expCount)):
            expC.append(expCount.iloc[i, 1]/nE*n)
            
        k = len(expC)
        categories = list(expCount.iloc[:,0])
    

    res = pd.DataFrame(columns=["category 1", "category 2", "n1", "n2", "obs. prop. cat. 1", "exp. prop. cat. 1", "p-value", "adj. p-value"])

    adjFactor = k * (k - 1)/ 2
    for i in range(0, k-1):
        for j in range(i+1, k):
            n1 = freq[categories[i]]
            n2 = freq[categories[j]]
            obP1 = n1/(n1 + n2)
            exP1 = expC[i]/(expC[i]+expC[j])

            codes = [categories[i], categories[j]]
            sig = ts_binomial_os(data, codes=codes, p0 = exP1)["p-value (2-sided)"][0]
            
            if posthoc == "bonferroni":
                adjSig = sig*adjFactor
            if adjSig > 1:
                adjSig = 1
            res.loc[len(res)] = [categories[i], categories[j], n1, n2, obP1, exP1, sig, adjSig]

    return res