Module stikpetP.effect_sizes.eff_size_post_hoc_gof

Expand source code
from math import asin, log
import pandas as pd

def es_post_hoc_gof(post_hoc_results, es = "auto", bergsma=False):
    '''
    Effect Sizes for a Goodness-of-Fit Post-Hoc Analysis
    ----------------------------------------------------

    Determines an effect size for each test (row) from the results of ph_pairwise_bin(), ph_pairwise_gof(), ph_residual_gof_bin(), or ph_residual_gof_gof().

    The function is shown in this [YouTube video](https://youtu.be/u4rO00xdj7Q) and described at [PeterStatistics.com](https://peterstatistics.com/Terms/Tests/PostHocAfterGoF.html).

    Parameters
    ----------
    post_hoc_results : dataframe
        the result of either ph_pairwise_bin(), ph_pairwise_gof(), ph_residual_gof_bin(), or ph_residual_gof_gof()
    es : {'auto', 'coheng', 'cohenh', 'ar', 'cramerv', 'cohenw', 'jbme', 'fei', 'rosenthal'}, optional
        the effect size to determine
    bergsma : boolean, optional
        use of Bergsma correction, only for Cramér V
    
    Returns
    -------
    pandas.DataFrame
        A dataframe with the following columns:
    
        * for residual post-hoc
        * *category*, the label of the category
        * *name effect size*, the effect size value
        
        * for pairwise post-hoc
        * *category 1*, the label of the first category
        * *category 2*, the label of the second category
        * *name effect size*, the effect size value

    Notes
    -----
    'auto' will use Cohen h for exact tests, Rosenthal correlation for z-tests and Cramér's V otherwise.

    Cohen g ('coheng'), Cohen h ('cohenh') and Alternative Ratio ('ar') can all be used for any test.

    Cramér V ('cramerv'), Cohen w ('cohenw'), Johnston-Berry-Mielke E ('jbme'), and Fei ('fei') can be used with chi-square tests (or likelihood ratio tests).

    The Rosenthal Correlation ('rosenthal') can be used with a z-test (proportion/Wald/score/residual).

    See the separate functions for each of these for details on the calculations.

    Before, After and Alternatives
    ------------------------------
    Before this a post-hoc test might be helpful:
    * [ph_pairwise_bin](../other/poho_pairwise_bin.html#ph_pairwise_bin) for Pairwise Binary Test
    * [ph_pairwise_gof](../other/poho_pairwise_gof.html#ph_pairwise_gof) for Pairwise Goodness-of-Fit Tests
    * [ph_residual_gof_bin](../other/poho_residual_gof_bin.html#ph_residual_gof_bin) for Residuals Tests using Binary tests
    * [ph_residual_gof_gof](../other/poho_residual_gof_gof.html#ph_residual_gof_gof) for Residuals Using Goodness-of-Fit Tests
    
    After this you might want to use a rule-of-thumb for the interpretation:
    * [th_post_hoc_gof](../other/thumb_post_hoc_gof.html#th_post_hoc_gof) for various rules-of-thumb
    
    Effect sizes in this function:
    * [es_cohen_g](../effect_sizes/eff_size_cohen_g.html#es_cohen_g) for Cohen g
    * [es_cohen_h_os](../effect_sizes/eff_size_cohen_h_os.html#es_cohen_h_os) for Cohen h'
    * [es_alt_ratio](../effect_sizes/eff_size_alt_ratio.html#es_alt_ratio) for Alternative Ratio
    * [r_rosenthal](../correlations/cor_rosenthal.html#r_rosenthal) for Rosenthal Correlation if a z-value is available
    * [es_cramer_v_gof](../effect_sizes/eff_size_cramer_v_gof.html#es_cramer_v_gof) for Cramer's V for Goodness-of-Fit
    * [es_cohen_w](../effect_sizes/eff_size_cohen_w.html#es_cohen_w) for Cohen's w
    * [es_jbm_e](../effect_sizes/eff_size_jbm_e.html#es_jbm_e) for Johnston-Berry-Mielke E
    * [es_fei](../effect_sizes/eff_size_fei.html#es_fei) for Fei
        
    Note: the separate effect size functions are not called by this function, but the same formulas are used.

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    Example
    --------
    Import pandas, load the data file, and select a nominal field
    >>> import pandas as pd
    >>> gss_df = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/GSS2012a.csv', sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'});
    >>> nominal_field = gss_df['mar1'];

    Obtain the post-hoc test results
    >>> from ..other.poho_pairwise_bin import ph_pairwise_bin
    >>> post_hoc_test = ph_pairwise_bin(nominal_field, test='binomial');

    Obtain the effect size:
    >>> es_post_hoc_gof(post_hoc_test, es='cohenh')
          category 1     category 2   Cohen h
    0        MARRIED  NEVER MARRIED  0.435752
    1        MARRIED       DIVORCED  0.537120
    2        MARRIED        WIDOWED  0.756027
    3        MARRIED      SEPARATED  1.015353
    4  NEVER MARRIED       DIVORCED  0.114495
    5  NEVER MARRIED        WIDOWED  0.380654
    6  NEVER MARRIED      SEPARATED  0.729728
    7       DIVORCED        WIDOWED  0.272030
    8       DIVORCED      SEPARATED  0.640959
    9        WIDOWED      SEPARATED  0.403139
    
    '''
    #rename the post-hoc results
    df = post_hoc_results

    #determine the number of tests in the post-hoc results
    n_tests = len(df)

    
    #get the description of the test used
    if 'test' in df.columns:
        test_used = df['test'][0]
    else:
        test_used = df['test used'][0]

    #determine the type of test
    if any(keyword in test_used for keyword in ['binomial', 'multinomial']):
        ph_test = 'exact'
    elif any(keyword in test_used for keyword in ['Wald', 'score', 'adjusted', 'standardized']):
        ph_test = 'z-test'
    elif any(keyword in test_used for keyword in ['G test', 'likelihood']):
        ph_test = 'likelihood-test'    
    else:
        ph_test = 'chi2-test'

    #find the name of the test-statistic column
    if ph_test!='exact':
        if 'z-statistic' in df.columns:
            stat_col = 'z-statistic'
        else:
            stat_col = 'statistic'    
    
    #label for effect size
    es_labels = {'coheng':'Cohen g', 
                 'cohenh':'Cohen h', 
                 'ar':'alternative ratio', 
                 'cramerv':'Cramér V', 
                 'cohenw':'Cohen w', 
                 'jbme':'Johnston-Berry-Mielke E', 
                 'fei':"Fei", 
                 'rosenthal':'Rosenthal correlation'
                }
    
    #set the effect size measure if es='auto'
    if es=='auto':
        if ph_test=='exact':
            es = 'cohenh'
        elif ph_test=='z-test':
            es = 'rosenthal'
        else:
            es= 'cramerv'
        
    
    #determine if it was a pairwise or residual test
    if 'category 1' in df.columns:
        test_type = 'pairwise'
        col_names = ['category 1', 'category 2', es_labels[es]]
        res_col=2
    else:
        test_type = 'residual'
        n = sum(df['obs. count'])
        col_names = ['category', es_labels[es]]
        res_col=1

    #loop over each row (test)
    results = pd.DataFrame()
    for i in range(0, n_tests):
        # find the observed and expected counts
        if test_type == 'pairwise':
            p_obs = df['obs. prop. 1'][i]
            p_exp = df['exp. prop. 1'][i]
            n_row = df['n1'][i] + df['n2'][i]

            #add the two category names to the dataframe
            results.at[i, 0] = df.iloc[i, 0]
            results.at[i, 1] = df.iloc[i, 1]            
            
        else:
            n_row = n
            p_obs = df['obs. count'][i]/n_row
            p_exp = df['exp. count'][i]/n_row

            #add the category name to the dataframe
            results.at[i, 0] = df.iloc[i, 0]
            
        #for non-exact tests find the test-statistic value
        if ph_test!='exact':
            test_statistic = df[stat_col][i]

        #determine the effect size

        #effect sizes that do not need a test statistic
        if es=="coheng":
            es_value = p_obs - 0.5
        elif es=="cohenh":
            es_value = 2*asin(p_obs**0.5) - 2*asin(p_exp**0.5)
        elif es=="ar":
            es_value = p_obs/p_exp

        #effect sizes with a z-statistic
        elif es=="rosenthal":
            if ph_test == 'z-test':
                es_value = df[stat_col][i]/(n_row**0.5)
            else:
                es_value = 'not possible with this post-hoc test'

        #effect sizes with a chi-square statistic
        elif es=='cramerv' or es=="cohenw":
            if ph_test == 'chi2-test' or ph_test == 'likelihood-test':
                es_value = (test_statistic/n_row)**0.5
                if es=='cramerv' and bergsma:
                    phi2= test_statistic/n_row
                    phi2_tilde = max(0, phi2 - 1/(n_row-1))
                    es_value = (phi2_tilde/(2 - 1/(n_row-1)))**0.5
            else:
                es_value = 'not possible with this post-hoc test'
        
        elif es=="jbme" or es=="fei":
            if ph_test == 'chi2-test' or ph_test == 'likelihood-test':
                if ph_test == 'chi2-test':
                    es_value = test_statistic*df['minExp'][i]/(n_row*(n_row - df['minExp'][i]))
                else:
                    es_value = -1/log(df['minExp'][i]/n_row)*test_statistic/(2*n_row)
                if es=="fei":
                    es_value= es_value**0.5
            else:
                es_value = 'not possible with this post-hoc test'
        
        #add the effect size value to the dataframe
        results.at[i, res_col] = es_value  

    #add the column names
    results.columns = col_names
    return results

Functions

def es_post_hoc_gof(post_hoc_results, es='auto', bergsma=False)

Effect Sizes for a Goodness-of-Fit Post-Hoc Analysis

Determines an effect size for each test (row) from the results of ph_pairwise_bin(), ph_pairwise_gof(), ph_residual_gof_bin(), or ph_residual_gof_gof().

The function is shown in this YouTube video (https://youtu.be/u4rO00xdj7Q) and described at PeterStatistics.com (https://peterstatistics.com/Terms/Tests/PostHocAfterGoF.html).

Parameters

post_hoc_results : dataframe
the result of either ph_pairwise_bin(), ph_pairwise_gof(), ph_residual_gof_bin(), or ph_residual_gof_gof()
es : {'auto', 'coheng', 'cohenh', 'ar', 'cramerv', 'cohenw', 'jbme', 'fei', 'rosenthal'}, optional
the effect size to determine
bergsma : boolean, optional
use of Bergsma correction, only for Cramér V

Returns

pandas.DataFrame

A dataframe with the following columns:

  For residual post-hoc:
  • category, the label of the category
  • name effect size, the effect size value

  For pairwise post-hoc:
  • category 1, the label of the first category
  • category 2, the label of the second category
  • name effect size, the effect size value

Notes

'auto' will use Cohen h for exact tests, Rosenthal correlation for z-tests and Cramér's V otherwise.
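
A compact way to see this selection (a sketch mirroring the if/elif chain in the source above; ph_test is the test type the function derives internally from the 'test' column):

# hypothetical pre-computed test type: 'exact', 'z-test', 'likelihood-test' or 'chi2-test'
ph_test = 'z-test'
es = {'exact': 'cohenh', 'z-test': 'rosenthal'}.get(ph_test, 'cramerv')   # -> 'rosenthal'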

Cohen g ('coheng'), Cohen h ('cohenh') and Alternative Ratio ('ar') can all be used for any test.
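
These three are computed per row from the observed and expected proportion of a category, following the formulas in the source above. A minimal sketch with made-up proportions:

from math import asin

p_obs, p_exp = 0.40, 0.25                            # hypothetical observed and expected proportions
cohen_g = p_obs - 0.5                                # Cohen g
cohen_h = 2*asin(p_obs**0.5) - 2*asin(p_exp**0.5)    # Cohen h
alt_ratio = p_obs / p_exp                            # Alternative Ratio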

Cramér V ('cramerv'), Cohen w ('cohenw'), Johnston-Berry-Mielke E ('jbme'), and Fei ('fei') can be used with chi-square tests (or likelihood ratio tests).
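
Per row these use the test statistic, the row total n, and (for E and Fei) the smallest expected count, following the formulas in the source above. A sketch with made-up values for the Pearson chi-square case:

chi2, n, min_exp = 12.5, 200, 40.0                   # hypothetical statistic, sample size, smallest expected count
cramer_v = (chi2/n)**0.5                             # Cramér V (same value as Cohen w here)
phi2_tilde = max(0, chi2/n - 1/(n-1))                # Bergsma-corrected phi-squared
cramer_v_bergsma = (phi2_tilde/(2 - 1/(n-1)))**0.5   # Cramér V with Bergsma correction (as in the source)
jbm_e = chi2*min_exp/(n*(n - min_exp))               # Johnston-Berry-Mielke E (Pearson chi-square version)
fei = jbm_e**0.5                                     # Fei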

The Rosenthal Correlation ('rosenthal') can be used with a z-test (proportion/Wald/score/residual).
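
The Rosenthal correlation is simply the z-statistic divided by the square root of the row's sample size, as in the source above:

z, n = 3.2, 200                 # hypothetical z-statistic and sample size
rosenthal_r = z / n**0.5        # Rosenthal correlation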

See the separate functions for each of these for details on the calculations.

Before, After and Alternatives

Before this a post-hoc test might be helpful:
• ph_pairwise_bin for Pairwise Binary Test
• ph_pairwise_gof for Pairwise Goodness-of-Fit Tests
• ph_residual_gof_bin for Residuals Tests using Binary tests
• ph_residual_gof_gof for Residuals Using Goodness-of-Fit Tests

After this you might want to use a rule-of-thumb for the interpretation:
• th_post_hoc_gof for various rules-of-thumb

Effect sizes in this function:
• es_cohen_g for Cohen g
• es_cohen_h_os for Cohen h'
• es_alt_ratio for Alternative Ratio
• r_rosenthal for Rosenthal Correlation if a z-value is available
• es_cramer_v_gof for Cramér's V for Goodness-of-Fit
• es_cohen_w for Cohen's w
• es_jbm_e for Johnston-Berry-Mielke E
• es_fei for Fei

Note: the separate effect size functions are not called by this function, but the same formulas are used.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Example

Import pandas, load the data file, and select a nominal field

>>> import pandas as pd
>>> gss_df = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/GSS2012a.csv', sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'});
>>> nominal_field = gss_df['mar1'];

Obtain the post-hoc test results

>>> from ..other.poho_pairwise_bin import ph_pairwise_bin
>>> post_hoc_test = ph_pairwise_bin(nominal_field, test='binomial');

Obtain the effect size:

>>> es_post_hoc_gof(post_hoc_test, es='cohenh')
      category 1     category 2   Cohen h
0        MARRIED  NEVER MARRIED  0.435752
1        MARRIED       DIVORCED  0.537120
2        MARRIED        WIDOWED  0.756027
3        MARRIED      SEPARATED  1.015353
4  NEVER MARRIED       DIVORCED  0.114495
5  NEVER MARRIED        WIDOWED  0.380654
6  NEVER MARRIED      SEPARATED  0.729728
7       DIVORCED        WIDOWED  0.272030
8       DIVORCED      SEPARATED  0.640959
9        WIDOWED      SEPARATED  0.403139
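
Since Cohen g, Cohen h and the Alternative Ratio work with any of the post-hoc tests, the same post-hoc results can also be summarised with, for example, the Alternative Ratio; the returned dataframe then has the columns category 1, category 2 and alternative ratio:

>>> ar_results = es_post_hoc_gof(post_hoc_test, es='ar')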