Module stikpetP.effect_sizes.eff_size_theil_u

Expand source code
import pandas as pd
from statistics import NormalDist
from math import log
from ..other.table_cross import tab_cross

def es_theil_u(field1, field2, categories1=None, categories2=None):
    '''
    Theil U / Uncertainty Coefficient
    ---------------------------------
    Theil U is a measure of nominal association. According to Wikipedia: "given Y, what fraction of the bits of X can we predict? In this case we can think of X as containing the total information, and of Y as allowing one to predict part of such information." (2022).
    
    The term Theil U can also refer to two completely different measures, often used in forecasting and sometimes referred to as index of inequality.
    
    Parameters
    ----------
    field1 : list or pandas series
        the first categorical field (the rows of the cross table, X)
    field2 : list or pandas series
        the second categorical field (the columns of the cross table, Y)
    categories1 : list or dictionary, optional
        order and/or selection for categories of field1
    categories2 : list or dictionary, optional
        order and/or selection for categories of field2
        
    Returns
    -------
    A dataframe with one row for each version (symmetric, field1 as dependent, field2 as dependent) and the columns:
    
    * *dependent*, the field used as dependent variable
    * *n*, the sample size
    * *value*, the Theil U value
    * *ASE_0*, the asymptotic standard error assuming the null hypothesis
    * *ASE_1*, the asymptotic standard error assuming the alternative hypothesis
    * *statistic*, the z-value
    * *p-value*, the significance (p-value)
    
    Notes
    -----
    The formula used (SPSS, 2006, p. 117):
    $$U_{Y|X} = \\frac{H_X+H_Y-H_{XY}}{H_Y}$$
    $$U_{X|Y} = \\frac{H_X+H_Y-H_{XY}}{H_X}$$
    $$U = 2\\times\\frac{H_X+H_Y-H_{XY}}{H_X+H_Y}$$
    
    With:
    $$H_X = -\\sum_i \\left(\\frac{R_i}{n}\\times\\ln\\left(\\frac{R_i}{n}\\right)\\right)$$
    $$H_Y = -\\sum_j \\left(\\frac{C_j}{n}\\times\\ln\\left(\\frac{C_j}{n}\\right)\\right)$$
    $$H_{XY} = -\\sum_{i,j} \\left(\\frac{F_{i,j}}{n}\\times\\ln\\left(\\frac{F_{i,j}}{n}\\right)\\right), \\text{ for } F_{i,j}>0$$
    
    Asymptotic standard error:
    $$ASE\\left(U_{Y|X}\\right)_1 = \\frac{\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_Y\\times\\ln\\left(\\frac{F_{i,j}}{R_i}\\right) + \\left(H_X - H_{XY}\\right)\\times\\ln\\left(\\frac{C_j}{n}\\right)\\right)^2}}{n\\times H_Y^2}$$
    $$ASE\\left(U_{X|Y}\\right)_1 = \\frac{\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_X\\times\\ln\\left(\\frac{F_{i,j}}{C_j}\\right) + \\left(H_Y - H_{XY}\\right)\\times\\ln\\left(\\frac{R_i}{n}\\right)\\right)^2}}{n\\times H_X^2}$$
    $$ASE\\left(U\\right)_1 = \\frac{2\\times\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_{XY}\\times\\ln\\left(\\frac{R_i\\times C_j}{n^2}\\right) - \\left(H_X + H_{Y}\\right)\\times\\ln\\left(\\frac{F_{i,j}}{n}\\right)\\right)^2}}{n\\times \\left(H_X+H_Y\\right)^2}$$
    
    $$ASE\\left(U_{Y|X}\\right)_0 = \\frac{\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times H_Y}$$
    $$ASE\\left(U_{X|Y}\\right)_0 = \\frac{\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times H_X}$$
    $$ASE\\left(U\\right)_0 = \\frac{2\\times\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times \\left(H_X+H_Y\\right)}$$
    
    With:
    $$P = \\sum_{i,j} F_{i,j}\\times\\left(\\ln\\left(\\frac{R_i\\times C_j}{n\\times F_{i,j}}\\right)\\right)^2$$
    
    The p-value (significance):
    $$T_{Y|X} = \\frac{U_{Y|X}}{ASE\\left(U_{Y|X}\\right)_0}$$
    $$T_{X|Y} = \\frac{U_{X|Y}}{ASE\\left(U_{X|Y}\\right)_0}$$
    $$T = \\frac{U}{ASE\\left(U\\right)_0}$$
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|T\\right|\\right)\\right)$$
    
    All sums only run over cells (and margins) with a count above zero; an empty cell contributes nothing.
    
    If a ratio is undefined, for example because one of the fields has only a single category (entropy of zero) or because ASE_0 is zero, the result is reported as NaN (0/0) or +/-infinity (x/0) instead of raising an error.
    
    *Symbols used:*
    
    * \\(F_{i,j}\\), the absolute frequency (observed count) from row i and column j
    * \\(c\\), the number of columns
    * \\(r\\), the number of rows
    * \\(R_i\\), row total of row i, it can be calculated using \\(R_i=\\sum_{j=1}^{c}F_{i,j}\\)
    * \\(C_j\\), column total of column j, it can be calculated using \\(C_j=\\sum_{i=1}^{r}F_{i,j}\\)
    * \\(n\\) = the total number of cases, it can be calculated in various ways, \\(n=\\sum_{j=1}^{c}C_j=\\sum_{i=1}^{r}R_i=\\sum_{i=1}^{r}\\sum_{j=1}^{c}F_{i,j}\\)
    * \\(\\Phi\\left(\\ldots\\right)\\), the cumulative distribution function of the standard normal distribution
    
    The formulas were taken from SPSS 15 Algorithms (2006, p. 117), unclear what the original source is, probably Theil (1970) or Theil (1972)
    
    References
    ----------
    SPSS. (2006). SPSS 15.0 algorithms.
    
    Theil, H. (1970). On the estimation of relationships involving qualitative variables. *American Journal of Sociology, 76*(1), 103–154. doi:10.1086/224909
    
    Theil, H. (1972). *Statistical decomposition analysis: With applications in the social and administrative sciences* (Vol. 14). North-Holland Pub. Co.; American Elsevier Pub. Co.
    
    Wikipedia. (2022). Uncertainty coefficient. In Wikipedia. https://en.wikipedia.org/w/index.php?title=Uncertainty_coefficient&oldid=1099636947#Definition

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    def ratio(num, den):
        # division that never raises: x/0 -> +/-inf, 0/0 (or nan/0) -> nan
        if den != 0:
            return num / den
        if num > 0:
            return float("inf")
        if num < 0:
            return float("-inf")
        return float("nan")
    
    #create the cross table, its last row and last column hold the totals
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
    
    #basic counts
    nrows = ct.shape[0] - 1
    ncols = ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    nf = float(n)
    
    #observed counts and margin totals, all read by position (.iloc):
    #the margins are indexed by category label, so a plain [i] would be a
    #label lookup whenever the categories themselves are integers
    counts = [[float(ct.iloc[i, j]) for j in range(ncols)] for i in range(nrows)]
    rs = [float(ct.iloc[i, ncols]) for i in range(nrows)]
    cs = [float(ct.iloc[nrows, j]) for j in range(ncols)]
    
    #entropies, zero counts are skipped since x*ln(x) -> 0 and log(0) is undefined
    hx = -sum(r / nf * log(r / nf) for r in rs if r > 0)
    hy = -sum(c / nf * log(c / nf) for c in cs if c > 0)
    h = -sum(f / nf * log(f / nf) for row in counts for f in row if f > 0)
    
    #the shared numerator H_X + H_Y - H_XY
    info = hx + hy - h
    
    #U values
    uyx = ratio(info, hy)
    uxy = ratio(info, hx)
    u = 2 * ratio(info, hx + hy)
    
    #sums for ase1 and q for ase0
    ase1 = 0
    ase1xy = 0
    ase1yx = 0
    q = 0
    for i in range(nrows):
        for j in range(ncols):
            f = counts[i][j]
            if f <= 0:
                #every term is weighted by the cell count, so empty cells add nothing
                continue
            ase1yx = ase1yx + f * (hy * log(f / rs[i]) + (hx - h) * log(cs[j] / nf))**2
            ase1xy = ase1xy + f * (hx * log(f / cs[j]) + (hy - h) * log(rs[i] / nf))**2
            ase1 = ase1 + f * (h * log(rs[i] * cs[j] / nf**2) - (hx + hy) * log(f / nf))**2
            q = q + f * (log(rs[i] * cs[j] / (nf * f)))**2
    ase1yx = ratio(ase1yx**0.5, nf * hy**2)
    ase1xy = ratio(ase1xy**0.5, nf * hx**2)
    ase1 = ratio(2 * ase1**0.5, nf * (hx + hy)**2)
    
    #ase0, q - n*info^2 cannot be negative in exact arithmetic (Cauchy-Schwarz),
    #so anything below zero is rounding noise and is clamped
    root = max(q - nf * info**2, 0.0)**0.5
    ase0yx = ratio(root, nf * hy)
    ase0xy = ratio(root, nf * hx)
    ase0 = ratio(2 * root, nf * (hx + hy))
    
    #t
    tyx = ratio(uyx, ase0yx)
    txy = ratio(uxy, ase0xy)
    t = ratio(u, ase0)
    
    #p-values (two-sided, standard normal)
    snd = NormalDist()
    p = 2 * (1 - snd.cdf(abs(t)))
    pyx = 2 * (1 - snd.cdf(abs(tyx)))
    pxy = 2 * (1 - snd.cdf(abs(txy)))
    
    #the results
    ver = ["symmetric", "field1", "field2"]
    us = [u, uxy, uyx]
    ns = [n, n, n]
    
    ase0s = [ase0, ase0xy, ase0yx]
    ase1s = [ase1, ase1xy, ase1yx]
    zs = [t, txy, tyx]
    pvalues = [p, pxy, pyx]
    
    colNames = ["dependent", "n", "value", "ASE_0", "ASE_1", "statistic", "p-value"]
    results = pd.DataFrame(list(zip(ver, ns, us, ase0s, ase1s, zs, pvalues)), columns=colNames)
    
    return results

Functions

def es_theil_u(field1, field2, categories1=None, categories2=None)

Theil U / Uncertainty Coefficient

Theil U is a measure of nominal association. According to Wikipedia: "given Y, what fraction of the bits of X can we predict? In this case we can think of X as containing the total information, and of Y as allowing one to predict part of such information." (2022).

The term Theil U can also refer to two completely different measures, often used in forecasting and sometimes referred to as index of inequality.

Parameters

field1 : list or pandas series
the first categorical field
field2 : list or pandas series
the second categorical field
categories1 : list or dictionary, optional
order and/or selection for categories of field1
categories2 : list or dictionary, optional
order and/or selection for categories of field2

Returns

A dataframe with:
 
  • dependent, the field used as dependent variable
  • n, the sample size
  • value, the Theil U value
  • ASE_0, the asymptotic standard error assuming the null hypothesis
  • ASE_1, the asymptotic standard error assuming the alternative hypothesis
  • statistic, the z-value
  • p-value, the significance (p-value)

Notes

The formula used (SPSS, 2006, p. 117):

$$U_{Y|X} = \frac{H_X+H_Y-H_{XY}}{H_Y}$$

$$U_{X|Y} = \frac{H_X+H_Y-H_{XY}}{H_X}$$

$$U = 2\times\frac{H_X+H_Y-H_{XY}}{H_X+H_Y}$$

With:

$$H_X = -\sum_i \left(\frac{R_i}{n}\times\ln\left(\frac{R_i}{n}\right)\right)$$

$$H_Y = -\sum_j \left(\frac{C_j}{n}\times\ln\left(\frac{C_j}{n}\right)\right)$$

$$H_{XY} = -\sum_{i,j} \left(\frac{F_{i,j}}{n}\times\ln\left(\frac{F_{i,j}}{n}\right)\right), \text{ for } F_{i,j}>0$$

Asymptotic standard error:

$$ASE\left(U_{Y|X}\right)_1 = \frac{\sqrt{\sum_{i,j} F_{i,j}\times\left(H_Y\times\ln\left(\frac{F_{i,j}}{R_i}\right) + \left(H_X - H_{XY}\right)\times\ln\left(\frac{C_j}{n}\right)\right)^2}}{n\times H_Y^2}$$

$$ASE\left(U_{X|Y}\right)_1 = \frac{\sqrt{\sum_{i,j} F_{i,j}\times\left(H_X\times\ln\left(\frac{F_{i,j}}{C_j}\right) + \left(H_Y - H_{XY}\right)\times\ln\left(\frac{R_i}{n}\right)\right)^2}}{n\times H_X^2}$$

$$ASE\left(U\right)_1 = \frac{2\times\sqrt{\sum_{i,j} F_{i,j}\times\left(H_{XY}\times\ln\left(\frac{R_i\times C_j}{n^2}\right) - \left(H_X + H_{Y}\right)\times\ln\left(\frac{F_{i,j}}{n}\right)\right)^2}}{n\times \left(H_X+H_Y\right)^2}$$

$$ASE\left(U_{Y|X}\right)_0 = \frac{\sqrt{P - n\times\left(H_X+H_Y-H_{XY}\right)^2}}{n\times H_Y}$$

$$ASE\left(U_{X|Y}\right)_0 = \frac{\sqrt{P - n\times\left(H_X+H_Y-H_{XY}\right)^2}}{n\times H_X}$$

$$ASE\left(U\right)_0 = \frac{2\times\sqrt{P - n\times\left(H_X+H_Y-H_{XY}\right)^2}}{n\times \left(H_X+H_Y\right)}$$

With:

$$P = \sum_{i,j} F_{i,j}\times\left(\ln\left(\frac{R_i\times C_j}{n\times F_{i,j}}\right)\right)^2$$

The p-value (significance):

$$T_{Y|X} = \frac{U_{Y|X}}{ASE\left(U_{Y|X}\right)_0}$$

$$T_{X|Y} = \frac{U_{X|Y}}{ASE\left(U_{X|Y}\right)_0}$$

$$T = \frac{U}{ASE\left(U\right)_0}$$

$$sig. = 2\times\left(1 - \Phi\left(\left|T\right|\right)\right)$$

Symbols used:

  • F_{i,j}, the absolute frequency (observed count) from row i and column j
  • c, the number of columns
  • r, the number of rows
  • R_i, row total of row i, it can be calculated using R_i=\sum_{j=1}^{c}F_{i,j}
  • C_j, column total of column j, it can be calculated using C_j=\sum_{i=1}^{r}F_{i,j}
  • n = the total number of cases, it can be calculated in various ways, n=\sum_{j=1}^{c}C_j=\sum_{i=1}^{r}R_i=\sum_{i=1}^{r}\sum_{j=1}^{c}F_{i,j}
  • \Phi\left(\ldots\right), the cumulative distribution function of the standard normal distribution

The formulas were taken from SPSS 15 Algorithms (2006, p. 117); it is unclear what the original source is, probably Theil (1970) or Theil (1972).

References

SPSS. (2006). SPSS 15.0 algorithms.

Theil, H. (1970). On the estimation of relationships involving qualitative variables. American Journal of Sociology, 76(1), 103–154. doi:10.1086/224909

Theil, H. (1972). Statistical decomposition analysis: With applications in the social and administrative sciences (Vol. 14). North-Holland Pub. Co.; American Elsevier Pub. Co.

Wikipedia. (2022). Uncertainty coefficient. In Wikipedia. https://en.wikipedia.org/w/index.php?title=Uncertainty_coefficient&oldid=1099636947#Definition

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Expand source code
def es_theil_u(field1, field2, categories1=None, categories2=None):
    '''
    Theil U / Uncertainty Coefficient
    ---------------------------------
    Theil U is a measure of nominal association. According to Wikipedia: "given Y, what fraction of the bits of X can we predict? In this case we can think of X as containing the total information, and of Y as allowing one to predict part of such information." (2022).
    
    The term Theil U can also refer to two completely different measures, often used in forecasting and sometimes referred to as index of inequality.
    
    Parameters
    ----------
    field1 : list or pandas series
        the first categorical field (the rows of the cross table, X)
    field2 : list or pandas series
        the second categorical field (the columns of the cross table, Y)
    categories1 : list or dictionary, optional
        order and/or selection for categories of field1
    categories2 : list or dictionary, optional
        order and/or selection for categories of field2
        
    Returns
    -------
    A dataframe with one row for each version (symmetric, field1 as dependent, field2 as dependent) and the columns:
    
    * *dependent*, the field used as dependent variable
    * *n*, the sample size
    * *value*, the Theil U value
    * *ASE_0*, the asymptotic standard error assuming the null hypothesis
    * *ASE_1*, the asymptotic standard error assuming the alternative hypothesis
    * *statistic*, the z-value
    * *p-value*, the significance (p-value)
    
    Notes
    -----
    The formula used (SPSS, 2006, p. 117):
    $$U_{Y|X} = \\frac{H_X+H_Y-H_{XY}}{H_Y}$$
    $$U_{X|Y} = \\frac{H_X+H_Y-H_{XY}}{H_X}$$
    $$U = 2\\times\\frac{H_X+H_Y-H_{XY}}{H_X+H_Y}$$
    
    With:
    $$H_X = -\\sum_i \\left(\\frac{R_i}{n}\\times\\ln\\left(\\frac{R_i}{n}\\right)\\right)$$
    $$H_Y = -\\sum_j \\left(\\frac{C_j}{n}\\times\\ln\\left(\\frac{C_j}{n}\\right)\\right)$$
    $$H_{XY} = -\\sum_{i,j} \\left(\\frac{F_{i,j}}{n}\\times\\ln\\left(\\frac{F_{i,j}}{n}\\right)\\right), \\text{ for } F_{i,j}>0$$
    
    Asymptotic standard error:
    $$ASE\\left(U_{Y|X}\\right)_1 = \\frac{\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_Y\\times\\ln\\left(\\frac{F_{i,j}}{R_i}\\right) + \\left(H_X - H_{XY}\\right)\\times\\ln\\left(\\frac{C_j}{n}\\right)\\right)^2}}{n\\times H_Y^2}$$
    $$ASE\\left(U_{X|Y}\\right)_1 = \\frac{\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_X\\times\\ln\\left(\\frac{F_{i,j}}{C_j}\\right) + \\left(H_Y - H_{XY}\\right)\\times\\ln\\left(\\frac{R_i}{n}\\right)\\right)^2}}{n\\times H_X^2}$$
    $$ASE\\left(U\\right)_1 = \\frac{2\\times\\sqrt{\\sum_{i,j} F_{i,j}\\times\\left(H_{XY}\\times\\ln\\left(\\frac{R_i\\times C_j}{n^2}\\right) - \\left(H_X + H_{Y}\\right)\\times\\ln\\left(\\frac{F_{i,j}}{n}\\right)\\right)^2}}{n\\times \\left(H_X+H_Y\\right)^2}$$
    
    $$ASE\\left(U_{Y|X}\\right)_0 = \\frac{\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times H_Y}$$
    $$ASE\\left(U_{X|Y}\\right)_0 = \\frac{\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times H_X}$$
    $$ASE\\left(U\\right)_0 = \\frac{2\\times\\sqrt{P - n\\times\\left(H_X+H_Y-H_{XY}\\right)^2}}{n\\times \\left(H_X+H_Y\\right)}$$
    
    With:
    $$P = \\sum_{i,j} F_{i,j}\\times\\left(\\ln\\left(\\frac{R_i\\times C_j}{n\\times F_{i,j}}\\right)\\right)^2$$
    
    The p-value (significance):
    $$T_{Y|X} = \\frac{U_{Y|X}}{ASE\\left(U_{Y|X}\\right)_0}$$
    $$T_{X|Y} = \\frac{U_{X|Y}}{ASE\\left(U_{X|Y}\\right)_0}$$
    $$T = \\frac{U}{ASE\\left(U\\right)_0}$$
    $$sig. = 2\\times\\left(1 - \\Phi\\left(\\left|T\\right|\\right)\\right)$$
    
    All sums only run over cells (and margins) with a count above zero; an empty cell contributes nothing.
    
    If a ratio is undefined, for example because one of the fields has only a single category (entropy of zero) or because ASE_0 is zero, the result is reported as NaN (0/0) or +/-infinity (x/0) instead of raising an error.
    
    *Symbols used:*
    
    * \\(F_{i,j}\\), the absolute frequency (observed count) from row i and column j
    * \\(c\\), the number of columns
    * \\(r\\), the number of rows
    * \\(R_i\\), row total of row i, it can be calculated using \\(R_i=\\sum_{j=1}^{c}F_{i,j}\\)
    * \\(C_j\\), column total of column j, it can be calculated using \\(C_j=\\sum_{i=1}^{r}F_{i,j}\\)
    * \\(n\\) = the total number of cases, it can be calculated in various ways, \\(n=\\sum_{j=1}^{c}C_j=\\sum_{i=1}^{r}R_i=\\sum_{i=1}^{r}\\sum_{j=1}^{c}F_{i,j}\\)
    * \\(\\Phi\\left(\\ldots\\right)\\), the cumulative distribution function of the standard normal distribution
    
    The formulas were taken from SPSS 15 Algorithms (2006, p. 117), unclear what the original source is, probably Theil (1970) or Theil (1972)
    
    References
    ----------
    SPSS. (2006). SPSS 15.0 algorithms.
    
    Theil, H. (1970). On the estimation of relationships involving qualitative variables. *American Journal of Sociology, 76*(1), 103–154. doi:10.1086/224909
    
    Theil, H. (1972). *Statistical decomposition analysis: With applications in the social and administrative sciences* (Vol. 14). North-Holland Pub. Co.; American Elsevier Pub. Co.
    
    Wikipedia. (2022). Uncertainty coefficient. In Wikipedia. https://en.wikipedia.org/w/index.php?title=Uncertainty_coefficient&oldid=1099636947#Definition

    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076
    
    '''
    def ratio(num, den):
        # division that never raises: x/0 -> +/-inf, 0/0 (or nan/0) -> nan
        if den != 0:
            return num / den
        if num > 0:
            return float("inf")
        if num < 0:
            return float("-inf")
        return float("nan")
    
    #create the cross table, its last row and last column hold the totals
    ct = tab_cross(field1, field2, categories1, categories2, totals="include")
    
    #basic counts
    nrows = ct.shape[0] - 1
    ncols = ct.shape[1] - 1
    n = ct.iloc[nrows, ncols]
    nf = float(n)
    
    #observed counts and margin totals, all read by position (.iloc):
    #the margins are indexed by category label, so a plain [i] would be a
    #label lookup whenever the categories themselves are integers
    counts = [[float(ct.iloc[i, j]) for j in range(ncols)] for i in range(nrows)]
    rs = [float(ct.iloc[i, ncols]) for i in range(nrows)]
    cs = [float(ct.iloc[nrows, j]) for j in range(ncols)]
    
    #entropies, zero counts are skipped since x*ln(x) -> 0 and log(0) is undefined
    hx = -sum(r / nf * log(r / nf) for r in rs if r > 0)
    hy = -sum(c / nf * log(c / nf) for c in cs if c > 0)
    h = -sum(f / nf * log(f / nf) for row in counts for f in row if f > 0)
    
    #the shared numerator H_X + H_Y - H_XY
    info = hx + hy - h
    
    #U values
    uyx = ratio(info, hy)
    uxy = ratio(info, hx)
    u = 2 * ratio(info, hx + hy)
    
    #sums for ase1 and q for ase0
    ase1 = 0
    ase1xy = 0
    ase1yx = 0
    q = 0
    for i in range(nrows):
        for j in range(ncols):
            f = counts[i][j]
            if f <= 0:
                #every term is weighted by the cell count, so empty cells add nothing
                continue
            ase1yx = ase1yx + f * (hy * log(f / rs[i]) + (hx - h) * log(cs[j] / nf))**2
            ase1xy = ase1xy + f * (hx * log(f / cs[j]) + (hy - h) * log(rs[i] / nf))**2
            ase1 = ase1 + f * (h * log(rs[i] * cs[j] / nf**2) - (hx + hy) * log(f / nf))**2
            q = q + f * (log(rs[i] * cs[j] / (nf * f)))**2
    ase1yx = ratio(ase1yx**0.5, nf * hy**2)
    ase1xy = ratio(ase1xy**0.5, nf * hx**2)
    ase1 = ratio(2 * ase1**0.5, nf * (hx + hy)**2)
    
    #ase0, q - n*info^2 cannot be negative in exact arithmetic (Cauchy-Schwarz),
    #so anything below zero is rounding noise and is clamped
    root = max(q - nf * info**2, 0.0)**0.5
    ase0yx = ratio(root, nf * hy)
    ase0xy = ratio(root, nf * hx)
    ase0 = ratio(2 * root, nf * (hx + hy))
    
    #t
    tyx = ratio(uyx, ase0yx)
    txy = ratio(uxy, ase0xy)
    t = ratio(u, ase0)
    
    #p-values (two-sided, standard normal)
    snd = NormalDist()
    p = 2 * (1 - snd.cdf(abs(t)))
    pyx = 2 * (1 - snd.cdf(abs(tyx)))
    pxy = 2 * (1 - snd.cdf(abs(txy)))
    
    #the results
    ver = ["symmetric", "field1", "field2"]
    us = [u, uxy, uyx]
    ns = [n, n, n]
    
    ase0s = [ase0, ase0xy, ase0yx]
    ase1s = [ase1, ase1xy, ase1yx]
    zs = [t, txy, tyx]
    pvalues = [p, pxy, pyx]
    
    colNames = ["dependent", "n", "value", "ASE_0", "ASE_1", "statistic", "p-value"]
    results = pd.DataFrame(list(zip(ver, ns, us, ase0s, ase1s, zs, pvalues)), columns=colNames)
    
    return results