Module stikpetP.visualisations.vis_stem_and_leaf_b2b

Expand source code
import pandas as pd
from math import floor, log10

def vi_stem_and_leaf_b2b(catField, scaleField, categories=None, key_factor=None):
    '''
    Back-to-Back Stem-and-Leaf Plot
    -------------------------------
    The Cambridge dictionary of statistics defines a back-to-back stem-leaf plot as "a method for comparing two distributions by 'hanging' the two sets of leaves in the stem-and-leaf plots of the two sets of data, off either side of the same stem" (Everitt, 1998, p. 22).
    
    It is a diagram that can be used to show the scores from two independent samples, i.e. when there is one binary variable and one scale variable.

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/stemAndLeafDisplay.html)

    Parameters
    ----------
    catField : list or pandas Series
        the categories
    scaleField : list or pandas Series
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField (read as categories[0] and categories[1]). If not set, the two most frequent categories are used
    key_factor : float, optional
        the value the stem needs to be multiplied with. If not set, the largest power of ten that does not exceed the largest absolute score is used (1 if all scores are 0)
        
    Returns
    -------
    None, the diagram is printed in the console.

    Raises
    ------
    IndexError
        if categories is not set and catField holds fewer than two categories
    ValueError
        if there are no scores for the two categories

    Notes
    -----
    Rows with a missing category or a missing score are removed before the diagram is made.
    
    A score is shown as stem * key_factor + leaf, where the stem is the score divided by key_factor, truncated towards zero.

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back histogram](../visualisations/vis_histogram_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html), [butterfly chart/pyramid chart](../visualisations/vis_butterfly_bin.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)

    References 
    ----------
    Everitt, B. S. (1998). *The Cambridge dictionary of statistics*. Cambridge University Press.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_stem_and_leaf_b2b(df['Gen_Gender'], df['Over_Grade'])

    '''
    
    #convert to pandas series if needed
    if isinstance(catField, list):
        catField = pd.Series(catField)
    
    if isinstance(scaleField, list):
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe, and remove incomplete pairs
    df = pd.concat([catField, scaleField], axis=1)
    df = df.dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]
    
    #the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        #value_counts orders by frequency, so these are the two most frequent
        found = cat_col.value_counts().index
        if len(found) < 2:
            raise IndexError("catField needs at least two categories for a back-to-back stem-and-leaf plot")
        cat1 = found[0]
        cat2 = found[1]
    
    #separate the (sorted) scores for each category
    X = sorted(score_col[cat_col == cat1])
    Y = sorted(score_col[cat_col == cat2])
    
    A = X + Y
    if not A:
        raise ValueError("no scores found for the two categories")
    
    if key_factor is None:
        #use the largest magnitude, so negative scores and a maximum of 0 also work
        largest = max(abs(a) for a in A)
        key_factor = 10**floor(log10(largest)) if largest > 0 else 1
    
    #collect per stem the leaves of the left (index 0) and right (index 1) side
    leafs = {}
    for side, scores in enumerate((X, Y)):
        for score in scores:
            stem = int(score/key_factor)
            leafs.setdefault(stem, [[], []])[side].append(score - stem*key_factor)
    stems = sorted(leafs)
    
    #column widths are based on the widest leaf and stem actually shown
    leaf_width = max(len(str(leaf)) for pair in leafs.values() for side in pair for leaf in side)
    stem_width = max(len(str(stem)) for stem in stems)
    
    #left leaves are reversed so the smallest leaf is closest to the stem
    left_rows = {}
    right_rows = {}
    for stem in stems:
        left_rows[stem] = ' '.join(str(leaf).rjust(leaf_width) for leaf in reversed(leafs[stem][0]))
        right_rows[stem] = ' '.join(str(leaf).rjust(leaf_width) for leaf in leafs[stem][1])
    
    #labels are converted to text, so numeric or boolean categories also work
    X_label = str(cat1)
    Y_label = str(cat2)
    
    #the left column has to fit both the widest row of leaves and the label
    left_width = max(len(X_label), max(len(row) for row in left_rows.values()))
    
    #header uses the same layout as the rows, so the bars line up
    print(f"{X_label:>{left_width}} | {'s':<{stem_width}} | {Y_label}")
    # Print stem-and-leaf plot
    for stem in stems:
        print(f"{left_rows[stem]:>{left_width}} | {str(stem).rjust(stem_width)} | {right_rows[stem]}")
    print('Key: 1 | 8 = ' + str(key_factor+8))

    return

Functions

def vi_stem_and_leaf_b2b(catField, scaleField, categories=None, key_factor=None)

Back-to-Back Stem-and-Leaf Plot

The Cambridge dictionary of statistics defines a back-to-back stem-leaf plot as "a method for comparing two distributions by 'hanging' the two sets of leaves in the stem-and-leaf plots of the two sets of data, off either side of the same stem" (Everitt, 1998, p. 22).

It is a diagram that can be used to show the scores from two independent samples, i.e. when there is one binary variable and one scale variable.

The visualisation is also described at PeterStatistics.com

Parameters

catField : list or dataframe
the categories
scaleField : list or dataframe
the scores
categories : list or dictionary, optional
the two categories to use from catField. If not set, the two most frequent categories are used
key_factor : float, optional
the value the stem needs to be multiplied with

Returns

prints the diagram in console.

Alternatives

To display the results of a binary and scale variable, alternative visualisations include: overlaid histogram, back-to-back histogram, split histogram, split box-plot, butterfly chart/pyramid chart

Next

After visualizing the data, you might want to run a test: Student t, Welch t, Trimmed means, Yuen-Welch, Z test

References

Everitt, B. S. (1998). The Cambridge dictionary of statistics. Cambridge University Press.

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Examples

>>> import pandas as pd
>>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
>>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
>>> vi_stem_and_leaf_b2b(df['Gen_Gender'], df['Over_Grade'])
Expand source code
def vi_stem_and_leaf_b2b(catField, scaleField, categories=None, key_factor=None):
    '''
    Back-to-Back Stem-and-Leaf Plot
    -------------------------------
    The Cambridge dictionary of statistics defines a back-to-back stem-leaf plot as "a method for comparing two distributions by 'hanging' the two sets of leaves in the stem-and-leaf plots of the two sets of data, off either side of the same stem" (Everitt, 1998, p. 22).
    
    It is a diagram that can be used to show the scores from two independent samples, i.e. when there is one binary variable and one scale variable.

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/stemAndLeafDisplay.html)

    Parameters
    ----------
    catField : list or pandas Series
        the categories
    scaleField : list or pandas Series
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField (read as categories[0] and categories[1]). If not set, the two most frequent categories are used
    key_factor : float, optional
        the value the stem needs to be multiplied with. If not set, the largest power of ten that does not exceed the largest absolute score is used (1 if all scores are 0)
        
    Returns
    -------
    None, the diagram is printed in the console.

    Raises
    ------
    IndexError
        if categories is not set and catField holds fewer than two categories
    ValueError
        if there are no scores for the two categories

    Notes
    -----
    Rows with a missing category or a missing score are removed before the diagram is made.
    
    A score is shown as stem * key_factor + leaf, where the stem is the score divided by key_factor, truncated towards zero.

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back histogram](../visualisations/vis_histogram_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html), [butterfly chart/pyramid chart](../visualisations/vis_butterfly_bin.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)

    References 
    ----------
    Everitt, B. S. (1998). *The Cambridge dictionary of statistics*. Cambridge University Press.
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_stem_and_leaf_b2b(df['Gen_Gender'], df['Over_Grade'])

    '''
    
    #convert to pandas series if needed
    if isinstance(catField, list):
        catField = pd.Series(catField)
    
    if isinstance(scaleField, list):
        scaleField = pd.Series(scaleField)
    
    #combine as one dataframe, and remove incomplete pairs
    df = pd.concat([catField, scaleField], axis=1)
    df = df.dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]
    
    #the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        #value_counts orders by frequency, so these are the two most frequent
        found = cat_col.value_counts().index
        if len(found) < 2:
            raise IndexError("catField needs at least two categories for a back-to-back stem-and-leaf plot")
        cat1 = found[0]
        cat2 = found[1]
    
    #separate the (sorted) scores for each category
    X = sorted(score_col[cat_col == cat1])
    Y = sorted(score_col[cat_col == cat2])
    
    A = X + Y
    if not A:
        raise ValueError("no scores found for the two categories")
    
    if key_factor is None:
        #use the largest magnitude, so negative scores and a maximum of 0 also work
        largest = max(abs(a) for a in A)
        key_factor = 10**floor(log10(largest)) if largest > 0 else 1
    
    #collect per stem the leaves of the left (index 0) and right (index 1) side
    leafs = {}
    for side, scores in enumerate((X, Y)):
        for score in scores:
            stem = int(score/key_factor)
            leafs.setdefault(stem, [[], []])[side].append(score - stem*key_factor)
    stems = sorted(leafs)
    
    #column widths are based on the widest leaf and stem actually shown
    leaf_width = max(len(str(leaf)) for pair in leafs.values() for side in pair for leaf in side)
    stem_width = max(len(str(stem)) for stem in stems)
    
    #left leaves are reversed so the smallest leaf is closest to the stem
    left_rows = {}
    right_rows = {}
    for stem in stems:
        left_rows[stem] = ' '.join(str(leaf).rjust(leaf_width) for leaf in reversed(leafs[stem][0]))
        right_rows[stem] = ' '.join(str(leaf).rjust(leaf_width) for leaf in leafs[stem][1])
    
    #labels are converted to text, so numeric or boolean categories also work
    X_label = str(cat1)
    Y_label = str(cat2)
    
    #the left column has to fit both the widest row of leaves and the label
    left_width = max(len(X_label), max(len(row) for row in left_rows.values()))
    
    #header uses the same layout as the rows, so the bars line up
    print(f"{X_label:>{left_width}} | {'s':<{stem_width}} | {Y_label}")
    # Print stem-and-leaf plot
    for stem in stems:
        print(f"{left_rows[stem]:>{left_width}} | {str(stem).rjust(stem_width)} | {right_rows[stem]}")
    print('Key: 1 | 8 = ' + str(key_factor+8))

    return