Module stikpetP.visualisations.vis_histogram_b2b

Expand source code
import matplotlib.pyplot as plt
import pandas as pd

def vi_histogram_b2b(catField, scaleField, categories=None, bins=None, equal_bins=True, density=False, **kwargs):
    '''
    Back-to-Back Histogram (Pyramid chart)
    ---------------------------------------

    This function creates a simple back-to-back histogram. This is sometimes also referred to as a Pyramid chart. 

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/histogram.html)

    Parameters
    ----------
    catField : list or dataframe 
        the categories
    scaleField : list or dataframe 
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField, looked up as categories[0] and categories[1]. If not set the two most frequent categories will be used
    bins : list, optional
        bin edges to be used, or any of the pre-set options from pyplot.hist() bins (an integer or a string).
    equal_bins : bool, optional
        use the same bins for each sample. If True and bins is not a list of edges, the edges are determined from the scores of both categories combined. If False each sample gets its own bins (unless bins is a list of edges).
    density : bool, optional
        use frequency density instead of counts
    kwargs : other parameters, passed on to the matplotlib hist() call of both panels
        if no color is given the first category is drawn in orange
        
    Returns
    -------
    back-to-back histogram (shown with pyplot.show(), the function itself returns nothing)

    Raises
    ------
    IndexError
        if categories is not set and fewer than two categories are found in catField

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back stem-and-leaf display](../visualisations/vis_stem_and_leaf_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html), [butterfly chart/pyramid chart](../visualisations/vis_butterfly_bin.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_histogram_b2b(df['Gen_Gender'], df['Over_Grade'], edgecolor='blue')
    '''
    # local import so the block does not rely on a file-level numpy import
    import numpy as np

    #convert to pandas series if needed (lists, tuples, arrays, ...)
    if not isinstance(catField, (pd.Series, pd.DataFrame)):
        catField = pd.Series(catField)

    if not isinstance(scaleField, (pd.Series, pd.DataFrame)):
        scaleField = pd.Series(scaleField)

    #combine as one dataframe and drop rows with a missing category or score
    df = pd.concat([catField, scaleField], axis=1).dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]

    #the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        # value_counts() sorts by frequency, so these are the two most frequent
        found = cat_col.value_counts().index
        cat1 = found[0]
        cat2 = found[1]

    var_name = score_col.name

    #separate the scores for each category, as floats
    X = [float(x) for x in score_col[cat_col == cat1]]
    Y = [float(y) for y in score_col[cat_col == cat2]]

    # Determine the bins to hand to both hist() calls.
    # A scalar spec (None, an int or a string rule) has no fixed edges yet; when
    # equal bins are requested the edges are derived from the COMBINED scores so
    # that no value of either sample falls outside the shared range. This is done
    # with numpy rather than a throw-away plt.hist()/plt.close(), which would draw
    # into (and close) whatever figure the caller currently has open.
    bin_spec = bins
    if equal_bins and np.ndim(bins) == 0:
        requested = plt.rcParams['hist.bins'] if bins is None else bins
        bin_spec = np.histogram_bin_edges(X + Y, bins=requested)

    fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(9, 6))

    # orange is only a default for the left panel; a color passed by the caller wins
    left_style = {'color': 'orange', **kwargs}

    # the edges returned by hist() are deliberately NOT fed into the second call,
    # otherwise equal_bins=False would still reuse the first sample's bins
    n_X, _, _ = axes[0].hist(X, orientation='horizontal', bins=bin_spec, density=density, **left_style)
    axes[0].set(title=cat1, ylabel=var_name)

    n_Y, _, _ = axes[1].hist(Y, orientation='horizontal', bins=bin_spec, density=density, **kwargs)
    axes[1].set(title=cat2)

    # same horizontal scale on both sides, based on the tallest bar
    heights = np.concatenate([np.ravel(n_X), np.ravel(n_Y)])
    heights = heights[np.isfinite(heights)]
    peak = float(heights.max()) if heights.size else 0.0
    if density and peak > 0:
        # densities are usually far below 1, a fixed +0.5 would dwarf the bars
        xLim = peak * 1.05
    else:
        xLim = peak + 0.5

    # reversed limits mirror the left panel
    axes[0].set_xlim([xLim, 0])
    axes[1].set_xlim([0, xLim])

    plt.subplots_adjust(wspace=0, hspace=0)
    if density:
        plt.xlabel('frequency density')
    else:
        plt.xlabel('count')
    plt.show()

Functions

def vi_histogram_b2b(catField, scaleField, categories=None, bins=None, equal_bins=True, density=False, **kwargs)

Back-to-Back Histogram (Pyramid chart)

This function creates a simple back-to-back histogram. This is sometimes also referred to as a Pyramid chart.

The visualisation is also described at PeterStatistics.com

Parameters

catField : list or dataframe
the categories
scaleField : list or dataframe
the scores
categories : list or dictionary, optional
the two categories to use from catField. If not set, the two most frequent categories will be used
bins : list, optional
bin edges to be used, or any of the pre-set options from pyplot.hist() bins (an integer or a string).
equal_bins : bool, optional
use the same bins for each sample
density : bool, optional
use frequency density instead of counts
kwargs : other parameters, passed on to the matplotlib hist() call of both panels
 

Returns

back-to-back histogram
 

Alternatives

To display the results of a binary and scale variable, alternative visualisations include: overlaid histogram, back-to-back stem-and-leaf display, split histogram, split box-plot, butterfly chart/pyramid chart

Next

After visualizing the data, you might want to run a test: Student t, Welch t, Trimmed means, Yuen-Welch, Z test

Author

Made by P. Stikker

Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076

Examples

>>> import pandas as pd
>>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
>>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
>>> vi_histogram_b2b(df['Gen_Gender'], df['Over_Grade'], edgecolor='blue')
Expand source code
def vi_histogram_b2b(catField, scaleField, categories=None, bins=None, equal_bins=True, density=False, **kwargs):
    '''
    Back-to-Back Histogram (Pyramid chart)
    ---------------------------------------

    This function creates a simple back-to-back histogram. This is sometimes also referred to as a Pyramid chart. 

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/histogram.html)

    Parameters
    ----------
    catField : list or dataframe 
        the categories
    scaleField : list or dataframe 
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField, looked up as categories[0] and categories[1]. If not set the two most frequent categories will be used
    bins : list, optional
        bin edges to be used, or any of the pre-set options from pyplot.hist() bins (an integer or a string).
    equal_bins : bool, optional
        use the same bins for each sample. If True and bins is not a list of edges, the edges are determined from the scores of both categories combined. If False each sample gets its own bins (unless bins is a list of edges).
    density : bool, optional
        use frequency density instead of counts
    kwargs : other parameters, passed on to the matplotlib hist() call of both panels
        if no color is given the first category is drawn in orange
        
    Returns
    -------
    back-to-back histogram (shown with pyplot.show(), the function itself returns nothing)

    Raises
    ------
    IndexError
        if categories is not set and fewer than two categories are found in catField

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back stem-and-leaf display](../visualisations/vis_stem_and_leaf_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html), [butterfly chart/pyramid chart](../visualisations/vis_butterfly_bin.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)
    
    Author
    ------
    Made by P. Stikker
    
    Companion website: https://PeterStatistics.com  
    YouTube channel: https://www.youtube.com/stikpet  
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_histogram_b2b(df['Gen_Gender'], df['Over_Grade'], edgecolor='blue')
    '''
    # local import so the block does not rely on a file-level numpy import
    import numpy as np

    #convert to pandas series if needed (lists, tuples, arrays, ...)
    if not isinstance(catField, (pd.Series, pd.DataFrame)):
        catField = pd.Series(catField)

    if not isinstance(scaleField, (pd.Series, pd.DataFrame)):
        scaleField = pd.Series(scaleField)

    #combine as one dataframe and drop rows with a missing category or score
    df = pd.concat([catField, scaleField], axis=1).dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]

    #the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        # value_counts() sorts by frequency, so these are the two most frequent
        found = cat_col.value_counts().index
        cat1 = found[0]
        cat2 = found[1]

    var_name = score_col.name

    #separate the scores for each category, as floats
    X = [float(x) for x in score_col[cat_col == cat1]]
    Y = [float(y) for y in score_col[cat_col == cat2]]

    # Determine the bins to hand to both hist() calls.
    # A scalar spec (None, an int or a string rule) has no fixed edges yet; when
    # equal bins are requested the edges are derived from the COMBINED scores so
    # that no value of either sample falls outside the shared range. This is done
    # with numpy rather than a throw-away plt.hist()/plt.close(), which would draw
    # into (and close) whatever figure the caller currently has open.
    bin_spec = bins
    if equal_bins and np.ndim(bins) == 0:
        requested = plt.rcParams['hist.bins'] if bins is None else bins
        bin_spec = np.histogram_bin_edges(X + Y, bins=requested)

    fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(9, 6))

    # orange is only a default for the left panel; a color passed by the caller wins
    left_style = {'color': 'orange', **kwargs}

    # the edges returned by hist() are deliberately NOT fed into the second call,
    # otherwise equal_bins=False would still reuse the first sample's bins
    n_X, _, _ = axes[0].hist(X, orientation='horizontal', bins=bin_spec, density=density, **left_style)
    axes[0].set(title=cat1, ylabel=var_name)

    n_Y, _, _ = axes[1].hist(Y, orientation='horizontal', bins=bin_spec, density=density, **kwargs)
    axes[1].set(title=cat2)

    # same horizontal scale on both sides, based on the tallest bar
    heights = np.concatenate([np.ravel(n_X), np.ravel(n_Y)])
    heights = heights[np.isfinite(heights)]
    peak = float(heights.max()) if heights.size else 0.0
    if density and peak > 0:
        # densities are usually far below 1, a fixed +0.5 would dwarf the bars
        xLim = peak * 1.05
    else:
        xLim = peak + 0.5

    # reversed limits mirror the left panel
    axes[0].set_xlim([xLim, 0])
    axes[1].set_xlim([0, xLim])

    plt.subplots_adjust(wspace=0, hspace=0)
    if density:
        plt.xlabel('frequency density')
    else:
        plt.xlabel('count')
    plt.show()