Module stikpetP.visualisations.vis_butterfly_bin
Expand source code
import pandas as pd
from ..other.table_frequency_bins import tab_frequency_bins
from ..visualisations.vis_butterfly_chart import vi_butterfly_chart
def vi_butterfly_bin(catField, scaleField, categories=None, bins=None):
    '''
    Butterfly Chart of Binned Data (Pyramid chart)
    ---------------------------------------
    This function creates a simple butterfly chart by binning scaled data. This is sometimes also referred to as a Pyramid chart.

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/PyramidChart.html)

    Parameters
    ----------
    catField : list or dataframe
        the categories
    scaleField : list or dataframe
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField (read as categories[0] and categories[1]). If not set, the two most frequent categories in catField are used
    bins : list, optional
        list of tuples to use as bins. If not set, the bins are taken from tab_frequency_bins applied to the scores of both categories combined.

    Returns
    -------
    None
        The chart itself is produced by the call to vi_butterfly_chart.

    Notes
    -----
    Rows with a missing category or a missing score are dropped before anything else is done.

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back histogram](../visualisations/vis_histogram_b2b.html), [back-to-back stem-and-leaf display](../visualisations/vis_stem_and_leaf_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_butterfly_bin(df['Gen_Gender'], df['Over_Grade'], bins=[(0, 25), (25, 50), (50, 75), (75, 100)])

    '''
    # convert to pandas series if needed
    if isinstance(catField, list):
        catField = pd.Series(catField)
    if isinstance(scaleField, list):
        scaleField = pd.Series(scaleField)

    # combine as one dataframe and drop incomplete rows
    df = pd.concat([catField, scaleField], axis=1)
    df = df.dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]

    # the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        # value_counts sorts by frequency, so these are the two most frequent
        counts = cat_col.value_counts()
        cat1 = counts.index[0]
        cat2 = counts.index[1]

    # separate the scores for each category, making sure they are floats
    X = [float(x) for x in score_col[cat_col == cat1]]
    Y = [float(y) for y in score_col[cat_col == cat2]]

    if bins is None:
        # determine the bins from all scores
        # (first two columns of the table are used as the bin bounds)
        scores_table = tab_frequency_bins(X + Y)
        bins = [(scores_table.iloc[i, 0], scores_table.iloc[i, 1]) for i in range(len(scores_table))]

    # determine counts for each bin for each category
    X_table = tab_frequency_bins(X, bins=bins)
    Y_table = tab_frequency_bins(Y, bins=bins)

    # create list of bin labels for each category, repeating each label
    # as often as the count in the third column of the frequency table
    X_ord = []
    Y_ord = []
    for i, bin_bounds in enumerate(bins):
        label = str(bin_bounds)
        X_ord += [label] * int(X_table.iloc[i, 2])
        Y_ord += [label] * int(Y_table.iloc[i, 2])

    # create list of categories; sized from the label lists (not from the raw
    # scores) so labels and categories stay aligned even when some scores
    # are not counted in any of the bins
    cats = [cat1] * len(X_ord) + [cat2] * len(Y_ord)

    # use the butterfly function for binary-nominal
    vi_butterfly_chart(X_ord + Y_ord, cats, variation='butterfly')
    return
Functions
def vi_butterfly_bin(catField, scaleField, categories=None, bins=None)
-
Butterfly Chart of Binned Data (Pyramid chart)
This function creates a simple butterfly chart by binning scaled data. This is sometimes also referred to as a Pyramid chart. The visualisation is also described at PeterStatistics.com
Parameters
catField
:list
or dataframe
- the categories
scaleField
:list
or dataframe
- the scores
categories
:list
or dictionary
, optional - the two categories to use from catField. If not set, the two most frequent categories will be used
bins
:list
, optional - list of tuples to use as bins.
Returns
butterfly chart
Alternatives
To display the results of a binary and scale variable, alternative visualisations include: overlaid histogram, back-to-back histogram, back-to-back stem-and-leaf display, split histogram, split box-plot
Next
After visualizing the data, you might want to run a test: Student t, Welch t, Trimmed means, Yuen-Welch, Z test
Author
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
Examples
>>> import pandas as pd
>>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
>>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
>>> vi_butterfly_bin(df['Gen_Gender'], df['Over_Grade'], bins=[(0, 25), (25, 50), (50, 75), (75, 100)])
Expand source code
def vi_butterfly_bin(catField, scaleField, categories=None, bins=None):
    '''
    Butterfly Chart of Binned Data (Pyramid chart)
    ---------------------------------------
    This function creates a simple butterfly chart by binning scaled data. This is sometimes also referred to as a Pyramid chart.

    The visualisation is also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Visualisations/PyramidChart.html)

    Parameters
    ----------
    catField : list or dataframe
        the categories
    scaleField : list or dataframe
        the scores
    categories : list or dictionary, optional
        the two categories to use from catField (read as categories[0] and categories[1]). If not set, the two most frequent categories in catField are used
    bins : list, optional
        list of tuples to use as bins. If not set, the bins are taken from tab_frequency_bins applied to the scores of both categories combined.

    Returns
    -------
    None
        The chart itself is produced by the call to vi_butterfly_chart.

    Notes
    -----
    Rows with a missing category or a missing score are dropped before anything else is done.

    Alternatives
    ------------
    To display the results of a binary and scale variable, alternative visualisations include: [overlaid histogram](../visualisations/vis_histogram_overlay.html), [back-to-back histogram](../visualisations/vis_histogram_b2b.html), [back-to-back stem-and-leaf display](../visualisations/vis_stem_and_leaf_b2b.html), [split histogram](../visualisations/vis_histogram_split.html), [split box-plot](../visualisations/vis_boxplot_split.html)

    Next
    ----
    After visualizing the data, you might want to run a test: [Student t](../tests/test_student_t_is.html), [Welch t](../tests/test_welch_t_is.html), [Trimmed means](../tests/test_trimmed_mean_is.html), [Yuen-Welch](../tests/test_trimmed_mean_is.html), [Z test](../tests/test_z_is.html)

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    >>> import pandas as pd
    >>> file1 = "https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv"
    >>> df = pd.read_csv(file1, sep=',', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> vi_butterfly_bin(df['Gen_Gender'], df['Over_Grade'], bins=[(0, 25), (25, 50), (50, 75), (75, 100)])

    '''
    # convert to pandas series if needed
    if isinstance(catField, list):
        catField = pd.Series(catField)
    if isinstance(scaleField, list):
        scaleField = pd.Series(scaleField)

    # combine as one dataframe and drop incomplete rows
    df = pd.concat([catField, scaleField], axis=1)
    df = df.dropna()
    cat_col = df.iloc[:, 0]
    score_col = df.iloc[:, 1]

    # the two categories
    if categories is not None:
        cat1 = categories[0]
        cat2 = categories[1]
    else:
        # value_counts sorts by frequency, so these are the two most frequent
        counts = cat_col.value_counts()
        cat1 = counts.index[0]
        cat2 = counts.index[1]

    # separate the scores for each category, making sure they are floats
    X = [float(x) for x in score_col[cat_col == cat1]]
    Y = [float(y) for y in score_col[cat_col == cat2]]

    if bins is None:
        # determine the bins from all scores
        # (first two columns of the table are used as the bin bounds)
        scores_table = tab_frequency_bins(X + Y)
        bins = [(scores_table.iloc[i, 0], scores_table.iloc[i, 1]) for i in range(len(scores_table))]

    # determine counts for each bin for each category
    X_table = tab_frequency_bins(X, bins=bins)
    Y_table = tab_frequency_bins(Y, bins=bins)

    # create list of bin labels for each category, repeating each label
    # as often as the count in the third column of the frequency table
    X_ord = []
    Y_ord = []
    for i, bin_bounds in enumerate(bins):
        label = str(bin_bounds)
        X_ord += [label] * int(X_table.iloc[i, 2])
        Y_ord += [label] * int(Y_table.iloc[i, 2])

    # create list of categories; sized from the label lists (not from the raw
    # scores) so labels and categories stay aligned even when some scores
    # are not counted in any of the bins
    cats = [cat1] * len(X_ord) + [cat2] * len(Y_ord)

    # use the butterfly function for binary-nominal
    vi_butterfly_chart(X_ord + Y_ord, cats, variation='butterfly')
    return