Module stikpetP.other.table_frequency_bins
Expand source code
import pandas as pd
from .table_nbins import tab_nbins
def tab_frequency_bins(data, nbins="sturges", bins=None, incl_lower=True, adjust=1):
    '''
    Binned Frequency Table
    ----------------------
    Bins data and creates a frequency table with frequency density.

    This function is shown in this [YouTube video](https://youtu.be/TDWLu1Jp2AE) and frequency tables are also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Tables/FrequencyTable.html)

    Parameters
    ----------
    data : list or pandas series
        the data. Anything that is not a pandas Series is wrapped in one; missing values are dropped.
    nbins : int or string, optional
        either the number of bins to create, or a specific method from the *tab_nbins()* function. Default is "sturges". Not used if *bins* is given.
    bins : list of tuples, optional
        the bins to use, each as a (lower bound, upper bound) tuple. If None (the default) equal-width bins are generated from *nbins*.
    incl_lower : boolean, optional
        to include the lower bound, otherwise the upper bound is included. Default is True
    adjust : float, optional
        value to add or subtract to guarantee all scores will fit in a bin. Default is 1. Only used if *bins* is None.

    Returns
    -------
    pandas.DataFrame
        A dataframe with the following columns:

        * *lower bound*
        * *upper bound*
        * *frequency*
        * *frequency density*

    Raises
    ------
    ValueError
        if bins have to be generated but there are no non-missing scores, or the number of bins is less than 1.

    Notes
    -----
    With *incl_lower=True* a score is counted in a bin if lower bound <= score < upper bound, otherwise if lower bound < score <= upper bound.

    The frequency density is the frequency divided by the bin width (upper bound - lower bound).

    When the bins are generated, the maximum is increased by *adjust* (if *incl_lower=True*) or the minimum is decreased by *adjust* (if *incl_lower=False*). With *adjust=0* the most extreme score on the excluded side will therefore not be counted.

    Generated bins share their edges (the upper bound of a bin is exactly the lower bound of the next one) and the last upper bound is exactly the adjusted maximum, so floating point rounding cannot make a score fall between two bins.

    Before, After and Alternatives
    ------------------------------
    Before this you might want to determine the number of bins; use [tab_nbins](../other/table_nbins.html#tab_nbins) for that.

    After this you might want to visualise the result. Use [vi_boxplot_single](../visualisations/vis_boxplot_single.html#vi_boxplot_single) for a Box (and Whisker) Plot, [vi_histogram](../visualisations/vis_histogram.html#vi_histogram) for a Histogram, [vi_stem_and_leaf](../visualisations/vis_stem_and_leaf.html#vi_stem_and_leaf) for a Stem-and-Leaf Display

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    Example 1: Numeric Pandas Series
    >>> import pandas as pd
    >>> df2 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv', sep=';', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> ex1a = df2['Gen_Age']
    >>> tab_frequency_bins(ex1a)
       lower bound  upper bound  frequency  frequency density
    0    18.000000    32.571429       42.0           2.882353
    1    32.571429    47.142857        1.0           0.068627
    2    47.142857    61.714286        0.0           0.000000
    3    61.714286    76.285714        0.0           0.000000
    4    76.285714    90.857143        0.0           0.000000
    5    90.857143   105.428571        0.0           0.000000
    6   105.428571   120.000000        1.0           0.068627

    >>> ex1b = df2['Gen_Age']
    >>> myBins = [(0, 20), (20, 25), (25, 30), (30, 120)]
    >>> tab_frequency_bins(ex1b, bins=myBins)
       lower bound  upper bound  frequency  frequency density
    0          0.0         20.0       12.0           0.600000
    1         20.0         25.0       21.0           4.200000
    2         25.0         30.0        8.0           1.600000
    3         30.0        120.0        3.0           0.033333

    Example 2: Numeric list
    >>> ex2 = [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
    >>> tab_frequency_bins(ex2, adjust=0.1)
       lower bound  upper bound  frequency  frequency density
    0     1.000000     1.683333        3.0           4.390244
    1     1.683333     2.366667        3.0           4.390244
    2     2.366667     3.050000        2.0           2.926829
    3     3.050000     3.733333        0.0           0.000000
    4     3.733333     4.416667        3.0           4.390244
    5     4.416667     5.100000        7.0          10.243902

    '''
    import math

    # wrap any non-Series input (list, tuple, array, ...) so the pandas methods below are available
    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    #remove missing values
    data = data.dropna()

    if bins is None:
        if len(data) == 0:
            raise ValueError("cannot generate bins: data has no non-missing values")

        if isinstance(nbins, int):
            k = nbins
        else:
            k = tab_nbins(data, method=nbins)

        # the helper may hand back a float or numpy number; a fractional count
        # is rounded up, which is also the number of bins the original
        # `while i < k` loop produced
        k = math.ceil(k)
        if k < 1:
            raise ValueError("the number of bins must be at least 1")

        #determine minimum and maximum
        mx = data.max()
        mn = data.min()

        #widen the range on the excluded side so the extreme score still fits in a bin
        if incl_lower:
            mx = mx + adjust
        else:
            mn = mn - adjust

        #determine the bin width
        h = (mx - mn) / k

        #create the bins from one shared list of edges; the last edge is set to
        #mx itself so rounding in mn + k*h can never leave the maximum outside
        edges = [mn + i * h for i in range(k)]
        edges.append(mx)
        bins = list(zip(edges[:-1], edges[1:]))

    rows = []
    for bounds in bins:
        lb = bounds[0]
        ub = bounds[1]

        # difference of two cumulative counts = number of scores inside the bin
        if incl_lower:
            f = (data < ub).sum() - (data < lb).sum()
        else:
            f = (data <= ub).sum() - (data <= lb).sum()

        fd = f / (ub - lb)
        rows.append([lb, ub, f, fd])

    # dtype=float keeps every column numeric float, as in the documented output
    return pd.DataFrame(
        rows,
        columns=["lower bound", "upper bound", "frequency", "frequency density"],
        dtype=float,
    )
Functions
def tab_frequency_bins(data, nbins='sturges', bins=None, incl_lower=True, adjust=1)
-
Binned Frequency Table
Bins data and creates a frequency table with frequency density.
This function is shown in this YouTube video and frequency tables are also described at PeterStatistics.com
Parameters
data : list or pandas series
- the data
nbins : int or string, optional
- either the number of bins to create, or a specific method from the tab_nbins() function. Default is "sturges"
bins : list of tuples, optional
- the bins to use, each as a (lower bound, upper bound) tuple. If not given, bins are generated from nbins
incl_lower : boolean, optional
- to include the lower bound, otherwise the upper bound is included. Default is True
adjust : float, optional
- value to add or subtract to guarantee all scores will fit in a bin
Returns
pandas.DataFrame
-
A dataframe with the following columns:
- lower bound
- upper bound
- frequency
- frequency density
Notes
none
Before, After and Alternatives
Before this you might want to determine the number of bins; use tab_nbins for that.
After this you might want to visualise the result. Use vi_boxplot_single for a Box (and Whisker) Plot, vi_histogram for a Histogram, vi_stem_and_leaf for a Stem-and-Leaf Display
Author
Made by P. Stikker
Companion website: https://PeterStatistics.com
YouTube channel: https://www.youtube.com/stikpet
Donations: https://www.patreon.com/bePatron?u=19398076
Examples
Example 1: Numeric Pandas Series
>>> import pandas as pd >>> df2 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv', sep=';', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'}) >>> ex1a = df2['Gen_Age'] >>> tab_frequency_bins(ex1a) lower bound upper bound frequency frequency density 0 18.000000 32.571429 42.0 2.882353 1 32.571429 47.142857 1.0 0.068627 2 47.142857 61.714286 0.0 0.000000 3 61.714286 76.285714 0.0 0.000000 4 76.285714 90.857143 0.0 0.000000 5 90.857143 105.428571 0.0 0.000000 6 105.428571 120.000000 1.0 0.068627
>>> ex1b = df2['Gen_Age'] >>> myBins = [(0, 20), (20, 25), (25, 30), (30, 120)] >>> tab_frequency_bins(ex1b, bins=myBins) lower bound upper bound frequency frequency density 0 0.0 20.0 12.0 0.600000 1 20.0 25.0 21.0 4.200000 2 25.0 30.0 8.0 1.600000 3 30.0 120.0 3.0 0.033333
Example 2: Numeric list
>>> ex2 = [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5] >>> tab_frequency_bins(ex2, adjust=0.1) lower bound upper bound frequency frequency density 0 1.000000 1.683333 3.0 4.390244 1 1.683333 2.366667 3.0 4.390244 2 2.366667 3.050000 2.0 2.926829 3 3.050000 3.733333 0.0 0.000000 4 3.733333 4.416667 3.0 4.390244 5 4.416667 5.100000 7.0 10.243902
Expand source code
def tab_frequency_bins(data, nbins="sturges", bins=None, incl_lower=True, adjust=1):
    '''
    Binned Frequency Table
    ----------------------
    Bins data and creates a frequency table with frequency density.

    This function is shown in this [YouTube video](https://youtu.be/TDWLu1Jp2AE) and frequency tables are also described at [PeterStatistics.com](https://peterstatistics.com/Terms/Tables/FrequencyTable.html)

    Parameters
    ----------
    data : list or pandas series
        the data. Anything that is not a pandas Series is wrapped in one; missing values are dropped.
    nbins : int or string, optional
        either the number of bins to create, or a specific method from the *tab_nbins()* function. Default is "sturges". Not used if *bins* is given.
    bins : list of tuples, optional
        the bins to use, each as a (lower bound, upper bound) tuple. If None (the default) equal-width bins are generated from *nbins*.
    incl_lower : boolean, optional
        to include the lower bound, otherwise the upper bound is included. Default is True
    adjust : float, optional
        value to add or subtract to guarantee all scores will fit in a bin. Default is 1. Only used if *bins* is None.

    Returns
    -------
    pandas.DataFrame
        A dataframe with the following columns:

        * *lower bound*
        * *upper bound*
        * *frequency*
        * *frequency density*

    Raises
    ------
    ValueError
        if bins have to be generated but there are no non-missing scores, or the number of bins is less than 1.

    Notes
    -----
    With *incl_lower=True* a score is counted in a bin if lower bound <= score < upper bound, otherwise if lower bound < score <= upper bound.

    The frequency density is the frequency divided by the bin width (upper bound - lower bound).

    When the bins are generated, the maximum is increased by *adjust* (if *incl_lower=True*) or the minimum is decreased by *adjust* (if *incl_lower=False*). With *adjust=0* the most extreme score on the excluded side will therefore not be counted.

    Generated bins share their edges (the upper bound of a bin is exactly the lower bound of the next one) and the last upper bound is exactly the adjusted maximum, so floating point rounding cannot make a score fall between two bins.

    Before, After and Alternatives
    ------------------------------
    Before this you might want to determine the number of bins; use [tab_nbins](../other/table_nbins.html#tab_nbins) for that.

    After this you might want to visualise the result. Use [vi_boxplot_single](../visualisations/vis_boxplot_single.html#vi_boxplot_single) for a Box (and Whisker) Plot, [vi_histogram](../visualisations/vis_histogram.html#vi_histogram) for a Histogram, [vi_stem_and_leaf](../visualisations/vis_stem_and_leaf.html#vi_stem_and_leaf) for a Stem-and-Leaf Display

    Author
    ------
    Made by P. Stikker

    Companion website: https://PeterStatistics.com
    YouTube channel: https://www.youtube.com/stikpet
    Donations: https://www.patreon.com/bePatron?u=19398076

    Examples
    --------
    Example 1: Numeric Pandas Series
    >>> import pandas as pd
    >>> df2 = pd.read_csv('https://peterstatistics.com/Packages/ExampleData/StudentStatistics.csv', sep=';', low_memory=False, storage_options={'User-Agent': 'Mozilla/5.0'})
    >>> ex1a = df2['Gen_Age']
    >>> tab_frequency_bins(ex1a)
       lower bound  upper bound  frequency  frequency density
    0    18.000000    32.571429       42.0           2.882353
    1    32.571429    47.142857        1.0           0.068627
    2    47.142857    61.714286        0.0           0.000000
    3    61.714286    76.285714        0.0           0.000000
    4    76.285714    90.857143        0.0           0.000000
    5    90.857143   105.428571        0.0           0.000000
    6   105.428571   120.000000        1.0           0.068627

    >>> ex1b = df2['Gen_Age']
    >>> myBins = [(0, 20), (20, 25), (25, 30), (30, 120)]
    >>> tab_frequency_bins(ex1b, bins=myBins)
       lower bound  upper bound  frequency  frequency density
    0          0.0         20.0       12.0           0.600000
    1         20.0         25.0       21.0           4.200000
    2         25.0         30.0        8.0           1.600000
    3         30.0        120.0        3.0           0.033333

    Example 2: Numeric list
    >>> ex2 = [1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5]
    >>> tab_frequency_bins(ex2, adjust=0.1)
       lower bound  upper bound  frequency  frequency density
    0     1.000000     1.683333        3.0           4.390244
    1     1.683333     2.366667        3.0           4.390244
    2     2.366667     3.050000        2.0           2.926829
    3     3.050000     3.733333        0.0           0.000000
    4     3.733333     4.416667        3.0           4.390244
    5     4.416667     5.100000        7.0          10.243902

    '''
    import math

    # wrap any non-Series input (list, tuple, array, ...) so the pandas methods below are available
    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    #remove missing values
    data = data.dropna()

    if bins is None:
        if len(data) == 0:
            raise ValueError("cannot generate bins: data has no non-missing values")

        if isinstance(nbins, int):
            k = nbins
        else:
            k = tab_nbins(data, method=nbins)

        # the helper may hand back a float or numpy number; a fractional count
        # is rounded up, which is also the number of bins the original
        # `while i < k` loop produced
        k = math.ceil(k)
        if k < 1:
            raise ValueError("the number of bins must be at least 1")

        #determine minimum and maximum
        mx = data.max()
        mn = data.min()

        #widen the range on the excluded side so the extreme score still fits in a bin
        if incl_lower:
            mx = mx + adjust
        else:
            mn = mn - adjust

        #determine the bin width
        h = (mx - mn) / k

        #create the bins from one shared list of edges; the last edge is set to
        #mx itself so rounding in mn + k*h can never leave the maximum outside
        edges = [mn + i * h for i in range(k)]
        edges.append(mx)
        bins = list(zip(edges[:-1], edges[1:]))

    rows = []
    for bounds in bins:
        lb = bounds[0]
        ub = bounds[1]

        # difference of two cumulative counts = number of scores inside the bin
        if incl_lower:
            f = (data < ub).sum() - (data < lb).sum()
        else:
            f = (data <= ub).sum() - (data <= lb).sum()

        fd = f / (ub - lb)
        rows.append([lb, ub, f, fd])

    # dtype=float keeps every column numeric float, as in the documented output
    return pd.DataFrame(
        rows,
        columns=["lower bound", "upper bound", "frequency", "frequency density"],
        dtype=float,
    )