From 908eb395675f2d28443f185170ef0aedf6c9f231 Mon Sep 17 00:00:00 2001 From: "Timothy Fraser, PhD" Date: Wed, 18 Sep 2024 00:17:42 -0400 Subject: [PATCH] Create 08_training.py --- code/08_training.py | 1100 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1100 insertions(+) create mode 100644 code/08_training.py diff --git a/code/08_training.py b/code/08_training.py new file mode 100644 index 0000000..172da86 --- /dev/null +++ b/code/08_training.py @@ -0,0 +1,1100 @@ +# 08_training.py +# Training: Statistical Process Control +# Pairs with R training: +# https://timothyfraser.com/sigma/statistical-process-control-in-r.html +# Tim Fraser + +# Getting Started ############################ + +# In this workshop, we will learn how to perform +# statistical process control in R, using statistical tools +# and ggplot visualizations! Statistical Process Control refers +# to using statistics to (1) measure variation in product +# quality over time and (2) identify benchmarks to know +# when intervention is needed. Let’s get started! + +## Install Packages ############################ +# Install main python packages for this script +# !pip install pandas +# !pip install sympy +# !pip install math +# !pip install plotnine +# !pip install os +# !pip install sys +# !pip install patchworklib + +## Load Packages ############################ +import pandas as pd # Import pandas functions +from plotnine import * # import all plotnine visualization functions +import patchworklib as pw # for combining plots +import os +import sys +# Append files in this folder (functions) to the Python Path +sys.path.append(os.path.abspath('functions')) +# Now you can reference them directly +from functions_distributions import * +# Import math functions +import math + + + + +## Our Case ############################################# + +# For today’s workshop, we’re going to think about why +# quality control matters in a local economy, by examining +# the case of the Japanese Hot Springs bath economy! +# Hot springs, or onsen, are a major source of tourism and recreation +# for families in Japan, bringing residents from across +# the country every year to often rural communities where +# the right geological conditions have brought on naturally +# occurring hot springs. Restaurants, taxi and bus companies, +# and many service sector firms rely on their local onsen +# to bring in a steady stream (pun intended) of tourists +# to the local economy. So, it’s often in the best interest +# of onsen operators to keep an eye on the temperature, +# minerals, or other aspects of their hot springs baths to +# ensure quality control, to keep up their firm (and town’s!) +# reputation for quality rest and relaxation! + +# Onsen-goers often seek out specific types of hot springs, +# so it’s important for an onsen to actually provide what +# it advertises! Serbulea and Payyappallimana (2012) describe +# some of these benchmarks. +# +# - Temperature: Onsen are divided into “Extra Hot +# Springs” (>42 °C), “Hot Springs” (41~34°C), and “Warm +# Springs” (33~25°C). +# +# pH: Onsen are classified into “Acidic” (pH < 3), “Mildly +# Acidic” (pH 3~6), “Neutral” (pH 6~7.5), “Mildly alkaline” +# (ph 7.5~8.5), and “Alkaline” (pH > 8.5). +# +# Sulfur: Sulfur onsen typically have about 2mg of sulfur +# per 1kg of hot spring water; sulfur levels must exceed +# 1 mg to count as a Sulfur onsen. (It smells like rotten +# eggs!) +# +# These are decent examples of quality control metrics +# that onsen operators might want to keep tabs on! + +## Our Data #############################################33 + +# You’ve been hired to evaluate quality control at a +# local onsen in sunny Kagoshima prefecture! Every month, +# for 15 months, you systematically took 20 random samples +# of hot spring water and recorded its temperature, pH, +# and sulfur levels. How might you determine if this onsen +# is at risk of slipping out of one sector of the market +# (eg. Extra Hot!) and into another (just normal Hot +# Springs?). + +# Let's import our samples of bathwater over time! +water = pd.read_csv('workshops/onsen.csv') +water.head() +water.info() + + +# Our dataset contains: +# +# id: unique identifer for each sample of onsen water. +# +# time: categorical variable describing date of sample +# (month 1, month 3, … month 15). +# +# temp: temperature in celsius. +# +# ph: pH (0 to 14) +# +# sulfur: milligrams of sulfur ions. + + +# 1. Visualizing Quality Control ########################### + +# Let’s learn some key techniques for visualizing quality control! + +## 1.1 theme_set() #################################### + +# First, when you’re about to make a bunch of ggplot visuals, +# it can help to set a common theme across them all with theme_set(). + +from plotnine import theme_classic, theme, element_text, element_blank, element_rect, theme_set + +# Set a global theme +theme_set( + theme_classic(base_size=14) + + theme( + # Position the legend at the bottom + legend_position='bottom', + # Center-align plot title, subtitle, and caption + plot_title=element_text(ha='center'), # ha is short for 'horizontal alignment' + plot_subtitle=element_text(ha='center'), + plot_caption=element_text(ha='center'), + # Remove axis ticks and lines + axis_ticks=element_blank(), + axis_line=element_blank(), + # Add a grey border around the plot + panel_border=element_rect(fill=None, color='grey') + ) +) + +# 1.2. Process Descriptive Statistics ######################## + +# First, let’s describe our process, using favorite description +# statistics. We’re going to want to do this a bunch, +# so why don’t we just write a function for it? Let’s +# write describe(), which will take a vector x and +# calculate the mean(), sd(), skewness(), and kurtosis(), +# and then combine a nice caption describing them. +# +# (I encourage you to write your own functions like +# this to help expedite your coding! Start simple!) + + + +# Let's write a describe() function to help us +def describe(x): + # Testing values + # x = water.temp + + x = pd.Series(x) + + df = pd.DataFrame({ + 'stat_mu': pd.Series( x.mean() ), + 'stat_sd': pd.Series( x.std() ), + 'stat_skew': pd.Series( x.skew() ), + 'stat_kurtosis': pd.Series( x.kurtosis() ) + }) + + a = 'Process Mean: ' + str(df.stat_mu.round(2)[0]) + b = 'SD: ' + str(df.stat_sd.round(2)[0]) + c = 'Skewness: ' + str(df.stat_skew.round(2)[0]) + d = 'Kurtosis: ' + str(df.stat_kurtosis.round(2)[0]) + caption = a + ' | ' + b + ' | ' + c + ' | ' + d + + df['caption'] = caption + + return df + +# Get caption +tab = describe(water.temp) + +# view +tab + + +# 1.3 Process Overview Visual ######################3 + +# Your first step should always be to look at the +# data overall! geom_jitter() and geom_boxplot() +# can help you do this. + +# geom_jitter() is a jittered scatterplot, jittering points +# a little to help with visibility. Since we want to +# be really precise on our quality control metrics, we +# could jitter the width a little (width = 0.25), but +# hold the y-axis (quality metric) constant at height +# = 0. These are in your x and y axis units, so decide +# based on your data each time. +# +# geom_boxplot() makes a boxplot for each group (time) +# in our data, showing the interquartile range (25th, +# 50th, and 75th percentiles) of our y variable for each +# group. Helpful way to view distributions. (Alternatively, +# you can try geom_violin(), which works the same way.) +# +# geom_hline() makes a horizontal line at the yintercept; +# we can tell it to show the mean() of y (in this case, +# temp). +# +# geom_histogram() is a histogram! We can use coord_flip() +# to turn it vertical to match the y-axis. + +# The pw.load_ggplot() and | operator from the patchworklib package +# binds two plots together into one, giving each a specific proportional +# width (eg. c(0.25, 0.75) percent, or c(5, 1) would +# be 5/6 and 1/6.) + + +# Get grand mean lines +stat = pd.DataFrame({'mu': pd.Series(water.temp.mean()) }) + +# view +stat + +# Make the initial boxplot... +g1 = (ggplot() + + # Plot raw points + geom_jitter(data = water, mapping = aes(x = 'time', y = 'temp'), height = 0, width = 0.25) + + geom_boxplot(data = water, mapping = aes(x = 'time', y = 'temp', group = 'time')) + + # Plot grand mean + geom_hline(data = stat, mapping = aes(yintercept = 'mu' ), color = 'lightgrey', size = 3) + + # Add our descriptive stats in the caption! + labs(x = "Time (Subgroup)", y = "Temperature (Celsius)", + subtitle = "Process Overview", + caption = tab.caption[0]) + ) + +# Part 1 of plot +g1 + +# Make the histogram, but tilt it on its side +g2 = (ggplot() + + geom_histogram( + data = water, mapping = aes(x = 'temp'), + bins = 15, color = "white", fill = "grey") + + theme_void() + # Clear the theme + coord_flip() # tilt on its side + ) +# Part 2 of plot +g2 + +# Then bind them together into 1 plot, 'h'orizontally aligned. + +# Let's combine the plots with patchwork +p1 = pw.load_ggplot(g1, figsize =(5,4)) +p2 = pw.load_ggplot(g2, figsize = (1,4)) + +# Bundle them together. +pp = (p1 | p2) + +pp.savefig("code/08_process_overview.png", dpi = 200) +# Then navigate to the file to open it. + + +# We can tell from this visual several things! +# +# Side Histogram: Our overall distribution is pretty centered. +# +# Descriptive Statistics: Our distribution has little skew (~0) +# and has slightly higher-than-average kurtosis (<3) (very centered). +# +# Line vs. Boxplots: Over time, our samples sure do seem +# to be getting slightly further from the mean! + + +# LEARNING CHECK 1 ################################## + +## QUESTION ############################# + +# We analyzed temperature variation above, but our hot +# springs owner wants to know about variation in pH +# too! Write a function to produce a process overview +# plot given any 2 vectors (water.time and water.pH, +# in this case), and visualize the process overview +# for pH! (You can do it!) + + +## ANSWER ############################# + +def ggprocess(x, y, xlab='Subgroup', ylab='Metric'): + # import pandas as pd + # import patchworklib as pw + # from plotnine import * + + # Testing values + # x = water.time; y = water.temp; xlab='Subgroup'; ylab='Metric'; path = "code/08_process_overview_example.png" + + # Convert vectors to series, and bundle as data.frame + data = pd.DataFrame({ + 'x': pd.Series(x), + 'y': pd.Series(y) }) + + + # Get grand mean lines + stat = pd.DataFrame({'mu': pd.Series(data.y.mean()) }) + + # describe data + tab = describe(data.y) + + # Make the initial boxplot... + g1 = (ggplot() + + # Plot raw points + geom_jitter(data = data, mapping = aes(x = 'x', y = 'y'), height = 0, width = 0.25) + + geom_boxplot(data = data, mapping = aes(x = 'x', y = 'y', group = 'x')) + + # Plot grand mean + geom_hline(data = stat, mapping = aes(yintercept = 'mu' ), color = 'lightgrey', size = 3) + + # Add our descriptive stats in the caption! + labs(x = xlab, y = ylab, + subtitle = "Process Overview", + caption = tab.caption[0]) + ) + + # Make the histogram, but tilt it on its side + g2 = (ggplot() + + geom_histogram( + data = data, mapping = aes(x = 'y'), + bins = 15, color = "white", fill = "grey") + + theme_void() + # Clear the theme + coord_flip() # tilt on its side + ) + + # Then bind them together into 1 plot, horizontally aligned. + + # Let's combine the plots with patchwork + p1 = pw.load_ggplot(g1, figsize =(5,4)) + p2 = pw.load_ggplot(g2, figsize = (1,4)) + + # Bundle them together. + pp = (p1 | p2) + + + return pp + +# Try it! +pp = ggprocess(x = water.time, y = water.ph) + +pp.savefig("code/08_process_overview_function.png", dpi = 200) + +# In comparison to temp, pH is a much more controlled process. +# +# I encourage you to use this ggprocess() function you just created! +# + + +# 2. Average and Standard Deviation Graphs ############################### + +## 2.1 Key Statistics ##################################### + +# Next, to analyze these processes more in depth, we +# need to assemble statistics at 2 levels: +# +# within-group statistics measure quantities of interest +# within each subgroup (eg. each monthly slice time in +# our onsen data). +# +# between-group statistics measure total quantities of +# interest for the overall process (eg. the overall +# “grand” mean, overall standard deviation in our onsen +# data). + + +## 2.2 Subgroup (Within-Group) Statistics ###################### + +# Let’s apply these to our onsen data to get statistics +# describing each subgroup’s distribution, +# a.k.a. short-term or within-group statistics. + + +# Calculate short-term statistics within each group + # For each timestpe +# Calculate these statistics of interest! +stat_s = water.groupby('time').apply(lambda g: pd.Series({ + # Return each time + 'time': g['time'].values[0], + # within group mean + 'xbar': g['temp'].mean(), + # within-group range + 'r': g['temp'].max() - g['temp'].min(), + # within group standard deviation + 'sd': g['temp'].std(), + # within group sample size n + 'nw': g['temp'].count(), + # within group degrees of freedom + 'df': g['temp'].count() - 1 +})) +stat_s + +# Last, we'll calculate sigma_short (within-group variance) +# We're going to calculate the short-term variation parameter sigma_s (sigma_short) +# by taking the square root of the average of the standard deviation +# Essentially, we're weakening the impact of any special cause variation +# so that our sigma is mostly representative of common cause (within-group) variation +stat_s = stat_s.assign(sigma_s = lambda g: (sum(g.sd**2)/len(g.sd**2))**0.5) + +# And get standard error (in a way that retains each subgroup's sample size!) +stat_s = stat_s.assign(se = lambda g: g.sigma_s / g.nw**0.5) + +# Calculate 6-sigma control limits! +stat_s = stat_s.assign(upper = lambda g: g.xbar.mean() + 3*g.se) +stat_s = stat_s.assign(lower = lambda g: g.xbar.mean() - 3*g.se) + +stat_s # view result! + + + + +## 2.3 Total Statistics (Between Groups) ###################3 + +# To get between-group estimates.... +stat_t = pd.DataFrame({ + 'xbbar': [ stat_s.xbar.mean() ], + 'rbar' : [ stat_s.r.mean() ], + 'sdbar' : [ stat_s.sd.mean() ], + # We can also recalculate sigma_short here too + 'sigma_s': (sum(stat_s.sd**2) / len(stat_s.sd**2))**0.5, + # Or we can calculate overall standard deviation + 'sigma_t': water.temp.std() +}) + + +# So, now that we have estimated within-group, common cause variation via +# σ short (sigma_s) and the standard error (se), what can we say about our process? + +stat_t +stat_s + + + + + +## 2.4 Average and Standard Deviation Charts ############################# + +# The preferred method for measuring within-group variability +# is the standard deviation, rather than the range, so +# we generally recommend (a) Average ( X̄ ) and Standard +# Deviation ( S ) charts over (b) Average ( X̄ ) and +# Range ( R ) charts. + + +labels = pd.DataFrame({ + 'time': [ stat_s.time.max(), stat_s.time.max(), stat_s.time.max() ], + 'type': ['xbbar', 'upper', 'lower'], + 'name': ['mean', '+3 s', '-3 s'], + 'value': [stat_s.xbar.mean(), stat_s.upper.max(), stat_s.lower.min()] +}) +labels['value'] = round(labels.value, 2) +labels['text'] = labels.name + " = " + labels.value.astype(str) + +(ggplot() + + geom_hline( + data = stat_t, + mapping = aes(yintercept = 'xbbar'), color = "lightgrey") + + geom_ribbon( + data = stat_s, + mapping = aes(x = 'time', ymin = 'lower', ymax = 'upper'), + fill = "steelblue", alpha = 0.2) + + geom_line( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 1) + + geom_point( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 5) + + geom_label( + data = labels, + mapping = aes(x = 'time', y = 'value', label = 'text'), + # notice that hjust = 1 is instead in python 'ha = "right"' + ha = 'right') + + labs(x = "Time (Subgroups)", y = "Average", + subtitle = "Average and Standard Deviation Chart") +) + +# This tells us that excitingly, our onsen temperatures +# are quite firmly within range. While the average +# varies quite a bit, it remains comfortably within +# 3 standard deviations of the mean. + + +# LEARNING CHECK 2 ################################################ + +## QUESTION ###################################################### + +# Well, that was nifty, but can we do it all +# over again for pH? Make some rad upper and lower +# confidence intervals for X̄ for pH! + + +## ANSWER ###################################################### + +# Calculate these statistics of interest! + +# Heck, let's make ourselves a get_stat_s() function. + +def get_stat_s(x, y): + # Put x and y into a data.frame + data = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)}) + + stat_s = data.groupby('x').apply(lambda g: pd.Series({ + # Return each time + 'time': g['x'].values[0], + # within group mean + 'xbar': g['y'].mean(), + # within-group range + 'r': g['y'].max() - g['y'].min(), + # within group standard deviation + 'sd': g['y'].std(), + # within group sample size n + 'nw': g['y'].count(), + # within group degrees of freedom + 'df': g['y'].count() - 1 + })) + + + # Last, we'll calculate sigma_short (within-group variance) + stat_s = stat_s.assign(sigma_s = lambda g: (sum(g.sd**2)/len(g.sd**2))**0.5) + + # And get standard error (in a way that retains each subgroup's sample size!) + stat_s = stat_s.assign(se = lambda g: g.sigma_s / g.nw**0.5) + + # Calculate 6-sigma control limits! + stat_s = stat_s.assign(upper = lambda g: g.xbar.mean() + 3*g.se) + stat_s = stat_s.assign(lower = lambda g: g.xbar.mean() - 3*g.se) + + return stat_s + +# Calculate within-group stats! +stat_s = get_stat_s(x = water.time, y = water.ph) + + +stat_s # view result! + + +# Let's write ourselves a get_labels() function, that process a stat_s dataframe. + +def get_labels(data): + labels = pd.DataFrame({ + 'time': [ stat_s.time.max(), stat_s.time.max(), stat_s.time.max() ], + 'type': ['xbbar', 'upper', 'lower'], + 'name': ['mean', '+3 s', '-3 s'], + 'value': [stat_s.xbar.mean(), stat_s.upper.max(), stat_s.lower.min()] + }) + labels['value'] = round(labels.value, 2) + labels['text'] = labels.name + " = " + labels.value.astype(str) + return labels + +# Generate labels! +labels = get_labels(stat_s) + +# Let's write ourselves a get_stat_t() function... + +def get_stat_t(x, y): + # To get between-group estimates.... + stat_t = pd.DataFrame({ + 'xbbar': [ stat_s.xbar.mean() ], + 'rbar' : [ stat_s.r.mean() ], + 'sdbar' : [ stat_s.sd.mean() ], + # We can also recalculate sigma_short here too + 'sigma_s': (sum(stat_s.sd**2) / len(stat_s.sd**2))**0.5, + # Or we can calculate overall standard deviation + 'sigma_t': water.temp.std() + }) + + return stat_t + +# Calculate an extra quantity +extra = pd.DataFrame({'xbbar': [stat_s.xbar.mean()] }) + +(ggplot() + + geom_hline( + data = extra, + mapping = aes(yintercept = 'xbbar'), color = "lightgrey") + + geom_ribbon( + data = stat_s, + mapping = aes(x = 'time', ymin = 'lower', ymax = 'upper'), + fill = "steelblue", alpha = 0.2) + + geom_line( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 1) + + geom_point( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 5) + + geom_label( + data = labels, + mapping = aes(x = 'time', y = 'value', label = 'text'), + # notice that hjust = 1 is instead in python 'ha = "right"' + ha = 'right') + + labs(x = "Time (Subgroups)", y = "Average", + subtitle = "Average and Standard Deviation Chart") +) + + +# This tells us that excitingly, +# our onsen temperatures are quite firmly within range. +# While the average varies quite a bit, +# it remains comfortably within 3 standard deviations of the mean. + + +# We could also make a jumbo function to do all of this! + + +def ggavgsd(x, y): + # Put x and y into a data.frame + data = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)}) + + stat_s = data.groupby('x').apply(lambda g: pd.Series({ + # Return each time + 'time': g['x'].values[0], + # within group mean + 'xbar': g['y'].mean(), + # within-group range + 'r': g['y'].max() - g['y'].min(), + # within group standard deviation + 'sd': g['y'].std(), + # within group sample size n + 'nw': g['y'].count(), + # within group degrees of freedom + 'df': g['y'].count() - 1 + })) + + # Last, we'll calculate sigma_short (within-group variance) + stat_s = stat_s.assign(sigma_s = lambda g: (sum(g.sd**2)/len(g.sd**2))**0.5) + + # And get standard error (in a way that retains each subgroup's sample size!) + stat_s = stat_s.assign(se = lambda g: g.sigma_s / g.nw**0.5) + + # Calculate 6-sigma control limits! + stat_s = stat_s.assign(upper = lambda g: g.xbar.mean() + 3*g.se) + stat_s = stat_s.assign(lower = lambda g: g.xbar.mean() - 3*g.se) + + # Generate labels! + labels = pd.DataFrame({ + 'time': [ stat_s.time.max(), stat_s.time.max(), stat_s.time.max() ], + 'type': ['xbbar', 'upper', 'lower'], + 'name': ['mean', '+3 s', '-3 s'], + 'value': [stat_s.xbar.mean(), stat_s.upper.max(), stat_s.lower.min()] + }) + labels['value'] = round(labels.value, 2) + labels['text'] = labels.name + " = " + labels.value.astype(str) + + + # Calculate the grand mean + extra = pd.DataFrame({'xbbar': [stat_s.xbar.mean()] }) + + # Generate plot + gg = (ggplot() + + geom_hline( + data = extra, + mapping = aes(yintercept = 'xbbar'), color = "lightgrey") + + geom_ribbon( + data = stat_s, + mapping = aes(x = 'time', ymin = 'lower', ymax = 'upper'), + fill = "steelblue", alpha = 0.2) + + geom_line( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 1) + + geom_point( + data = stat_s, + mapping = aes(x = 'time', y = 'xbar'), size = 5) + + geom_label( + data = labels, + mapping = aes(x = 'time', y = 'value', label = 'text'), + # notice that hjust = 1 is instead in python 'ha = "right"' + ha = 'right') + + labs(x = "Time (Subgroups)", y = "Average", + subtitle = "Average and Standard Deviation Chart") + ) + + return gg + +# Run it! +ggavgsd(x = water.time, y = water.ph) + + + + +# 3. Moving Range Charts ############################################ + + +## 3.1 Individual and Moving Range Charts + +# Suppose we only had 1 observation per subgroup! +# There’s no way to calculate standard deviation for that +# - after all, there’s no variation within each subgroup! +# Instead, we can generate an individual and moving range chart. + + +# Suppose we sample just the first out of each our months. +indiv = water[water['id'].isin([1, 21, 41, 61, 81, 101, 121, 141])] + +# The average moving range ̄mRbar aptly refers to +# the average of the moving Range mR, the difference +# in values over time. We can calculate the moving +# range using the diff() function on a vector like +# temp, shown below. abs() converts each value to +# be positive, since ranges are always 0 to infinity. + +# Let's see our original values +indiv.temp + +# diff() gets range between second and first, +# third and second, and so on +indiv.temp.diff().abs() + + + +# Just like all statistics, mRbar too has its own +# distribution, containing a range of slightly higher +# and lower ̄mR statistics we might have gotten had +# our sample been just slightly different due to +# chance. As a result, we will want to estimate a +# confidence interval around ̄mR, but how are we +# to do that if we have no statistic like σ to +# capture that variation? + +# Well, good news: We can approximate σs by taking +# the ratio of ̄mRbar over a factor called d2. What +# is d2? I’m glad you asked! + +# σ short ~ mRbar / d2 + + + +## 3.2 Factors d2 and Friends ############################3 +# +# As discussed above, any statistics has a latent +# distribution of other values you might have gotten +# for your statistic had your sample been just slightly +# different due to random error. Fun fact: we can +# actually see those distributions pretty easily thanks +# to simulation! +# +# Suppose we have a subgroup of size n = 1, so +# we calculate a moving range of length 1. +# +# This subgroup and its moving range is just one of +# the possible subgroups we could have encountered by +# chance, so we can think of it as a random draw +# from an archetypal normal distribution (mean = 0 +# and sd = 1). +# +# If we take enough samples of moving ranges from +# that distribution, we can plot the distribution. Below, +# we take n = 10000 samples with rnorm() and plot +# the vector with hist(). +# +# We find a beautiful distribution of moving range +# statistics for n=1 size subgroups. + +mrsim = rnorm(n = 10000).diff().abs().dropna() +hist(mrsim) + + +# Much like k factors in the exponential distribution (we'll learn these soon), +# we can use this distribution of mR stats to produce +# a series of factors that can help us estimate any +# upper or lower confidence interval in a moving range +# distribution. + +# For example, we can calculate: + +# d2, the mean of this archetypal mR distribution. + +# d3, the standard deviation of this mR distribution. + +# Technically, d2 is a ratio, which says that in +# a distribution with a standard deviation of 1, the +# mean mR is d2. In other words, d2 = (mRbar,n=1)/(σ,n=1). + +# So, if we have observed +# a real life average moving range mRbar Robserved, n = 1, +# we can use this d2 factor to convert out of units +# of 1 σ, n = 1 into units the σshort of +# our observed data! + +# For example, the mean of our vector mrsim, for subgroup size n = 1, +# says that d2 (mean of these mR stats) is... +mrsim.mean() + + +# While d3 (standard deviation is...) +mrsim.std() + + +# But why stop there? We can calculate loads of other +# interesting statistics! +# For example, these statistics +# estimate the median, upper 90, and upper 95% of the distribution! +mrsim.quantile(q = [0.5, 0.9, 0.95]).round(3) + + + +## 3.3 Estimating σ short for Moving Range Statistics ##################3 + + +# Let’s apply our new knowledge about d2 +# to calculate some upper bounds ( +3σ ) +# for our average moving range estimates! + +# Get moving range over time +istat_s = pd.DataFrame({ + 'time': indiv.time.iloc[1:], + 'mr': indiv.temp.diff().abs().dropna() +}) +# Get average moving range +istat_s['mrbar'] = istat_s.mr.mean() +# Get d2 statistic +istat_s['d2'] = rnorm(n = 10000, mean = 0, sd = 1).diff().abs().dropna().mean() +# Approximate sigma s +istat_s['sigma_s'] = istat_s.mrbar / istat_s.d2 +# Our subgroup size was 1, right? +istat_s['n'] = 1 +# so this means that sigma_s just equals the standard error here +istat_s['se'] = istat_s.sigma_s / istat_s.n**0.5 +# compute upper 3-se bound +istat_s['upper'] = istat_s.mrbar + 3 * istat_s.se +# and lower ALWAYS equals 0 for moving range +istat_s['lower'] = 0 + +# Why stop there? Let’s visualize it! + +# Generate labels! +labels = pd.DataFrame({ + 'time': [ istat_s.time.max(), istat_s.time.max(), istat_s.time.max() ], + 'name': ['mean', '+3 s', 'lower'], + 'value': [istat_s.mrbar.mean(), istat_s.upper.max(), istat_s.lower.min()] +}) +labels['value'] = round(labels.value, 2) +labels['text'] = labels.name + " = " + labels.value.astype(str) + + +# Calculate the grand mean +extra = pd.DataFrame({'mrbar': [istat_s.mrbar.mean()] }) + +# Generate plot +gg = (ggplot() + + geom_hline( + data = extra, + mapping = aes(yintercept = 'mrbar'), color = "lightgrey") + + geom_ribbon( + data = istat_s, + mapping = aes(x = 'time', ymin = 'lower', ymax = 'upper'), + fill = "steelblue", alpha = 0.2) + + geom_line( + data = istat_s, + mapping = aes(x = 'time', y = 'mr'), size = 1) + + geom_point( + data = istat_s, + mapping = aes(x = 'time', y = 'mr'), size = 5) + + geom_label( + data = labels, + mapping = aes(x = 'time', y = 'value', label = 'text'), + # notice that hjust = 1 is instead in python 'ha = "right"' + ha = 'right') + + labs(x = "Time (Subgroups)", y = "Moving Range", + subtitle = "Moving Range Chart") + ) + + + +# 4. Constants #####################################3 + +## 4.1 Find any dx Factor #################### + +# While our book writes extensively about d2 and +# other dwhatever factors, it’s not strictly necessary +# to calculate them unless you need them. Usually, +# we do this when we can’t calculate the standard +# deviation normally (eg. when we have only moving +# range statistics or only a subgroup sample size +# of n=1). If you’re working with full data from +# your process though, you can easily calculate σshort +# right from the empirical data, without ever needing +# to use dx factors. +# +# But let’s say you did need a dx factor for a +# subgroup range of a given sample size n = 1, 2, +# 3.... n. Could we calculate some kind of function +# to give us it? +# +# Funny you should ask! I’ve written a little helper +# function you can use. + +def dn(n, reps = 10000): + # testing values + # n = 3; reps = 10000 + + sims = pd.DataFrame({'rep': pd.Series(range(reps))+1, 'n': n }) + + # For each replicate, simulate the ranges of n values + sims = sims.groupby('rep').apply(lambda g: pd.Series({ + 'r': rnorm(n = g.n, mean = 0, sd = 1).quantile(q = [0,1]).diff().abs().dropna() + }) + # Pivot result into a data.frame + ).explode('r') + + # Calculate + stats = pd.DataFrame({ + # mean range + 'd2': sims.mean(), + # standard deviation of ranges + 'd3': sims.std() + }) + # and constants for obtaining lower and upper ci for rbar + stats['D3'] = 1 - 3*stats.d3/stats.d2 + stats['D4'] = 1 + 3*stats.d3/stats.d2 + # Sometimes D3 goes negative; we need to bound it at zero + # For any cases where D3 goes negative, bound it at zero. + stats['D3'][stats['D3'] < 0] = 0 + + return stats + + + +# Let's try it, where subgroup size is n = 2 +dn(n = 2) + + +# Let's get the constants we need too. +# Each of our samples has a sample size of 20 +d = dn(n = 20) + +# Check it! +d + + +## 4.2 Using dx factors ########################### + +# Using dn(), we can make a quick approximation for the +# upper and lower control limits for the grand range Rbar +# as well (as opposed to the grand moving range mRbar ) + +# Make group-wise calculations... +stat_w = water.groupby('time').apply(lambda g: pd.Series({ + # get withingroup range + 'r': g.temp.quantile(q=[0,1]).diff().abs().dropna(), + # get subgroup size + 'n_w': g.temp.shape[0] + # pivot result longer +})).explode(['r', 'n_w'] + # assign the time back as a column + ).assign( + time = lambda x: x.index) + +# Let's get average within group range for temperature... +stat = pd.DataFrame({ + 'rbar': [ stat_w.r.mean() ], # get rbar... + 'n_w': stat_w.n_w.unique().astype(int) }) # assuming constant subgroup size... + +# Check it! +stat + +# We find that dn() gives us constants D3 and D4... +mydstat = dn(n = stat.n_w[0] ) + + +#And use these constants to estimate the upper and lower CI for rbar! +stat['rbar_lower'] = stat.rbar[0] * mydstat.D3[0] +stat['rbar_upper'] = stat.rbar[0] * mydstat.D4[0] + +# So quick! You could use these values to make a range chart now. + +del stat, stat_w + + + +# 4.3 Finding any bx Factor ############################ + +# We might also want to know how much the standard +# deviation could possibly vary due to sampling error. +# To figure this out, we’ll simulate many many standard +# deviations from a normal distribution, like in dn(), +# for a given subgroup size n. +# +# Then, we can calculate some quantities of interest +# like C4 (the mean standard deviation from an archetypal +# normal distribution), B3 (a multiplier for getting +# the lower control limit for 3 sigmas), and B4 (a +# multiplier for getting the upper control limit for +# 3 sigmas.) + + + +def bn(n, reps = 10000): + # testing values + # n = 3; reps = 10000 + + sims = pd.DataFrame({'rep': pd.Series(range(reps))+1, 'n': n }) + + # For each replicate, simulate the ranges of n values + sims = sims.groupby('rep').apply(lambda g: pd.Series({ + 's': rnorm(n = g.n, mean = 0, sd = 1).std() + }) + # Pivot result into a data.frame + ).explode('s') + + # Calculate + stats = pd.DataFrame({ + # mean range + 'b2': sims.mean(), + # standard deviation of ranges + 'b3': sims.std() + }) + + stats['C4'] = stats.b2 # sometimes called C4 + stats['A3'] = 3 / (stats.b2 * n**0.5) + # and constants for obtaining lower and upper ci for rbar + stats['B3'] = 1 - 3*stats.b3/stats.b2 + stats['B4'] = 1 + 3*stats.b3/stats.b2 + # Sometimes B3 goes negative; we need to bound it at zero + # For any cases where D3 goes negative, bound it at zero. + stats['B3'][stats['B3'] < 0] = 0 + + return stats + + +# Let's apply this to our temp vector. + +# First, we’ll calculate the standard deviation within each subgroup, +# saved in stat_w under s. + + +# Let's get within group standard deviation +stat_w = water.groupby('time').apply(lambda g: pd.Series({ + 's': g.temp.std(), + 'n_w': g.temp.shape[0] +})) + +# Second, we’ll calculate the average standard deviation across subgroups, +# saved in stat under sbar. + +# Let's get average within group standard deviation for temperature... +stat = pd.DataFrame({ + 'sbar': [ stat_w.s.mean() ], # get sbar... + 'n_w': stat_w.n_w.unique().astype(int) }) # assuming constant subgroup size... + +stat # view + +# Third, we'll get our constants B3 and B4! + +# For a subgroup size of 20... +stat.n_w + +# Get B constants! +mybstat = bn(n = stat.n_w[0]) + +# Check them! +mybstat + +# Finally, let’s calculate our control limits! + +# Bundle together our columns... +stat = pd.concat([stat.reset_index(drop=True), mybstat.reset_index(drop = True)], axis = 1) + +# Calculate 3 sigma control limits +stat['sbar_lower'] = stat.sbar * stat.B3 +stat['sbar_upper'] = stat.sbar * stat.B4 + +# Now you’re all ready to make a control chart +# showing variation in the standard deviation! +stat + + + +# LEARNING CHECK 3 ############################################ + +## QUESTION #################################################### + +# Using our dn() function above, +# compile for yourself a short table of the +# d2 and d3 factors for subgroups sized 2 to 10. + +# yes, it will take a while to load +dx = pd.concat( + [dn(2), dn(3), dn(4), dn(5), + dn(6), dn(7), dn(8), dn(9), dn(10)], + axis = 0, ignore_index = True) +dx['n'] = [2,3,4,5,6,7,8,9,10] + +# Look at that cool table! +dx + + + +# Conclusion ####################################### + + +# All done! Great work! + + +# Cleanup +globals().clear()