From c96bc2efd07c17f0e0e74e72c6c9b2f9ac2c2384 Mon Sep 17 00:00:00 2001
From: "Timothy Fraser, PhD" <timothy.fraser.1@gmail.com>
Date: Thu, 12 Sep 2024 15:09:13 -0400
Subject: [PATCH] Update 03_workshop_solutions.R

---
 code/03_workshop_solutions.R | 264 ++++++++++++++++++++++++-----------
 1 file changed, 183 insertions(+), 81 deletions(-)

diff --git a/code/03_workshop_solutions.R b/code/03_workshop_solutions.R
index a15a6f6..520c5f2 100644
--- a/code/03_workshop_solutions.R
+++ b/code/03_workshop_solutions.R
@@ -9,13 +9,9 @@ library(ggplot2) # visuals
 # install.packages("mosaicCalc")
 library(mosaicCalc) # derivatives and integrals
 
+# Exercise 1 #####################################################
+# Class teaches Dr. Fraser to analyze PDFs and CDFs
 
-# Exercise 1: Class teaches Dr. Fraser to analyze PDFs and CDFs
-
-### How to make a function!
-addtwo = function(x){  x + 2 }
-addtwo(x = 2)
-remove(addtwo)
 
 ### When the PDF is provided ###################################
 
@@ -26,7 +22,11 @@ remove(addtwo)
 
 d = function(x){  (x + 2*x)/1000  }
 
+# What's the chance that we get a toaster that fails **at** 1 hour
 d(x = 1)
+
+
+
 # There is a 0.3% chance that we would get a value of 1
 # from this particular distribution / density function.
 
@@ -35,133 +35,147 @@ d(x = 1)
 
 
 # toaster - time (hours) to failure 
+# How would I use this PDF for to find probability density of failure at 50 hours?
 d(x = 50)
-x = c(0, 50, 100, 150, 200, 250, 300)
-d(x)
-remove(x)
+
 
 
 # Make our own `tibble` (an empowered data.frame from dplyr)
-toasters = tibble(
-  hours = c(0:1000),
-  # you can run functions within them on their component vectors
-  dprob = d(hours)
+# How about for 0 to 1000 hours?
+
+dat = tibble(
+  hours = 0:50,
+  prob = d(x = hours)
 )
 
-toasters %>% plot()
 
+dat
+
+# How might I plot that?
+
+# thistle4
+# tomato3
+# darksalmon2
+# slategray4
+colors()
 
-gg = toasters %>% 
-  ggplot(mapping = aes(x = hours, y = dprob)) +
-  geom_area(fill = "steelblue", color = "black", size = 1.2, alpha = 0.5) +
-  labs(x = "Hours to Failure", y = "Probability (Density)",
-       title = "Toaster Failure",
-       subtitle = "Probability Density Function")  +
-  theme_classic()
+ggplot() +
+  geom_point(data = dat, mapping = aes(x = hours, y = prob),
+             color = "tomato2")
 
+ggplot() +
+  geom_line(data = dat, mapping = aes(x = hours, y = prob, group = "my line"), 
+            color = "tomato2", linewidth = 2)
 
+ggplot() +
+  geom_area(data = dat, mapping = aes(x = hours, y = prob, group = "my line"), 
+            color = "tomato2", fill = "darksalmon", alpha = 0.5, linewidth = 2)
 
-gg
 
-gg + theme(plot.title = element_text(hjust = 0.5),
-           plot.subtitle = element_text(hjust = 0.5))
+# Oooh... can I save that plot?
 
+gg = ggplot() +
+  geom_area(data = dat, mapping = aes(x = hours, y = prob, group = "my line"), 
+            color = "tomato2", fill = "darksalmon", alpha = 0.5, linewidth = 2)
 
-ggsave(plot = gg, 
-       filename = "myplot.png", 
-       height = 4, width = 4)
+ggsave(gg, filename = "code/03_workshop_image.png", 
+       dpi = 200, width = 5, height = 10)
 
 
 
+dat
+d(x = 2)
 
 
 
+# Exercise 2 ########################################################
+
 ### Simulated Distributions ###################################
 
-# Archetypal Distributions, normal, poisson
-rpois()
-dpois()
-ppois()
-qpois()
+# Archetypal Distributions, normal, poisson, exponential
+
+# What are our 4 types of probability functions?
+# use poisson as an example
+
 
 
 # mean of 50 hours to failure
 mu = 50
 
-dat = tibble(
-  sim = rpois(n = 10, lambda = 50)
-)
 
-dat$sim %>% hist()
 
-dat %>%
-  ggplot(mapping = aes(x = sim)) +
-  geom_density()
 
 
+# How would I simulate 10 products with a mean time to failure of 50 hours?
+# Assume poisson
 
-toasters2 = tibble(
-  hours = 1:100,
-  # probability densities
-  dprob = dpois(hours, lambda = 50),
-  # cumulative probabilities
-  cprob = ppois(hours, lambda = 50),
-  cprob2 = cumsum(dprob) / sum(dprob),
-  # get back raw values
-  q = qpois(cprob, lambda = 50)
-)
-toasters2 %>% head()
+# dpois() # PDF - densities
+# ppois() # CDF - cumulative probabilities
+# qpois() # quantiles 
+# rpois() # random samples
 
-toasters2 %>%
-  ggplot(mapping = aes(x = hours, y = cprob2)) +
-  geom_area()
+toasters = rpois(n = 10, lambda = 50)
+rpois(n = 10, lambda = mu)
 
 
 
-# Empirical Hours to Failure for Some Toasters
-obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)
-obs %>% hist()
-obs %>% density() %>% plot()
-dobs = obs %>% density() %>% approxfun()
 
-# empirical probability density function
-dobs(20)
+# What's the histogram look like?
 
-# empirical cumulative probability function for d()
-pobs = mosaicCalc::antiD(tilde = d(x) ~ x)
-pobs(x = c(1,2,3))
+toasters %>% hist()
 
-# Can't really easily do that for our approxfun()
-dobs
 
+# How could I get the cumulative probability,
+# from 1 to 100? Use a tibble.
 
-# probabilities densities
-dreal(c(1,2,3,4))
 
 
-toasters3 = tibble(
-  hours = seq(from = 0, to = 100, length.out = 9),
-  dprob = dobs(hours),
-  cprob = cumsum(dprob) / sum(dprob)
+dat = tibble(
+  hours = 1:100,
+  prob = ppois(hours, lambda = mu)
 )
 
-pobs = approxfun(x = toasters3$hours, y = toasters3$cprob) 
-pobs(10)
 
+ggplot() + geom_point(data = dat, mapping = aes(x = hours, y = prob))
+
+
+
+# Exercise 3 #################################################
+
+# Empirical Hours to Failure for Some Toasters
+obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)
+
+
+# Make an empirical probability density function
+# install.packages("broom")
+library(broom)
+# known density function
+d = function(x){  (x + 2*x)/1000  }
+# observed density function
+dobs = obs %>% density() %>% approxfun()
 
+d(50)
+dobs(50)
 
+# Integrating the PDF to get the CDF
+p = mosaicCalc::antiD(tilde = d(x) ~ x)
+p(50)
 
-toasters3 %>%
-  ggplot(mapping = aes(x = hours, y = dprob)) +
-  geom_area()
+# You can't integrate a density model function.
+# pobs = mosaicCalc::antiD(tilde = dobs(x) ~ x)
+obs
 
+# empirical cumulative probability function for d()
+pobs = mosaicCalc::antiD(tilde = d(x) ~ x)
+pobs(x = c(1,2,3))
 
-toasters3 %>%
-  ggplot(mapping = aes(x = hours, y = cprob)) +
-  geom_area()
+# Can't really easily do that for our approxfun()
+dobs
 
+rm(list = ls())
 
 
+# Exercise 4 ########################################################
 
 # The cost of a component varies depending on market conditions.
 # Over the last year, analysts report is cost on average $50, 
@@ -170,19 +184,107 @@ toasters3 %>%
 
 # Pick 2!
 
-# Q1. What is the probability the component will cost exactly $60?
-dnorm(60, mean = 50, sd = 5)
+# Q1. What is the probability [density] the component will cost exactly $60?
+
+# PDF
+dnorm(x = 60, mean = 50, sd = 5)
+dnorm(x = 60.5, mean = 50, sd = 5)
+
+0:50 %>% dnorm(mean = 50, sd = 5)
+
 
 # Q2. What is the probability the component costs less than $60?
+
+# CDF
 pnorm(60, mean = 50, sd = 5)
 
+
 # Q3. What is the probability that the component costs more than $60?
+
 1 - pnorm(60, mean = 50, sd = 5)
 
+
 # Q4. What price is greater than 75% of all sales?
+
+# quantiles
 qnorm(0.75, mean = 50, sd = 5)
 
+
+
 # Q5. What is the probability it costs between $45 and $55?
+
+pnorm(45, mean = 50, sd = 5)
+
+pnorm(55, mean = 50, sd = 5)
+
 pnorm(55, mean = 50, sd = 5) - pnorm(45, mean = 50, sd = 5)
 
 
+# Exercise 5 ################################
+
+# Can we make a function to simplify the process 
+# of making probability calculations of Between X1 and X2? (like above)
+pslice = function(x1, x2, mean, sd){
+  pnorm(x2, mean = mean, sd = sd) - pnorm(x1, mean = mean, sd = sd)
+}
+# It works!
+pslice(x1 = 45, x2 = 55, mean=  50, sd = 5)
+
+# Exercise 6 ############################################################
+
+# dexp()
+# pexp()
+# qexp()
+# rexp()
+
+rate = 0.002
+mu = 1 / rate
+mu
+# Our product has an a mean time to failure of 500 hours of use
+
+# 1. Make a tibble of hours from 1 to 1000
+# 2. Calculate the probability density
+# 3. Calculate the cumulative probability
+# 4. Plot one of them.
+
+
+# Exercise 7 #############################################
+
+# How do we get the CDF from an observed vector?
+
+# Empirical Hours to Failure for Some Toasters
+obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)
+
+# observed density function
+dobs = obs %>% density() %>% approxfun()
+
+# See the observed PDF
+obs %>% density() %>% broom::tidy() %>% plot()
+
+# Get the CDF from the observed PDF
+pobs = obs %>% density() %>% broom::tidy() %>%
+  dplyr::select(hours = x, prob = y) %>%
+  mutate(cprob = cumsum(prob) / sum(prob) ) %>%
+  dplyr::select(hours, cprob) %>%
+  approxfun()
+
+pobs(50)
+
+# Let's write ourselves a function get_pobs()
+# to help us get the CDF from an observed vector
+get_pobs = function(obs){
+  
+  obs %>% density() %>% broom::tidy() %>%
+    dplyr::select(hours = x, prob = y) %>%
+    mutate(cprob = cumsum(prob) / sum(prob) ) %>%
+    dplyr::select(hours, cprob) %>%
+    approxfun()
+  
+}
+
+pobs = obs %>% get_pobs()
+pobs(50)
+
+rm(list = ls())
+
+