Skip to content

Commit

Permalink
Update 03_workshop_solutions.R
Browse files Browse the repository at this point in the history
  • Loading branch information
timothyfraser authored Sep 12, 2024
1 parent ecbcb88 commit c96bc2e
Showing 1 changed file with 183 additions and 81 deletions.
264 changes: 183 additions & 81 deletions code/03_workshop_solutions.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,9 @@ library(ggplot2) # visuals
# install.packages("mosaicCalc")
library(mosaicCalc) # derivatives and integrals

# Exercise 1 #####################################################
# Class teaches Dr. Fraser to analyze PDFs and CDFs

# Exercise 1: Class teaches Dr. Fraser to analyze PDFs and CDFs

### How to make a function!
addtwo = function(x){ x + 2 }
addtwo(x = 2)
remove(addtwo)

### When the PDF is provided ###################################

Expand All @@ -26,7 +22,11 @@ remove(addtwo)

d = function(x){ (x + 2*x)/1000 }

# What's the chance that we get a toaster that fails **at** 1 hour
d(x = 1)



# There is a 0.3% chance that we would get a value of 1
# from this particular distribution / density function.

Expand All @@ -35,133 +35,147 @@ d(x = 1)


# toaster - time (hours) to failure
# How would I use this PDF for to find probability density of failure at 50 hours?
d(x = 50)
x = c(0, 50, 100, 150, 200, 250, 300)
d(x)
remove(x)



# Make our own `tibble` (an empowered data.frame from dplyr)
toasters = tibble(
hours = c(0:1000),
# you can run functions within them on their component vectors
dprob = d(hours)
# How about for 0 to 1000 hours?

dat = tibble(
hours = 0:50,
prob = d(x = hours)
)

toasters %>% plot()

dat

# How might I plot that?

# thistle4
# tomato3
# darksalmon2
# slategray4
colors()

gg = toasters %>%
ggplot(mapping = aes(x = hours, y = dprob)) +
geom_area(fill = "steelblue", color = "black", size = 1.2, alpha = 0.5) +
labs(x = "Hours to Failure", y = "Probability (Density)",
title = "Toaster Failure",
subtitle = "Probability Density Function") +
theme_classic()
ggplot() +
geom_point(data = dat, mapping = aes(x = hours, y = prob),
color = "tomato2")

ggplot() +
geom_line(data = dat, mapping = aes(x = hours, y = prob, group = "my line"),
color = "tomato2", linewidth = 2)

ggplot() +
geom_area(data = dat, mapping = aes(x = hours, y = prob, group = "my line"),
color = "tomato2", fill = "darksalmon", alpha = 0.5, linewidth = 2)

gg

gg + theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
# Oooh... can I save that plot?

gg = ggplot() +
geom_area(data = dat, mapping = aes(x = hours, y = prob, group = "my line"),
color = "tomato2", fill = "darksalmon", alpha = 0.5, linewidth = 2)

ggsave(plot = gg,
filename = "myplot.png",
height = 4, width = 4)
ggsave(gg, filename = "code/03_workshop_image.png",
dpi = 200, width = 5, height = 10)



dat
d(x = 2)



# Exercise 2 ########################################################

### Simulated Distributions ###################################

# Archetypal Distributions, normal, poisson
rpois()
dpois()
ppois()
qpois()
# Archetypal Distributions, normal, poisson, exponential

# What are our 4 types of probability functions?
# use poisson as an example



# mean of 50 hours to failure
mu = 50

dat = tibble(
sim = rpois(n = 10, lambda = 50)
)

dat$sim %>% hist()

dat %>%
ggplot(mapping = aes(x = sim)) +
geom_density()


# How would I simulate 10 products with a mean time to failure of 50 hours?
# Assume poisson

toasters2 = tibble(
hours = 1:100,
# probability densities
dprob = dpois(hours, lambda = 50),
# cumulative probabilities
cprob = ppois(hours, lambda = 50),
cprob2 = cumsum(dprob) / sum(dprob),
# get back raw values
q = qpois(cprob, lambda = 50)
)
toasters2 %>% head()
# dpois() # PDF - densities
# ppois() # CDF - cumulative probabilities
# qpois() # quantiles
# rpois() # random samples

toasters2 %>%
ggplot(mapping = aes(x = hours, y = cprob2)) +
geom_area()
toasters = rpois(n = 10, lambda = 50)
rpois(n = 10, lambda = mu)



# Empirical Hours to Failure for Some Toasters
obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)
obs %>% hist()
obs %>% density() %>% plot()
dobs = obs %>% density() %>% approxfun()

# empirical probability density function
dobs(20)
# What's the histogram look like?

# empirical cumulative probability function for d()
pobs = mosaicCalc::antiD(tilde = d(x) ~ x)
pobs(x = c(1,2,3))
toasters %>% hist()

# Can't really easily do that for our approxfun()
dobs

# How could I get the cumulative probability,
# from 1 to 100? Use a tibble.

# probabilities densities
dreal(c(1,2,3,4))


toasters3 = tibble(
hours = seq(from = 0, to = 100, length.out = 9),
dprob = dobs(hours),
cprob = cumsum(dprob) / sum(dprob)
dat = tibble(
hours = 1:100,
prob = ppois(hours, lambda = mu)
)

pobs = approxfun(x = toasters3$hours, y = toasters3$cprob)
pobs(10)

ggplot() + geom_point(data = dat, mapping = aes(x = hours, y = prob))



# Exercise 3 #################################################

# Empirical Hours to Failure for Some Toasters
obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)


# Make an empirical probability density function
# install.packages("broom")
library(broom)
# known density function
d = function(x){ (x + 2*x)/1000 }
# observed density function
dobs = obs %>% density() %>% approxfun()

d(50)
dobs(50)

# Integrating the PDF to get the CDF
p = mosaicCalc::antiD(tilde = d(x) ~ x)
p(50)

toasters3 %>%
ggplot(mapping = aes(x = hours, y = dprob)) +
geom_area()
# You can't integrate a density model function.
# pobs = mosaicCalc::antiD(tilde = dobs(x) ~ x)
obs

# empirical cumulative probability function for d()
pobs = mosaicCalc::antiD(tilde = d(x) ~ x)
pobs(x = c(1,2,3))

toasters3 %>%
ggplot(mapping = aes(x = hours, y = cprob)) +
geom_area()
# Can't really easily do that for our approxfun()
dobs

rm(list = ls())


# Exercise 4 ########################################################

# The cost of a component varies depending on market conditions.
# Over the last year, analysts report is cost on average $50,
Expand All @@ -170,19 +184,107 @@ toasters3 %>%

# Pick 2!

# Q1. What is the probability the component will cost exactly $60?
dnorm(60, mean = 50, sd = 5)
# Q1. What is the probability [density] the component will cost exactly $60?

# PDF
dnorm(x = 60, mean = 50, sd = 5)
dnorm(x = 60.5, mean = 50, sd = 5)

0:50 %>% dnorm(mean = 50, sd = 5)


# Q2. What is the probability the component costs less than $60?

# CDF
pnorm(60, mean = 50, sd = 5)


# Q3. What is the probability that the component costs more than $60?

1 - pnorm(60, mean = 50, sd = 5)


# Q4. What price is greater than 75% of all sales?

# quantiles
qnorm(0.75, mean = 50, sd = 5)



# Q5. What is the probability it costs between $45 and $55?

pnorm(45, mean = 50, sd = 5)

pnorm(55, mean = 50, sd = 5)

pnorm(55, mean = 50, sd = 5) - pnorm(45, mean = 50, sd = 5)


# Exercise 5 ################################

# Can we make a function to simplify the process
# of making probability calculations of Between X1 and X2? (like above)
pslice = function(x1, x2, mean, sd){
pnorm(x2, mean = mean, sd = sd) - pnorm(x1, mean = mean, sd = sd)
}
# It works!
pslice(x1 = 45, x2 = 55, mean= 50, sd = 5)

# Exercise 6 ############################################################

# dexp()
# pexp()
# qexp()
# rexp()

rate = 0.002
mu = 1 / rate
mu
# Our product has an a mean time to failure of 500 hours of use

# 1. Make a tibble of hours from 1 to 1000
# 2. Calculate the probability density
# 3. Calculate the cumulative probability
# 4. Plot one of them.


# Exercise 7 #############################################

# How do we get the CDF from an observed vector?

# Empirical Hours to Failure for Some Toasters
obs = c(10,50, 20, 30, 40, 50, 30, 20, 90)

# observed density function
dobs = obs %>% density() %>% approxfun()

# See the observed PDF
obs %>% density() %>% broom::tidy() %>% plot()

# Get the CDF from the observed PDF
pobs = obs %>% density() %>% broom::tidy() %>%
dplyr::select(hours = x, prob = y) %>%
mutate(cprob = cumsum(prob) / sum(prob) ) %>%
dplyr::select(hours, cprob) %>%
approxfun()

pobs(50)

# Let's write ourselves a function get_pobs()
# to help us get the CDF from an observed vector
get_pobs = function(obs){

obs %>% density() %>% broom::tidy() %>%
dplyr::select(hours = x, prob = y) %>%
mutate(cprob = cumsum(prob) / sum(prob) ) %>%
dplyr::select(hours, cprob) %>%
approxfun()

}

pobs = obs %>% get_pobs()
pobs(50)

rm(list = ls())


0 comments on commit c96bc2e

Please sign in to comment.