---
title: "Regression_lab"
author: "Kathleen"
date: "3/26/2019"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Load the necessary packages

```{r}
library(tidyverse)
library(e1071)
library(psych)
```

Let's start with the cereal data and get a sense of the data

```{r cereal}
cereal_df <- read_csv("cereal.csv")
```

1. Make distribution plots for each feature

```{r}
# Bar chart of how many cereals fall in each Shelf value.
ggplot(cereal_df) +
  geom_bar(aes(x = Shelf))
```

```{r}
# Bar chart of how many cereals each Manufacturer has.
ggplot(cereal_df) +
  geom_bar(aes(x = Manufacturer))
```

```{r visualize}
# Count histogram of Calories.
ggplot(data = cereal_df, mapping = aes(x = Calories)) +
  geom_histogram(bins = 10)

# Density-scaled histogram with a kernel density curve on top.
# The earlier fixed breaks c(-3, ..., 3) only span the interval [-3, 3], which
# suits standardized z-scores rather than raw calorie counts, so the same
# bins = 10 as the count histogram above is used instead.
ggplot(cereal_df) +
  geom_histogram(
    aes(x = Calories, y = ..density..),
    bins = 10,
    position = "identity"
  ) +
  geom_density(aes(x = Calories, y = ..density..))

summary(cereal_df)
```

```{r}
# Count histogram of Sodium.
ggplot(data = cereal_df, mapping = aes(x = Sodium)) +
  geom_histogram(bins = 15)

# Density-scaled histogram with a kernel density curve on top.
ggplot(cereal_df) +
  geom_histogram(
    aes(x = Sodium, y = ..density..),
    bins = 15,
    position = "identity"
  ) +
  geom_density(aes(x = Sodium, y = ..density..))

summary(cereal_df)

# Shape statistics for Sodium (see the rules of thumb below).
skewness(cereal_df$Sodium)
kurtosis(cereal_df$Sodium)
```

Make sure you review your data for skew and kurtosis

Skew: Measure of symmetry of the distribution

Rules of thumb for skewness

- Normal distribution skewness = 0
- If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
- If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
- If the skewness is less than -1 or greater than 1, the data are highly skewed

Kurtosis: measure of distribution in the combined tails. Kurtosis decreases as the tails become lighter. It increases as the tails become heavier.

Rules of thumb for kurtosis

- Normal distribution kurtosis = 0
- Kurtosis < 0: slightly less weight in the tails than the Normal distribution
- Kurtosis > 0: slightly more weight in the tails than the Normal distribution

```{r}
# Count histogram of Sugars.
ggplot(data = cereal_df, mapping = aes(x = Sugars)) +
  geom_histogram(bins = 16)

# Density-scaled histogram with a kernel density curve on top.
ggplot(cereal_df) +
  geom_histogram(
    aes(x = Sugars, y = ..density..),
    bins = 16,
    position = "identity"
  ) +
  geom_density(aes(x = Sugars, y = ..density..))

summary(cereal_df)

# Shape statistics for Sugars.
skewness(cereal_df$Sugars)
kurtosis(cereal_df$Sugars)
```

```{r}
```

```{r}
# Count histogram of Fiber.
ggplot(data = cereal_df, mapping = aes(x = Fiber)) +
  geom_histogram(bins = 15)

# Density-scaled histogram with a kernel density curve on top.
ggplot(cereal_df) +
  geom_histogram(
    aes(x = Fiber, y = ..density..),
    bins = 15,
    position = "identity"
  ) +
  geom_density(aes(x = Fiber, y = ..density..))

summary(cereal_df)

# Shape statistics for Fiber.
skewness(cereal_df$Fiber)
kurtosis(cereal_df$Fiber)
```

2. Check out the correlations between the variables

```{r}
library(corrplot)

# NOTE(review): columns are selected by position; this assumes columns 4-9 of
# cereal.csv are all numeric -- confirm against the file's column order.
correlations <- cor(cereal_df[, 4:9])
corrplot(correlations, method = "circle")
```

```{r}
# Scatterplot matrix across every column of the data set.
pairs.panels(cereal_df)
```

4. Let's say we are trying to predict calories given the other variables. Which variables would you consider as potential features?

5. Build a linear model and use backwards elimination to remove insignificant terms

```{r}
# Full model: every candidate predictor.
lm_model_0 <- lm(
  Calories ~ Fiber + Carbs + Sugars + Sodium + Manufacturer + Shelf,
  data = cereal_df
)
summary(lm_model_0)

# Drop Manufacturer.
lm_model_1 <- lm(
  Calories ~ Fiber + Carbs + Sugars + Sodium + Shelf,
  data = cereal_df
)
summary(lm_model_1)

# Drop Fiber.
lm_model_2 <- lm(Calories ~ Carbs + Sugars + Sodium + Shelf, data = cereal_df)
summary(lm_model_2)

# Drop Sodium (keeping Shelf).
lm_model_3 <- lm(Calories ~ Carbs + Sugars + Shelf, data = cereal_df)
summary(lm_model_3)

# Alternative to model 3: drop Shelf (keeping Sodium).
lm_model_4 <- lm(Calories ~ Carbs + Sugars + Sodium, data = cereal_df)
summary(lm_model_4)

# Drop both Sodium and Shelf; this is the model used for the prediction below.
lm_model_5 <- lm(Calories ~ Carbs + Sugars, data = cereal_df)
summary(lm_model_5)
```

6. 
Determine the caloric range for a cereal that has Manufacturer = Nabisco, Sodium = 80, Fiber = 20, Sugars = 7, Carbs = 8, Shelf = 1

```{r}
lm_model_5$coefficients

# lm_model_5 only has Carbs and Sugars as predictors, so the remaining inputs
# from the question do not enter the prediction.
new_cereal <- data.frame(Carbs = 8, Sugars = 7)

# Ask predict() for the 95% prediction interval instead of hand-computing
# fit +/- 1.96 * sigma: the built-in interval uses the t quantile for the
# model's residual degrees of freedom and also accounts for the uncertainty
# in the estimated coefficients.
pred_range <- predict(
  lm_model_5,
  newdata = new_cereal,
  interval = "prediction",
  level = 0.95
)
pred <- pred_range[1, "fit"]
below <- pred_range[1, "lwr"]
above <- pred_range[1, "upr"]
cat("Range is", below, " to ", above)
```

7. Let's create a variable that represents low fiber - looking at the distribution of fiber, let's define values 0, 1 and 2 as low fiber (target value = 1); all other values are high fiber (target value = 0)

```{r}
# Low fiber = Fiber of 0, 1 or 2, as stated above. The previous version only
# tested 0 and 1, so cereals with Fiber == 2 were labelled high fiber.
low_fiber <- ifelse(
  cereal_df$Fiber == 0 | cereal_df$Fiber == 1 | cereal_df$Fiber == 2,
  1,
  0
)

# Modelling table: the binary target plus the candidate predictors.
cereal_mod <- tibble(
  low_fiber,
  Calories = cereal_df$Calories,
  Manufacturer = cereal_df$Manufacturer,
  Sugars = cereal_df$Sugars,
  Sodium = cereal_df$Sodium,
  Shelf = cereal_df$Shelf
)
```

8. Create a logistic model to predict `low_fiber`. Is the model predictive?

```{r}
# Full logistic model, followed by automated stepwise selection for reference.
pred_logistic <- glm(
  factor(low_fiber) ~ Calories + Manufacturer + Sugars + Sodium + Shelf,
  data = cereal_mod,
  family = binomial
)
summary(pred_logistic)
step(pred_logistic)

# Manual backward elimination, removing one term per step.
# Drop Manufacturer.
pred_log1 <- glm(
  factor(low_fiber) ~ Calories + Sugars + Sodium + Shelf,
  data = cereal_mod,
  family = binomial
)
summary(pred_log1)

# Drop Calories.
pred_log2 <- glm(
  low_fiber ~ Sugars + Sodium + Shelf,
  data = cereal_mod,
  family = binomial
)
summary(pred_log2)

# Drop Sugars.
pred_log3 <- glm(low_fiber ~ Sodium + Shelf, data = cereal_mod, family = binomial)
summary(pred_log3)

# Drop Shelf.
pred_log4 <- glm(low_fiber ~ Sodium, data = cereal_mod, family = binomial)
summary(pred_log4)
```

```{r}
# Second attempt with a stricter target: only Fiber == 0 counts as low fiber.
low_fiber_2 <- ifelse(cereal_df$Fiber == 0, 1, 0)

cereal_mod_2 <- tibble(
  low_fiber_2,
  Calories = cereal_df$Calories,
  Manufacturer = cereal_df$Manufacturer,
  Sugars = cereal_df$Sugars,
  Sodium = cereal_df$Sodium,
  Shelf = cereal_df$Shelf
)

# The response must be low_fiber_2, the column that exists in cereal_mod_2.
# The earlier formulas named low_fiber, which is not in cereal_mod_2 and was
# therefore picked up from the global environment, so these models silently
# re-fitted the first target instead of the stricter one.
pred_logistic_2 <- glm(
  factor(low_fiber_2) ~ Calories + Manufacturer + Sugars + Sodium + Shelf,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_2)

# Step the model fitted in this chunk (previously re-ran step() on
# pred_logistic from the chunk above).
step(pred_logistic_2)

pred_logistic_3 <- glm(
  low_fiber_2 ~ Calories + Manufacturer,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_3)
confint(pred_logistic_3)

pred_logistic_4 <- glm(
  low_fiber_2 ~ Manufacturer,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_4)

# Profile-likelihood and Wald confidence intervals, log-likelihood, and a
# chi-squared analysis-of-deviance table for the Manufacturer-only model.
confint(pred_logistic_4)
confint.default(pred_logistic_4)
logLik(pred_logistic_4)
anova(pred_logistic_4, test = "Chisq")
```

The confidence intervals cross over 0, so nothing is learned here

<!-- NOTE(review): this conclusion was written when the models above were fitted to `low_fiber` rather than `low_fiber_2`; re-knit and confirm it still holds. -->