---
title: "Regression_lab"
author: "Kathleen"
date: "3/26/2019"
output: pdf_document
---

```{r setup, include=FALSE}
# Set the default chunk option so that chunk source code is echoed.
knitr::opts_chunk$set(echo = TRUE)
```


Load the necessary packages

```{r}
library(tidyverse)
library(e1071)
library(psych)
```


Let's start with the cereal data and get a sense of what it contains.

```{r cereal}
# Read the cereal data from the CSV file (relative path) into a tibble.
cereal_df <- read_csv(file = "cereal.csv")
```


1. Make distribution plots for each feature 


```{r}
# Bar chart of row counts for each Shelf value.
ggplot(data = cereal_df, mapping = aes(x = Shelf)) +
  geom_bar()

```

```{r}
# Bar chart of row counts for each Manufacturer value.
ggplot(data = cereal_df, mapping = aes(x = Manufacturer)) +
  geom_bar()
```


```{r visualize}
# Distribution of Calories: a count histogram first, then a density-scaled
# histogram with a kernel density estimate drawn on top of it.
ggplot(data = cereal_df, mapping = aes(x = Calories)) +
  geom_histogram(bins = 10)

# Use the same 10 bins over the observed range of Calories. The previous
# fixed breaks of -3..3 only suit a standardised variable; assuming Calories
# is on its raw scale, they would leave the histogram layer empty.
ggplot(data = cereal_df, mapping = aes(x = Calories)) +
  geom_histogram(aes(y = ..density..), bins = 10, position = "identity") +
  geom_density(aes(y = ..density..))

# Column-wise summary of the whole data set.
summary(cereal_df)

```

```{r}
# Distribution of Sodium: a count histogram, then a density-scaled histogram
# with a kernel density estimate drawn on top (15 bins in both plots).
ggplot(data = cereal_df, mapping = aes(x = Sodium)) +
  geom_histogram(bins = 15)

# The unused `breaks` vector that used to be assigned here was dropped; these
# plots are binned with `bins`, not `breaks`.
ggplot(data = cereal_df, mapping = aes(x = Sodium)) +
  geom_histogram(aes(y = ..density..), bins = 15, position = "identity") +
  geom_density(aes(y = ..density..))

# Column-wise summary of the whole data set.
summary(cereal_df)

# Shape statistics for Sodium.
skewness(cereal_df$Sodium)
kurtosis(cereal_df$Sodium)

```
Make sure you review your data for skew and kurtosis 

Skew: a measure of the asymmetry of the distribution.

Rules of thumb for skewness:

- A normal distribution has skewness = 0
- If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
- If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
- If the skewness is less than -1 or greater than 1, the data are highly skewed

Kurtosis: a measure of the weight in the combined tails of the distribution. Kurtosis decreases as the tails become lighter and increases as the tails become heavier. On the scale used here (a normal distribution has kurtosis 0):

- Normal distribution: kurtosis = 0
- Kurtosis < 0: less weight in the tails than a normal distribution
- Kurtosis > 0: more weight in the tails than a normal distribution

```{r}
# Distribution of Sugars: a count histogram, then a density-scaled histogram
# with a kernel density estimate drawn on top (16 bins in both plots).
ggplot(data = cereal_df, mapping = aes(x = Sugars)) +
  geom_histogram(bins = 16)

# The unused `breaks` vector that used to be assigned here was dropped; these
# plots are binned with `bins`, not `breaks`.
ggplot(data = cereal_df, mapping = aes(x = Sugars)) +
  geom_histogram(aes(y = ..density..), bins = 16, position = "identity") +
  geom_density(aes(y = ..density..))

# Column-wise summary of the whole data set.
summary(cereal_df)

# Shape statistics for Sugars.
skewness(cereal_df$Sugars)
kurtosis(cereal_df$Sugars)
```


```{r}

```


```{r}
# Distribution of Fiber: a count histogram, then a density-scaled histogram
# with a kernel density estimate drawn on top (15 bins in both plots).
ggplot(data = cereal_df, mapping = aes(x = Fiber)) +
  geom_histogram(bins = 15)

# The unused `breaks` vector that used to be assigned here was dropped; these
# plots are binned with `bins`, not `breaks`.
ggplot(data = cereal_df, mapping = aes(x = Fiber)) +
  geom_histogram(aes(y = ..density..), bins = 15, position = "identity") +
  geom_density(aes(y = ..density..))

# Column-wise summary of the whole data set.
summary(cereal_df)

# Shape statistics for Fiber.
skewness(cereal_df$Fiber)
kurtosis(cereal_df$Fiber)
```

2. Check out the correlations between the variables 


```{r}
library(corrplot)

# Correlation matrix of the columns in positions 4 to 9, drawn with circles.
correlations <- cor(x = cereal_df[, 4:9])
corrplot(corr = correlations, method = "circle")
```


```{r}
# Pairwise panel plot (psych) over every column of the data set.
pairs.panels(x = cereal_df)
```

4. Let's say we are trying to predict calories given the other variables. Which variables would you consider as potential features?


5. Build a linear model and use backward elimination to remove insignificant terms
```{r}
# Backward elimination for Calories: start from all candidate predictors and
# refit with fewer terms, printing a summary after each fit.

# Model 0: every candidate predictor.
lm_model_0 <- lm(
  Calories ~ Fiber + Carbs + Sugars + Sodium + Manufacturer + Shelf,
  data = cereal_df
)
summary(lm_model_0)

# Model 1: Manufacturer removed.
lm_model_1 <- lm(
  Calories ~ Fiber + Carbs + Sugars + Sodium + Shelf,
  data = cereal_df
)
summary(lm_model_1)

# Model 2: Fiber removed as well.
lm_model_2 <- lm(
  Calories ~ Carbs + Sugars + Sodium + Shelf,
  data = cereal_df
)
summary(lm_model_2)

# Model 3: Sodium removed, Shelf kept.
lm_model_3 <- lm(
  Calories ~ Carbs + Sugars + Shelf,
  data = cereal_df
)
summary(lm_model_3)

# Model 4: the alternative to model 3 -- Shelf removed, Sodium kept.
lm_model_4 <- lm(
  Calories ~ Carbs + Sugars + Sodium,
  data = cereal_df
)
summary(lm_model_4)

# Model 5: only Carbs and Sugars remain.
lm_model_5 <- lm(
  Calories ~ Carbs + Sugars,
  data = cereal_df
)
summary(lm_model_5)
```


6. Determine the caloric range for a cereal that has Manufacturer = Nabisco, Sodium = 80, Fiber = 20, Sugars = 7, Carbs = 8, Shelf = 1


```{r}
# Coefficients of the final linear model (Calories ~ Carbs + Sugars).
lm_model_5$coefficients

# lm_model_5 only contains Carbs and Sugars, so the cereal's other attributes
# (Manufacturer, Sodium, Fiber, Shelf) do not enter the prediction.
new_cereal <- data.frame(Carbs = 8, Sugars = 7)

# 95% prediction interval from predict(). Unlike a hand-built
# `fit +/- 1.96 * sigma`, this includes the uncertainty of the estimated
# coefficients and uses the t quantile for the residual degrees of freedom,
# and it does not rely on the position of each coefficient in the vector.
pred_interval <- predict(
  lm_model_5,
  newdata = new_cereal,
  interval = "prediction",
  level = 0.95
)
pred <- pred_interval[1, "fit"]
below <- pred_interval[1, "lwr"]
above <- pred_interval[1, "upr"]

# Residual standard error of the model, kept available for reference.
error <- sigma(lm_model_5)

cat("Range is", below, " to ", above)
```


7. Let's create a variable that represents low fiber. Looking at the distribution of fiber, let's define the values 0, 1 and 2 as low fiber (target value = 1); all other values are high fiber (target value = 0).


```{r}

# Binary target: 1 when Fiber is 0, 1 or 2 (low fiber), 0 for any other value.
# The value 2 was previously left out of the low-fiber group. Missing Fiber
# values stay NA because the comparisons propagate NA.
low_fiber <- ifelse(
  cereal_df$Fiber == 0 | cereal_df$Fiber == 1 | cereal_df$Fiber == 2,
  1,
  0
)

# Modelling table: the target plus the candidate predictors.
cereal_mod <- tibble(
  low_fiber,
  Calories = cereal_df$Calories,
  Manufacturer = cereal_df$Manufacturer,
  Sugars = cereal_df$Sugars,
  Sodium = cereal_df$Sodium,
  Shelf = cereal_df$Shelf
)
```

8. Create a logistic model to predict low_fiber. Is the model predictive?

```{r}
# Logistic regression for low_fiber, starting from all predictors in
# cereal_mod and refitting with fewer terms each time.

# Full model.
pred_logistic <- glm(
  factor(low_fiber) ~ Calories + Manufacturer + Sugars + Sodium + Shelf,
  family = binomial,
  data = cereal_mod
)
summary(pred_logistic)

# Automated stepwise selection on the full model, printed for comparison.
step(pred_logistic)

# Manufacturer removed.
pred_log1 <- glm(
  factor(low_fiber) ~ Calories + Sugars + Sodium + Shelf,
  family = binomial,
  data = cereal_mod
)
summary(pred_log1)

# Calories removed.
pred_log2 <- glm(
  low_fiber ~ Sugars + Sodium + Shelf,
  family = binomial,
  data = cereal_mod
)
summary(pred_log2)

# Sugars removed.
pred_log3 <- glm(
  low_fiber ~ Sodium + Shelf,
  family = binomial,
  data = cereal_mod
)
summary(pred_log3)

# Shelf removed; Sodium is the only predictor left.
pred_log4 <- glm(
  low_fiber ~ Sodium,
  family = binomial,
  data = cereal_mod
)
summary(pred_log4)

```


```{r}

# Stricter target: 1 only when Fiber is exactly 0, otherwise 0.
low_fiber_2 <- ifelse(cereal_df$Fiber == 0, 1, 0)

# Modelling table for the stricter target.
cereal_mod_2 <- tibble(
  low_fiber_2,
  Calories = cereal_df$Calories,
  Manufacturer = cereal_df$Manufacturer,
  Sugars = cereal_df$Sugars,
  Sodium = cereal_df$Sodium,
  Shelf = cereal_df$Shelf
)

# The response has to be low_fiber_2. cereal_mod_2 has no `low_fiber` column,
# so a formula written with `low_fiber` would silently pick up the earlier
# global vector and refit the previous target.
pred_logistic_2 <- glm(
  factor(low_fiber_2) ~ Calories + Manufacturer + Sugars + Sodium + Shelf,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_2)

# Stepwise selection on the model for the new target (not on pred_logistic).
step(pred_logistic_2)

# Reduced model: Calories and Manufacturer.
pred_logistic_3 <- glm(
  low_fiber_2 ~ Calories + Manufacturer,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_3)
confint(pred_logistic_3)

# Reduced model: Manufacturer only.
pred_logistic_4 <- glm(
  low_fiber_2 ~ Manufacturer,
  data = cereal_mod_2,
  family = binomial
)
summary(pred_logistic_4)

# Confidence intervals from confint() and from confint.default() for
# comparison, followed by the log-likelihood and a chi-square deviance table.
confint(pred_logistic_4)
confint.default(pred_logistic_4)
logLik(pred_logistic_4)
anova(pred_logistic_4, test = "Chisq")
```
The confidence intervals include 0, so none of the coefficients can be distinguished from zero and nothing is learned here.