library(nycflights13)
library(tidyverse)

# functions we know in dplyr
# filter() select rows from a table filter(dataframe, criteria)
# equivalent to dataframe[criteria]
filter(flights, month == 1, day == 1)

# slice() can be used if you know the rows you want
# slice(dataframe 1:20) retrieve the first 20 rows in dataframe
slice(flights,1:20)

# define an order to the result
# arrange()
arrange(flights, desc(year),desc(month),desc(day))

# select() (and rename())
# distinct()
# mutate() (and transmute())
# summarise()
# sample_n() (and sample_frac())#

#spread /gather example

students <- frame_data(
  ~name, ~id,  ~`2017`, ~`2018`,
  "jason lee", 1, 15, 0,
  "heidi spunky", 2, 8, 8
)

# gather a variable that is acrossed multiple columns
students %>%  gather(`2017`, `2018`, key = "year", value = "num_courses")

students2 <- frame_data(
  ~name, ~id,  ~`year`, ~statistic, ~`count`,
  "jason lee", 1, 2017, "num courses", 4,
  "jason lee", 1, 2017, "num credits", 20,
  "heidi spunky", 2,  2017, "num courses", 3,
  "heidi spunky", 2, 2017, "num credits", 3
)

# spreading: data is across multiple rows
students2 %>% spread(key = statistic, value = count)



# other example from dplyr()
# date when data measured, X,Y,Z are stock names , values is
# price of stock on that day
stocks <- data.frame(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)

# concept is stock price on a specific day
# want to represent the relationship "value" between stock and calendar day
stocksm <- stocks %>% gather(stock, price, -time)

stocksm %>% spread(stock, price)
stocksm %>% spread(time, price)

# Spread and gather are complements

df <- data.frame(x = c("a", "b"), y = c(3, 4), z = c(5, 6))
df
df %>% spread(x, y)
df %>% spread(x, y) %>% gather(x, y, a:b, na.rm = TRUE)

# Use 'convert = TRUE' to produce variables of mixed type
df <- data.frame(row = rep(c(1, 51), each = 3),
                 var = c("Sepal.Length", "Species", "Species_num"),
                 value = c(5.1, "setosa", 1, 7.0, "versicolor", 2))
 df %>% spread(var, value) %>% str
df %>% spread(var, value, convert = TRUE)
df %>% spread(var, value, convert = TRUE) %>% str

# more examples with spread and gather
stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
  spread(year, return) %>%
  gather("year", "return", `2015`:`2016`)


stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)

# what does the argument convert do?
stocks %>%
  spread(year, return)
# lets figure out what convert does
stocks %>%
  spread(year, return,convert=FALSE)

# separate
default_t <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j"))
default_t <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% # warning removed a value g, since values 4 vals in row 2
  separate(x, c("one", "two", "three"))

# inserts an NA since 2nd row only has 2 values
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))


# inserts an NA since 2nd row only has 2 values
tibble(x = c("a,b,c", "d,e,f", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))


# optional parameters to separate

filled_tl <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), fill="left")

filled_tr <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), fill="right")

filled_ted <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra="drop")

filled_tem <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra="merge")

tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

treatment %>%
  fill(person)

who1 <- who %>%
  gather(new_sp_m014:newrel_f65, key = "key", value = "cases", na.rm = TRUE)
who1

who1 %>%
  count(key)


# Lesson 2
# relational data operations
planes %>%
  count(tailnum) %>%
  filter(n > 1)
#> # A tibble: 0 × 2
#> # ... with 2 variables: tailnum <chr>, n <int>

weather %>%
  count(year, month, day, hour, origin) %>%
  filter(n > 1)

flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2

# join data with a left join
flights2 %>%
  select(-origin, -dest) %>%
  left_join(airlines, by = "carrier") # by specifying the column in common

flights2 %>% # how does the result of this statement differ from statement above?
  select(-origin, -dest) %>%
  mutate(name = airlines$name[match(carrier, airlines$carrier)])

# set operations from Wickham's book
df1 <- tribble(
  ~x, ~y,
  1,  1,
  2,  1
)
df2 <- tribble(
  ~x, ~y,
  1,  1,
  1,  2
)
# operations work across multiple columns
dplyr::intersect(df1, df2)
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1

# Note that we get 3 rows, not 4
dplyr::union(df1, df2)
#> # A tibble: 3 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2
#> 2     2     1
#> 3     1     1

dplyr::setdiff(df1, df2)
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     2     1

dplyr::setdiff(df2, df1)
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2