library(nycflights13) library(tidyverse) # functions we know in dplyr # filter() select rows from a table filter(dataframe, criteria) # equivalent to dataframe[criteria] filter(flights, month == 1, day == 1) # slice() can be used if you know the rows you want # slice(dataframe 1:20) retrieve the first 20 rows in dataframe slice(flights,1:20) # define an order to the result # arrange() arrange(flights, desc(year),desc(month),desc(day)) # select() (and rename()) # distinct() # mutate() (and transmute()) # summarise() # sample_n() (and sample_frac())# #spread /gather example students <- frame_data( ~name, ~id, ~`2017`, ~`2018`, "jason lee", 1, 15, 0, "heidi spunky", 2, 8, 8 ) # gather a variable that is acrossed multiple columns students %>% gather(`2017`, `2018`, key = "year", value = "num_courses") students2 <- frame_data( ~name, ~id, ~`year`, ~statistic, ~`count`, "jason lee", 1, 2017, "num courses", 4, "jason lee", 1, 2017, "num credits", 20, "heidi spunky", 2, 2017, "num courses", 3, "heidi spunky", 2, 2017, "num credits", 3 ) # spreading: data is across multiple rows students2 %>% spread(key = statistic, value = count) # other example from dplyr() # date when data measured, X,Y,Z are stock names , values is # price of stock on that day stocks <- data.frame( time = as.Date('2009-01-01') + 0:9, X = rnorm(10, 0, 1), Y = rnorm(10, 0, 2), Z = rnorm(10, 0, 4) ) # concept is stock price on a specific day # want to represent the relationship "value" between stock and calendar day stocksm <- stocks %>% gather(stock, price, -time) stocksm %>% spread(stock, price) stocksm %>% spread(time, price) # Spread and gather are complements df <- data.frame(x = c("a", "b"), y = c(3, 4), z = c(5, 6)) df df %>% spread(x, y) df %>% spread(x, y) %>% gather(x, y, a:b, na.rm = TRUE) # Use 'convert = TRUE' to produce variables of mixed type df <- data.frame(row = rep(c(1, 51), each = 3), var = c("Sepal.Length", "Species", "Species_num"), value = c(5.1, "setosa", 1, 7.0, "versicolor", 2)) df %>% spread(var, value) %>% str df %>% spread(var, value, convert = TRUE) df %>% spread(var, value, convert = TRUE) %>% str # more examples with spread and gather stocks <- tibble( year = c(2015, 2015, 2016, 2016), half = c( 1, 2, 1, 2), return = c(1.88, 0.59, 0.92, 0.17) ) stocks %>% spread(year, return) %>% gather("year", "return", `2015`:`2016`) stocks <- tibble( year = c(2015, 2015, 2016, 2016), half = c( 1, 2, 1, 2), return = c(1.88, 0.59, 0.92, 0.17) ) # what does the argument convert do? stocks %>% spread(year, return) # lets figure out what convert does stocks %>% spread(year, return,convert=FALSE) # separate default_t <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) default_t <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% # warning removed a value g, since values 4 vals in row 2 separate(x, c("one", "two", "three")) # inserts an NA since 2nd row only has 2 values tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three")) # inserts an NA since 2nd row only has 2 values tibble(x = c("a,b,c", "d,e,f", "f,g,i")) %>% separate(x, c("one", "two", "three")) # optional parameters to separate filled_tl <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), fill="left") filled_tr <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), fill="right") filled_ted <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), extra="drop") filled_tem <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% separate(x, c("one", "two", "three"), extra="merge") tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% separate(x, c("one", "two", "three")) treatment <- tribble( ~ person, ~ treatment, ~response, "Derrick Whitmore", 1, 7, NA, 2, 10, NA, 3, 9, "Katherine Burke", 1, 4 ) treatment %>% fill(person) who1 <- who %>% gather(new_sp_m014:newrel_f65, key = "key", value = "cases", na.rm = TRUE) who1 who1 %>% count(key) # Lesson 2 # relational data operations planes %>% count(tailnum) %>% filter(n > 1) #> # A tibble: 0 × 2 #> # ... with 2 variables: tailnum , n weather %>% count(year, month, day, hour, origin) %>% filter(n > 1) flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier) flights2 # join data with a left join flights2 %>% select(-origin, -dest) %>% left_join(airlines, by = "carrier") # by specifying the column in common flights2 %>% # how does the result of this statement differ from statement above? select(-origin, -dest) %>% mutate(name = airlines$name[match(carrier, airlines$carrier)]) # set operations from Wickham's book df1 <- tribble( ~x, ~y, 1, 1, 2, 1 ) df2 <- tribble( ~x, ~y, 1, 1, 1, 2 ) # operations work across multiple columns dplyr::intersect(df1, df2) #> # A tibble: 1 × 2 #> x y #> #> 1 1 1 # Note that we get 3 rows, not 4 dplyr::union(df1, df2) #> # A tibble: 3 × 2 #> x y #> #> 1 1 2 #> 2 2 1 #> 3 1 1 dplyr::setdiff(df1, df2) #> # A tibble: 1 × 2 #> x y #> #> 1 2 1 dplyr::setdiff(df2, df1) #> # A tibble: 1 × 2 #> x y #> #> 1 1 2