# Tour of the core dplyr verbs using the `diamonds` dataset.
library(tidyverse)

# SELECT - choose columns ----

# `<-` is the assignment operator
select_diamonds <- select(diamonds, carat, price)
select_diamonds

# change to include other variables such as the variables that describe x, y, z
# sometimes it is easier to specify what you do not want
select(diamonds, -x, -y, -z, -price)
select(diamonds, starts_with("c"))

# quoted "c" is a character string; unquoted c is the base combine function
mode("c")
class("c")
mode(c)

# FILTER - choose rows ----

# we have many ways to create a filter: greater than, greater than or equal,
# equal, less than, less than or equal
filter(diamonds, carat > 0.5)
filter(diamonds, carat >= 0.5)
filter(diamonds, cut == "Ideal")
filter(diamonds, cut %in% c("Ideal", "Premium"))

head(diamonds)

# SUMMARY STATISTICS - creating summaries ----

summarize(diamonds, carat_mean = mean(carat, na.rm = TRUE))

# functions like min, max, number of rows
summarize(
  diamonds,
  carat_mean = mean(carat, na.rm = TRUE),
  carat_median = median(carat, na.rm = TRUE),
  num_observations = n()
)

# investigate other summary statistics
summarize(group_by(diamonds, cut), carat_mean = mean(carat, na.rm = TRUE))

# ARRANGE - specify the order of the output ----

# default is ascending order
arrange(diamonds, cut, color)
arrange(diamonds, desc(cut), color)
s <- arrange(diamonds, desc(cut), min_rank(desc(color)))
# ordering functions: min_rank(), row_number(), dense_rank(), percent_rank(),
# cume_dist(), ntile()

# RENAME / REORDER ----

# rename a column in the results: width is the new name for x
rename(diamonds, width = x)

# reorder diamonds to have x, y, z as the first 3 columns
select(diamonds, x, y, z, everything())

# MUTATE - add new columns ----

# mutate adds a new column to the result
m1 <- mutate(diamonds, volume = x * y * z)
m2 <- select(m1, volume, carat)
# Filter before dropping columns: m2 no longer has `cut`, and "Ideal" is a
# value of `cut` (see the filter examples above), not of `color`.
m3 <- select(filter(m1, cut == "Ideal"), volume, carat)

# only variable in the result is the new column, volume
transmute(diamonds, volume = x * y * z)

# floor() rounds down, ceiling() rounds up, round(v, 1) keeps one decimal
h <- mutate(diamonds, volume_min = floor(x * y * z))
h2 <- mutate(diamonds, volume = round(x * y * z, 1))
band <- mutate(
  diamonds,
  volume_min = floor(x * y * z),
  volume_max = ceiling(x * y * z)
)
?floor

# What are the different diamond colors?
# Grouping and then summarizing with no summary columns returns one row per
# group, i.e. the distinct values of `color`.
summarize(group_by(diamonds, color))

# How many distinct colors do we have in the dataset?
summarize(diamonds, color_n = n_distinct(color))

# it gets confusing to track the nesting parentheses
summarize(group_by(diamonds, cut), color_n = n_distinct(color))

# COUNT ----

# if we want to know the number of diamonds with a specific cut
# (count() has no `weight` argument; writing `weight = cut` would only rename
# the grouping column to `weight`)
count(diamonds, cut)

# with no grouping columns, count() returns the total number of rows
count(diamonds)

# count the number of diamonds with a specific carat
# NOTE: `weight = carat` is NOT a weighted tally. It is a named grouping
# expression, so the result is grouped by carat in a column called `weight`.
count(diamonds, weight = carat)

# what is the difference here? Only the name of the grouping column
# (`carat` instead of `weight`); the counts are identical.
count(diamonds, carat)

# another method
# NOTE: select() does not group, so this returns only the total row count;
# pass the columns to count() to get one row per combination.
count(select(diamonds, cut, carat))

# Pipes make it easier to define the necessary transformations for the data.
# In this example `wt` is an argument to count(), performing a weighted tally
# (the sum of carat within each cut) - which is typically what you want to do.
diamonds %>%
  count(cut, wt = carat)