# Walkthrough of basic string handling with stringr (loaded via tidyverse),
# compared side by side with the base R equivalents.
# Lines starting with `#>` show the output of the call directly above them.
library(tidyverse)

text <- c("one", "two", "three", NA, "five")

# how many characters in each string?
base::nchar(text)
#> [1]  3  3  5 NA  4

# this works fine
paste("University", "of", "California", "Berkeley")
#> [1] "University of California Berkeley"

# this works fine too
paste("University", "of", "California", "Berkeley")
#> [1] "University of California Berkeley"

# this is weird: the zero-length NULL is treated as "" but still gets a
# separator, hence the trailing space
paste("University", "of", "California", "Berkeley", NULL)
#> [1] "University of California Berkeley "

# this is ugly: each zero-length argument contributes its own separator
paste("University", "of", "California", "Berkeley", NULL, character(0),
      "Go Bears!")
#> [1] "University of California Berkeley   Go Bears!"

# default usage
str_c("May", "The", "Force", "Be", "With", "You")
#> [1] "MayTheForceBeWithYou"

# str_c removes zero length objects
str_c("May", "The", "Force", NULL, "Be", "With", "You", character(0))
#> [1] "MayTheForceBeWithYou"

# changing separator
str_c("May", "The", "Force", "Be", "With", "You", sep = "_")
#> [1] "May_The_Force_Be_With_You"

some_text <- c("one", "two", "three", NA, "five")

# compare 'str_length' with 'nchar'
nchar(some_text)
#> [1]  3  3  5 NA  4
str_length(some_text)
#> [1]  3  3  5 NA  4

some_factor <- factor(c(1, 1, 1, 2, 2, 2), labels = c("good", "bad"))
some_factor
#> [1] good good good bad  bad  bad
#> Levels: good bad

# try 'nchar' on a factor (left commented out because it raises an error)
# nchar(some_factor)
#> Error in nchar(some_factor): 'nchar()' requires a character vector

# now compare it with 'str_length'
str_length(some_factor)
#> [1] 4 4 4 3 3 3

## ----str_sub-----------------------------------------
# extract substrings from a string; negative positions count backwards
# from the end of the string
string <- "now is the time"
str_sub(string, start = 1L, end = -1L)

lorem <- "Lorem Ipsum"

# apply 'str_sub'
str_sub(lorem, start = 1, end = 5)
#> [1] "Lorem"

# equivalent to 'substring'
substring(lorem, first = 1, last = 5)
#> [1] "Lorem"

# another example with 3 different starting positions
str_sub("adios", 1:3)
#> [1] "adios" "dios"  "ios"

resto <- c("brasserie", "bistrot", "creperie", "bouchon")
# Continuation of the stringr walkthrough: substring extraction and
# replacement, duplication, padding, wrapping, trimming, word extraction and
# a first look at regular expressions (wildcard, escaping, character sets).
# Lines starting with `#>` show the output of the call directly above them.

# 'str_sub' with negative positions
str_sub(resto, start = -4, end = -1)
#> [1] "erie" "trot" "erie" "chon"

# compared to substring (useless)
substring(resto, first = -4, last = -1)
#> [1] "" "" "" ""

# before reviewing this read the help page for seq_len()
# extracting sequentially
str_sub(lorem, seq_len(nchar(lorem)))
#> [1] "Lorem Ipsum" "orem Ipsum" "rem Ipsum" "em Ipsum" "m Ipsum"
#> [6] " Ipsum" "Ipsum" "psum" "sum" "um"
#> [11] "m"
substring(lorem, seq_len(nchar(lorem)))
#> [1] "Lorem Ipsum" "orem Ipsum" "rem Ipsum" "em Ipsum" "m Ipsum"
#> [6] " Ipsum" "Ipsum" "psum" "sum" "um"
#> [11] "m"

# reverse substrings with negative positions
str_sub(lorem, -seq_len(nchar(lorem)))
#> [1] "m" "um" "sum" "psum" "Ipsum"
#> [6] " Ipsum" "m Ipsum" "em Ipsum" "rem Ipsum" "orem Ipsum"
#> [11] "Lorem Ipsum"

# 'substring' does not understand negative offsets
substring(lorem, -seq_len(nchar(lorem)))
#> [1] "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum"
#> [6] "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum" "Lorem Ipsum"
#> [11] "Lorem Ipsum"

## ----replacing_strings---------------------------------------------------
# replacing 'Lorem' with 'Nullam'
lorem <- "Lorem Ipsum"
str_sub(lorem, 1, 5) <- "Nullam"
lorem
#> [1] "Nullam Ipsum"

# replacing with negative positions
lorem <- "Lorem Ipsum"
str_sub(lorem, -1) <- "Nullam"
lorem
#> [1] "Lorem IpsuNullam"

# multiple replacements (each range is replaced in its own copy of the string)
lorem <- "Lorem Ipsum"
str_sub(lorem, c(1, 7), c(5, 8)) <- c("Nullam", "Enim")
lorem
#> [1] "Nullam Ipsum" "Lorem Enimsum"

## ----duplicate_strings---------------------------------------------------
times <- 2
string <- "Hello"
str_dup(string, times)

# default usage
str_dup("hola", 3)
#> [1] "holaholahola"

# use with different 'times'
str_dup("adios", 1:3)
#> [1] "adios" "adiosadios" "adiosadiosadios"

# use with a string vector
# (note: this global 'words' takes precedence over stringr's 'words' dataset)
words <- c("lorem", "ipsum", "dolor", "sit", "amet")
str_dup(words, 2)
#> [1] "loremlorem" "ipsumipsum" "dolordolor" "sitsit" "ametamet"
str_dup(words, 1:5)
#> [1] "lorem" "ipsumipsum" "dolordolordolor"
#> [4] "sitsitsitsit" "ametametametametamet"

## ----padding_strings-----------------------------------------------------
string_ex <- "Gandalf"
width <- 20
str_pad(string_ex, width, side = "left", pad = " ")

# what happens if string length is longer than width?
# valid values for side: c("left", "right", "both")
str_pad(string_ex, 2, side = "left", pad = " ")

# default usage
str_pad("hola", width = 7)
#> [1] "   hola"

# pad both sides
str_pad("adios", width = 7, side = "both")
#> [1] " adios "

# left padding with '#'
str_pad("hashtag", width = 8, pad = "#")
#> [1] "#hashtag"

# pad both sides with '-'
str_pad("hashtag", width = 9, side = "both", pad = "-")
#> [1] "-hashtag-"

## ----text_formatting-----------------------------------------------------
# format (usage signature, shown for reference only):
#   str_wrap(string, width = 80, indent = 0, exdent = 0)

# quote (by Douglas Adams)
some_quote <- c(
  "I may not have gone",
  "where I intended to go,",
  "but I think I have ended up",
  "where I needed to be")

# some_quote in a single paragraph
some_quote <- paste(some_quote, collapse = " ")

# display paragraph with width=30
cat(str_wrap(some_quote, width = 30))
#> I may not have gone where I
#> intended to go, but I think I
#> have ended up where I needed
#> to be

# display paragraph with first line indentation of 2
cat(str_wrap(some_quote, width = 30, indent = 2), "\n")
#>   I may not have gone where I
#> intended to go, but I think I
#> have ended up where I needed
#> to be

# display paragraph with following lines indentation of 3
cat(str_wrap(some_quote, width = 30, exdent = 3), "\n")
#> I may not have gone where I
#>    intended to go, but I think I
#>    have ended up where I needed
#>    to be

## ----trim_strings--------------------------------------------------------
string_ex <- " Gandalf the Grey "
str_trim(string_ex, side = "both")

# text with whitespaces
bad_text <- c("This", " example ", "has several ", " whitespaces ")

# remove whitespaces on the left side
str_trim(bad_text, side = "left")
#> [1] "This" "example " "has several " "whitespaces "

# remove whitespaces on the right side
str_trim(bad_text, side = "right")
#> [1] "This" " example" "has several" " whitespaces"

# remove whitespaces on both sides
str_trim(bad_text, side = "both")
#> [1] "This" "example" "has several" "whitespaces"

## ----extracting_words----------------------------------------------------
# format (usage signature, shown for reference only; it cannot be run as-is
# because 'end = start' refers to the function's own 'start' argument):
#   word(string, start = 1L, end = start, sep = fixed(" "))

# example
string_ex <- "I am looking for someone to share in an adventure that I am arranging, and it's very difficult to find anyone."

# some sentence
change <- c("Be the change", "you want to be")
change

# extract first word
word(change, 1)
#> [1] "Be" "you"

# extract second word
word(change, 2)
#> [1] "the" "want"

# extract last word
word(change, -1)
#> [1] "change" "be"

# extract all but the first words
word(change, 2, -1)
#> [1] "the change" "want to be"

## ----wild_metacharacter--------------------------------------------------
library(htmltools)
library(htmlwidgets)

# '.' matches any single character, so "n.t" matches n + any character + t
not <- c("not", "note", "knot", "nut")
str_view(not, "n.t")

## ----locating_patterns---------------------------------------------------
x <- c("abcd", "a22bc1d", "ab3453cd46", "a1bc44d")

# locate 1st sequence of 1 or more consecutive numbers
str_locate(x, "[0-9]+")
# locate every such sequence in each string
str_locate_all(x, "[0-9]+")

## ----escaping_characters-------------------------------------------------
# "\\." escapes the dot so it matches a literal '.' instead of any character
fives <- c("5.00", "5100", "5-00", "5 00")
str_view(fives, "5\\.00")

## ----character_sets------------------------------------------------------
pns <- c('pan', 'pen', 'pin', 'pon', 'pun')
str_view(pns, "p[aeiou]n")

pnx <- c('pan', 'pen', 'pin', 'p0n', 'p.n', 'p1n', 'paun')
str_view(pnx, "p[aeiou]n")

my_lower <- "[abcdefghijklmnopqrstuvwxyz]"
# instead of listing all you can use the range character -
my_upper <- "[A-Z]"

basic <- c('1', 'a', 'A', '&', '-', '^')

# digits
str_view(basic, '[0-9]')

# lower case letters
str_view(basic, '[a-z]')

# upper case letters
str_view(basic, '[A-Z]')
# Regular-expression examples with stringr: consecutive character sets,
# negated sets, escaping inside sets, POSIX classes, back-references and
# splitting. Relies on stringr (str_view, str_replace, str_split, the
# 'sentences' dataset) and the %>% pipe being attached already.
triplets <- c('123', 'abc', 'ABC', ':-)')

# matching consecutive characters
str_view(triplets, '[0-9][0-9][0-9]')
str_view(triplets, '[A-Z][A-Z][A-Z]')

## ----negation_set--------------------------------------------------------
# specifying the pattern you do not want
basic <- c('1', 'a', 'A', '&', '-', '^')
str_view(basic, '[^A-Z]')

# the caret must be the first character listed to negate the set;
# this is different from the pattern above: here '^' is a literal member
str_view(basic, '[A-Z^]')

# an escaped '-' is a literal minus sign rather than a range operator
str_view(basic, '[A\\-]')

# match everything BUT the caret
str_view(basic, '[^^]')

pnx <- c('pan', 'pen', 'pin', 'p0n', 'p.n', 'p1n', 'paun')

# the '.' metacharacter inside a character set loses its special meaning and
# is just a normal character
str_view(pnx, "p[ae.io]n")

# the characters used to define a character set do not follow this rule:
# the closing bracket, the caret, the minus sign and the backslash need to be
# escaped to be matched literally
basic <- c('1', 'a', 'A', '&', '-', '^', '[', ']')
str_view(basic, "[a\\^\\-]")

# there are also POSIX character classes that are represented via terms;
# [:alpha:] matches alphabetic characters (letters only, no digits)
pnx <- c('pan', 'pen', 'pin', 'p0n', 'p.n', 'p1n', 'paun')
str_view(pnx, "[[:alpha:]]")
str_view(pnx, "[[:alpha:]]+")

# use references \\1 \\2 \\3 to access matched components
head(sentences)

# swap the second and third words of each sentence
sentences %>%
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
  head(5)

# 'sentences' itself is not modified by the pipeline above
head(sentences)

# str_split to define method to break up a string
# simplify = TRUE returns a character matrix instead of a list
sentences %>%
  head(5) %>%
  str_split(" ", simplify = TRUE)

# default: a list with one character vector of pieces per input string
ret <- sentences %>%
  head(5) %>%
  str_split(" ")