Column and row binds
The equivalents to base R’s rbind
and cbind
are bind_rows
and bind_cols
, respectively.
bind_cols(iris_df, iris_df)[1:2,]
## New names:
## • `Sepal.Length` -> `Sepal.Length...1`
## • `Sepal.Width` -> `Sepal.Width...2`
## • `Petal.Length` -> `Petal.Length...3`
## • `Petal.Width` -> `Petal.Width...4`
## • `Species` -> `Species...5`
## • `Sepal.Length` -> `Sepal.Length...6`
## • `Sepal.Width` -> `Sepal.Width...7`
## • `Petal.Length` -> `Petal.Length...8`
## • `Petal.Width` -> `Petal.Width...9`
## • `Species` -> `Species...10`
[ scroll down to continue ]
bind_rows(iris_df, iris_df)[1:2,]
Extract column as vector
The subsetting operators [[
and $
can be used to extract from a tibble
single columns as vector.
iris_df[[5]][1:12]
## [1] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
iris_df$Species[1:12]
## [1] "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
Important dplyr
functions
filter()
and slice()
arrange()
select()
and rename()
distinct()
mutate()
and transmute()
summarise()
sample_n()
and sample_frac()
Slice and filter functions
Filter function
filter(iris_df, Sepal.Length > 7.5, Species=="virginica")
Base R code equivalent
iris_df[iris_df[, "Sepal.Length"] > 7.5 & iris_df[, "Species"]=="virginica", ]
Including boolean operators
filter(iris_df, Sepal.Length > 7.5 | Sepal.Length < 5.5, Species=="virginica")
Subset rows by position
dplyr
approach
slice(iris_df, 1:2)
Base R code equivalent
iris_df[1:2,]
Subset rows by names
Since tibbles
do not contain row names, row wise subsetting via the [,]
operator cannot be used. However, the corresponding behavior can be achieved by passing to select
a row position index obtained by basic R intersect utilities such as match
.
Create a suitable test tibble
df1 <- bind_cols(data_frame(ids1=paste0("g", 1:10)), as_data_frame(matrix(1:40, 10, 4, dimnames=list(1:10, paste0("CA", 1:4)))))
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
df1
dplyr
approach
slice(df1, match(c("g10", "g4", "g4"), ids1))
Base R equivalent
df1_old <- as.data.frame(df1)
rownames(df1_old) <- df1_old[,1]
df1_old[c("g10", "g4", "g4"),]
Sorting with arrange
Row-wise ordering based on specific columns
dplyr
approach
arrange(iris_df, Species, Sepal.Length, Sepal.Width)
For ordering descendingly use desc()
function
arrange(iris_df, desc(Species), Sepal.Length, Sepal.Width)
Base R code equivalent
iris_df[order(iris_df$Species, iris_df$Sepal.Length, iris_df$Sepal.Width), ]
iris_df[order(iris_df$Species, decreasing=TRUE), ]
Select columns with select
Select specific columns
select(iris_df, Species, Petal.Length, Sepal.Length)
Select range of columns by name
select(iris_df, Sepal.Length : Petal.Width)
Drop specific columns (here range)
select(iris_df, -(Sepal.Length : Petal.Width))
Renaming columns with rename
dplyr
approach
rename(iris_df, new_col_name = Species)
Base R code approach
colnames(iris_df)[colnames(iris_df)=="Species"] <- "new_col_names"
Obtain unique rows with distinct
dplyr
approach
distinct(iris_df, Species, .keep_all=TRUE)
Base R code approach
iris_df[!duplicated(iris_df$Species),]
Add columns
mutate
The mutate
function allows to append columns to existing ones.
mutate(iris_df, Ratio = Sepal.Length / Sepal.Width, Sum = Sepal.Length + Sepal.Width)
transmute
The transmute
function does the same as mutate
but drops existing columns
transmute(iris_df, Ratio = Sepal.Length / Sepal.Width, Sum = Sepal.Length + Sepal.Width)
bind_cols
The bind_cols
function is the equivalent of cbind
in base R. To add rows, use the corresponding bind_rows
function.
bind_cols(iris_df, iris_df)
## New names:
## • `Sepal.Length` -> `Sepal.Length...1`
## • `Sepal.Width` -> `Sepal.Width...2`
## • `Petal.Length` -> `Petal.Length...3`
## • `Petal.Width` -> `Petal.Width...4`
## • `Species` -> `Species...5`
## • `Sepal.Length` -> `Sepal.Length...6`
## • `Sepal.Width` -> `Sepal.Width...7`
## • `Petal.Length` -> `Petal.Length...8`
## • `Petal.Width` -> `Petal.Width...9`
## • `Species` -> `Species...10`
Summarize data
Summary calculation on single column
summarize(iris_df, mean(Petal.Length))
Summary calculation on many columns
summarize_all(iris_df[,1:4], mean)
Summarize by grouping column
summarize(group_by(iris_df, Species), mean(Petal.Length))
Aggregate summaries
summarize_all(group_by(iris_df, Species), mean)
Note: group_by
does the looping for the user similar to aggregate
or tapply
.
Merging tibbles
The dplyr
package provides several join functions for merging tibbles
by a common key column similar to the merge
function in base R. These *_join
functions include:
inner_join()
: returns join only for rows matching among both tibbles
full_join()
: returns join for all (matching and non-matching) rows of two tibbles
left_join()
: returns join for all rows in first tibble
right_join()
: returns join for all rows in second tibble
anti_join()
: returns for first tibble
only those rows that have no match in the second one
Sample tibbles
to illustrate *.join
functions.
df1 <- bind_cols(data_frame(ids1=paste0("g", 1:10)), as_data_frame(matrix(1:40, 10, 4, dimnames=list(1:10, paste0("CA", 1:4)))))
df1
df2 <- bind_cols(data_frame(ids2=paste0("g", c(2,5,11,12))), as_data_frame(matrix(1:16, 4, 4, dimnames=list(1:4, paste0("CB", 1:4)))))
df2
Inner join
inner_join(df1, df2, by=c("ids1"="ids2"))
Left join
left_join(df1, df2, by=c("ids1"="ids2"))
Right join
right_join(df1, df2, by=c("ids1"="ids2"))
Full join
full_join(df1, df2, by=c("ids1"="ids2"))
Anti join
anti_join(df1, df2, by=c("ids1"="ids2"))
For additional join options users want to cosult the *_join
help pages.