Reference for ‘tidyr’ package
November 2, 2015
tidy data: every column in the data frame represents a variable and every row represents an observation — also referred to as long format
Overview
gather()
andspread()
convert between long and wide dataseparate()
splits a single column into servaral variables and is commonly used in conjunction withgather()
library(tidyr); library(dplyr)
set.seed(1)
tidyr.ex <- data.frame(
participant = c("p1", "p2", "p3", "p4", "p5", "p6"),
info = c("g1m", "g1m", "g1f", "g2m", "g2m", "g2m"),
day1score = rnorm(n = 6, mean = 80, sd = 15),
day2score = rnorm(n = 6, mean = 88, sd = 8)
)
print(tidyr.ex)
## participant info day1score day2score
## 1 p1 g1m 70.60319 91.89943
## 2 p2 g1m 82.75465 93.90660
## 3 p3 g1f 67.46557 92.60625
## 4 p4 g2m 103.92921 85.55689
## 5 p5 g2m 84.94262 100.09425
## 6 p6 g2m 67.69297 91.11875
gather: convert wide data to long data
- takes columns and gathers them into new variable
gather(df, newVar1, newVar2, vector1, vector2)
tidyr.ex %>%
gather(day, score, c(day1score, day2score))
## participant info day score
## 1 p1 g1m day1score 70.60319
## 2 p2 g1m day1score 82.75465
## 3 p3 g1f day1score 67.46557
## 4 p4 g2m day1score 103.92921
## 5 p5 g2m day1score 84.94262
## 6 p6 g2m day1score 67.69297
## 7 p1 g1m day2score 91.89943
## 8 p2 g1m day2score 93.90660
## 9 p3 g1f day2score 92.60625
## 10 p4 g2m day2score 85.55689
## 11 p5 g2m day2score 100.09425
## 12 p6 g2m day2score 91.11875
spread: convert long data to wide data
- takes different levels of a factor and spreads them out into different columns
spread(df, var1, var2)
# returns back original data frame
tidyr.ex %>%
gather(day, score, c(day1score, day2score)) %>%
spread(day, score)
## participant info day1score day2score
## 1 p1 g1m 70.60319 91.89943
## 2 p2 g1m 82.75465 93.90660
## 3 p3 g1f 67.46557 92.60625
## 4 p4 g2m 103.92921 85.55689
## 5 p5 g2m 84.94262 100.09425
## 6 p6 g2m 67.69297 91.11875
separate: separates one column into many
- takes values in a column and creates new columns
separate(df, col, into, sep)
# sep can be numeric (length) or character
tidyr.ex %>%
gather(day, score, c(day1score, day2score)) %>%
separate(col = info, into = c("group", "gender"), sep = 2)
## participant group gender day score
## 1 p1 g1 m day1score 70.60319
## 2 p2 g1 m day1score 82.75465
## 3 p3 g1 f day1score 67.46557
## 4 p4 g2 m day1score 103.92921
## 5 p5 g2 m day1score 84.94262
## 6 p6 g2 m day1score 67.69297
## 7 p1 g1 m day2score 91.89943
## 8 p2 g1 m day2score 93.90660
## 9 p3 g1 f day2score 92.60625
## 10 p4 g2 m day2score 85.55689
## 11 p5 g2 m day2score 100.09425
## 12 p6 g2 m day2score 91.11875
unite: opposite of separate
- not used frequently
unit(df, newVarName, col1, col2)
# default combines with "_"
tidyr.ex %>%
gather(day, score, c(day1score, day2score)) %>%
separate(col = info, into = c("group", "gender"), sep = 2) %>%
unite(infoAgain, group, gender)
## participant infoAgain day score
## 1 p1 g1_m day1score 70.60319
## 2 p2 g1_m day1score 82.75465
## 3 p3 g1_f day1score 67.46557
## 4 p4 g2_m day1score 103.92921
## 5 p5 g2_m day1score 84.94262
## 6 p6 g2_m day1score 67.69297
## 7 p1 g1_m day2score 91.89943
## 8 p2 g1_m day2score 93.90660
## 9 p3 g1_f day2score 92.60625
## 10 p4 g2_m day2score 85.55689
## 11 p5 g2_m day2score 100.09425
## 12 p6 g2_m day2score 91.11875