https://twitter.com/milesmcbain/status/1194029161490202625?s=12
Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = verbose)
and
Sys.setenv("_R_CHECK_LENGTH_1_CONDITION_" = TRUE)
dplyr::group_indices()
data.frame(x=c(0,0,0,1,1,1), y=c(0,0,1,0,1,1))
group_indices(df, x, y)
df %>% mutate(pattern = group_indices_(df, .dots = c('x', 'y')))
library(dplyr)
case_when
cars %>% as_tibble %>% mutate(case_code = case_when(
speed == 4 & dist == 2 ~ "this",
dist > 6 & dist == 10 ~ "is",
speed >=10 & dist >= 18 ~ "awesome"))
df %>% group_by(A, B) %>% top_n(n=1, wt= C)
df %>% group_by(A,B) %>% slice(which.max(C))
df %>% group_by(A, B) %>% filter(value == max(C))
df %>% group_by(A,B) %>% filter(all(C>10))
df %>% group_by(A,B) %>% filter(any(C>10))
df %>% arrange(stopSequence) %>% group_by(id) %>% slice(c(1,n()))
library(purrr)
library(dplyr)
bob %>% map_if(is.factor, as.character)
bob %>% mutate_if(is.factor, as.character)
p <- ggplot(diamonds, aes(x = carat, y = price))
# Use cut_interval
p + geom_boxplot(aes(group = cut_interval(carat, n=10)))
# Use cut_number
p + geom_boxplot(aes(group = cut_number(carat, n =10)))
# Use cut_width
p + geom_boxplot(aes(group = cut_width(carat, width = 0.25)))
diamonds %>% count(cut_width(carat, 0.5))
ggplot(mpg) + geom_boxplot(aes(x = reoder(class, hwy, FUN = median), y = hwy))
ggplot(diamonds, aes(x = cut, y = price)) + geom_boxplot(varwidth = TRUE)
library(dplyr)
mammals2 <- mammals %>%
group_by(vore) %>%
mutate(n = n()/nrow(mammals))
ggplot(mammals2, aes(x = vore, y = sleep_total, fill = vore)) +
geom_violin(aes(weight = n), col = NA)
library(tibble)
df<- tibble(category=LETTERS[1:3], b=1:3)
x<- c("C", "A", "B")
# reorder
df %>% slice(match(x, category))
# A tibble: 3 × 2
category b
<chr> <int>
1 C 3
2 A 1
3 B 2
https://stackoverflow.com/questions/46129322/arranging-rows-in-custom-order-using-dplyr
> df<- data.frame(num = c(1,3,4,5,3,2), type = c("A", "B", "C", "C", "A", "B"))
> df
num type
1 1 A
2 3 B
3 4 C
4 5 C
5 3 A
6 2 B
> df %>% arrange(match(type, c("C", "A", "B")), desc(num))
num type
1 5 C
2 4 C
3 3 A
4 1 A
5 3 B
6 2 B
df %>% group_by(A,B) %>% filter(all(C >10))
The default combined density plot extends the range of all values to the total extent of the entire dataset, which may be a bit confusing. In the fourth plot, adjust for this by setting trim = TRUE inside
geom_density()
. However, be cautious. Since the distributions are cut off at the extreme ends, the area under the curve technically is not equal to one anymore.
require(GGally)
ggparcoord(iris, columns = 1:4, groupColumn = 5, scale = "globalminmax", order = "anyClass", alphaLines = 0.4)
As you can probably imagine, distance matrices (class dist) contain the measured distance between all pair-wise combinations of many points. For example, the eurodist dataset contains the distances between major European cities. dist objects lend themselves well to ggfortify::autoplot()
.
The stats::cmdscale()
function performs Classical Multi-Dimensional Scaling and returns point coodinates as a matrix. Although autoplot will work on this object, it will produce a heatmap, and not a scatter plot. However, if either eig = TRUE
, add = TRUE
or x.ret = TRUE
is specified, stats::cmdscale() will return a list instead of matrix. In these cases, ggfortify::autoplot
can deal with the output. Details on these arguments can be found in the docs (?cmdscale).
# ggfortify and eurodist are available
# Autoplot + ggplot2 tweaking
autoplot(eurodist) +
labs( x = "", y = "") +
coord_fixed() +
theme(axis.text.x = element_text(angle = 90, hjust =1, vjust = 0.5))
# Autoplot of MDS
autoplot(cmdscale(eurodist, eig = TRUE), label = TRUE, label.size =3, size = 0)
also check purrr
, Hadely has not used plyr
for long time. ref...twitter
library(plyr)
myplots<- dlplyr(mtcars, .(cyl), function(df){
ggplot(df, aes(mpg, wt)) +
geom_point() +
xlim(range(mtcars$mpg)) +
ylim(range(mtcars$wt)) +
ggtilte(paste(df$cyl[1], "cylinders"))})
# by position
myplots[[2]]
# by name
myplots[["4"]]
library(gridExtra)
grid.arrange(myplots[[1]], myplots[[2]], ncol = 2)
do.call(grid.arrange, myplots)
library(ggfortify)
# perform clustering
iris_k<- kmeans(iris[-5], center = 3)
# autplot: coloring according to cluster
autoplot(iris_k, data = iris, frame = TRUE)
# autoplot: coloring according to species
autoplot(iris_k, data = iris, frame = TRUE, col = "Species")
df1 %>% left_join(df2) %>% left_join(df3)....
library(purrr)
tables<- list(df1,df2,df3)
reduce(tables, left_join, by = "key")
list(more_artists, more_bands, supergroups) %>%
# Return rows of more_artists in all three datasets
reduce(semi_join, by = c("first", "last"))
read http://novyden.blogspot.com/2013/09/how-to-expand-color-palette-with-ggplot.html
colorRampPalette(brewer.pal(9, "Set1"))(26)
[1] "#E41A1C" "#AC3A4D" "#755A7F" "#3D7AB1" "#3D8B99" "#449B75" "#4BAB52" "#5F975F" "#77787B" "#8F5998"
[11] "#AC5782" "#CD674E" "#EE771A" "#FF9308" "#FFBC18" "#FFE528" "#F4EA31" "#D7B42E" "#BB7E2A" "#AC5934"
[21] "#C66764" "#E07494" "#F381BD" "#D589B1" "#B791A5" "#999999"
# Create a "balloon plot" as alternative to a heatmap with ggplot2
#
# January 2017
# Author: Markus Konrad <[email protected]>, WZB Berlin Social Science Center
library(dplyr)
library(tidyr)
library(ggplot2)
# define the variables that will be displayed in the columns
vars <- c('awake', 'sleep_total', 'sleep_rem')
# prepare the data: we use the "msleep" dataset which comes with ggplot2
df <- msleep[!is.na(msleep$vore), c('name', 'vore', vars)] %>% # only select the columns we need from the msleep dataset
group_by(vore) %>% sample_n(5) %>% ungroup() %>% # select 5 random rows from each "vore" group as subset
gather(key = variable, value = value, -name, -vore) %>% # make a long table format: gather columns in rows
filter(!is.na(value)) %>% # remove rows with NA-values -> those will be empty spots in the plot
arrange(vore, name) # order by vore and name
# add a "row" column which will be the y position in the plot: group by vore and name, then set "row" as group index
df <- df %>% mutate(row = group_indices_(df, .dots=c('vore', 'name')))
# add a "col" column which will be the x position in the plot: group by variable, then set "col" as group index
df <- df %>% mutate(col = group_indices_(df, .dots=c('variable')))
# get character vector of variable names for the x axis. the order is important, hence arrange(col)!
vars_x_axis <- c(df %>% arrange(col) %>% select(variable) %>% distinct())$variable
# get character vector of observation names for the y axis. again, the order is important but "df" is already ordered
names_y_axis <- c(df %>% group_by(row) %>% distinct(name) %>% ungroup() %>% select(name))$name
# now plot
# make color dependent on vore, size and alpha dependent on value
# x and y must be set as factor() otherwise scale_x/y_discrete() won't work
ggplot(df, aes(x=factor(col), y=factor(row), color=vore, size=value, alpha=value)) +
geom_point() + # plot as points
geom_text(aes(label=value, x=col + 0.25), alpha=1.0, size=3) + # display the value next to the "balloons"
scale_alpha_continuous(range=c(0.3, 0.7)) +
scale_size_area(max_size = 5) +
scale_x_discrete(breaks=1:length(vars_x_axis), labels=vars_x_axis, position='top') + # set the labels on the X axis
scale_y_discrete(breaks=1:length(names_y_axis), labels=names_y_axis) + # set the labels on the Y axis
theme_bw() +
theme(axis.line = element_blank(), # disable axis lines
axis.title = element_blank(), # disable axis titles
panel.border = element_blank(), # disable panel border
panel.grid.major.x = element_blank(), # disable lines in grid on X-axis
panel.grid.minor.x = element_blank()) # disable lines in grid on X-axis
df_list<- split(df, df$A)
sapply(names(df_list), function (x) write.table(df_list[[x]], file=paste(x, "txt", sep=".")))
files<- as.list(dir(".", pattern= ".tsv"))
## need to add the file name into a column
datlist <- lapply(mix.files, function(f) {
dat = read.table(f, header =T, sep ="\t", quote = "\"")
dat$sample = gsub(".tsv", "", f)
return(dat)
})
data<- do.call(rbind, datlist)
## or use dplyr: bind_rows(datlist, .id = "sample")
## if each file has a common column, e.g. RNAseq HTSeq counts for many samples, and you want to make a big dataframe with first column
## is the gene-id and columns of raw counts
CCLE_counts<- reduce(datlist, left_join, by = "GeneID")
or https://github.com/vsbuffalo/devnotes/wiki/Data-Analysis-Patterns by Vince Buffalo.
### example setup:
DIR <- 'path/to/data' # change to directory you can write files to.
# filenames to make example work:
files <- c('sampleA_rep01.tsv', 'sampleA_rep02.tsv','sampleB_rep01.tsv',
'sampleB_rep02.tsv', 'sampleC_rep01.tsv', 'sampleC_rep02.tsv')
# write test files for example (iris a bunch of times)
walk(files, ~ write_tsv(iris, file.path(DIR, .)))
### Pattern:
# grab all files programmatically:
input_files <- list.files(DIR,
pattern='sample.*\\.tsv', full.names=TRUE)
# data loading pattern:
all_data <- tibble(file=input_files) %>%
# read data in (note: in general, best to
# pass col_names and col_types to map)
mutate(data=map(file, read_tsv)) %>%
# get the file basename (no path); if
# your metadata is in the path, change accordingly!
mutate(basename=basename(file)) %>%
# extract out the metadata from the base filename
extract(basename, into=c('sample', 'rep'),
regex='sample([^_]+)_rep([^_]+)\\.tsv') %>%
unnest(data) # optional, depends on what you need.
or use purrr::map_df
f <- list.files(
"my_folder",
pattern = "*.csv",
full.names = TRUE)
d <- purrr::map_df(f, readr::read_csv, .id = "id")
Also check
?purrr::map_dfr and ?purrr::map_dfc
read http://stackoverflow.com/questions/41880796/grouped-multicolumn-gather-with-dplyr-tidyr-purrr
have
#> # A tibble: 4 × 8
#> gene sample genotype1 genotype2 genotype3 freq1 freq2 freq3
#> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 gX s1 AA AC CC 0.8 0.15 0.05
#> 2 gX s2 AA AC CC 0.9 0.10 0.00
#> 3 gY s1 GG GT TT 0.7 0.20 0.10
#> 4 gY s2 GG GT TT 0.6 0.35 0.05
to
want
#> # A tibble: 12 × 4
#> gene sample genotype freq
#> <chr> <chr> <chr> <dbl>
#> 1 gX s1 AA 0.80
#> 2 gX s1 AC 0.15
#> 3 gX s1 CC 0.05
#> 4 gX s2 AA 0.90
#> 5 gX s2 AC 0.10
#> 6 gX s2 CC 0.00
#> 7 gY s1 GG 0.70
#> 8 gY s1 GT 0.20
#> 9 gY s1 TT 0.10
#> 10 gY s2 GG 0.60
#> 11 gY s2 GT 0.35
#> 12 gY s2 TT 0.05
library(sjmisc)
to_long(have, keys = "genos", values = c("genotype", "freq"),
c("genotype1", "genotype2", "genotype3"),
c("freq1", "freq2", "freq3"))
## A tibble: 12 × 5
## gene sample genos genotype freq
## <chr> <chr> <chr> <chr> <dbl>
## 1 gX s1 genotype1 AA 0.80
## 2 gX s2 genotype1 AA 0.90
## 3 gY s1 genotype1 GG 0.70
## 4 gY s2 genotype1 GG 0.60
## 5 gX s1 genotype2 AC 0.15
## 6 gX s2 genotype2 AC 0.10
## 7 gY s1 genotype2 GT 0.20
## 8 gY s2 genotype2 GT 0.35
## 9 gX s1 genotype3 CC 0.05
## 10 gX s2 genotype3 CC 0.00
## 11 gY s1 genotype3 TT 0.10
## 12 gY s2 genotype3 TT 0.05
library(data.table)
melt(setDT(have), id = 1:2, measure = patterns("genotype", "freq"))
> iris %>% as_tibble()
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ... with 140 more rows
# convert columns to characters
>iris %>% as_tibble() %>% mutate_at(vars(Sepal.Length:Petal.Width), as.character) %>% head()
# A tibble: 6 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<chr> <chr> <chr> <chr> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
convert character columns back to double
iris %>% as_tibble() %>% mutate_at(vars(Sepal.Length:Petal.Width), as.character) %>% mutate_if(is.character, as.double)
# A tibble: 150 × 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fctr>
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
# ... with 140 more rows
# diff minus the previous number in sequence
> a<- c(1,2,5,7,9,14)
> diff(a)
[1] 1 3 2 2 5
## the long way
> a
[1] 1 2 5 7 9 14
> lag(a, 1)
[1] NA 1 2 5 7 9
> a - lag(a,1)
[1] NA 1 3 2 2 5
https://cran.r-project.org/web/packages/dplyr/vignettes/window-functions.html
A window function is a variation on an aggregation function. Where an aggregation function, like sum() and mean(), takes n inputs and return a single value, a window function returns n values. The output of a window function depends on all its input values, so window functions don’t include functions that work element-wise, like + or round(). Window functions include variations on aggregate functions, like cumsum() and cummean(), functions for ranking and ordering, like rank(), and functions for taking offsets, like lead() and lag().
select(flights, time_hour, air_time, everything())
see http://www.magesblog.com/2015/04/plotting-tables-alsongside-charts-in-r.html
# Create some sample data
CV_1 <- 0.2
CV_2 <- 0.3
Mean <- 65
sigma_1 <- sqrt(log(1 + CV_1^2))
mu_1 <- log(Mean) - sigma_1^2 / 2
sigma_2 <- sqrt(log(1 + CV_2^2))
mu_2 <- log(Mean) - sigma_2^2 / 2
q <- c(0.25, 0.5, 0.75, 0.9, 0.95)
SummaryTable <- data.frame(
Quantile=paste0(100*q,"%ile"),
Loss_1=round(qlnorm(q, mu_1, sigma_1),1),
Loss_2=round(qlnorm(q, mu_2, sigma_2),1)
)
# Create a plot
library(ggplot2)
plt <- ggplot(data.frame(x=c(20, 150)), aes(x)) +
stat_function(fun=function(x) dlnorm(x, mu_1, sigma_1),
aes(colour="CV_1")) +
stat_function(fun=function(x) dlnorm(x, mu_2, sigma_2),
aes(colour="CV_2")) +
scale_colour_discrete(name = "CV",
labels=c(expression(CV[1]), expression(CV[2]))) +
xlab("Loss") +
ylab("Density") +
ggtitle(paste0("Two log-normal distributions with same mean of ",
Mean,", but different CVs"))
# Create a table plot
library(gridExtra)
names(SummaryTable) <- c("Quantile",
expression(Loss(CV[1])),
expression(Loss(CV[2])))
# Set theme to allow for plotmath expressions
tt <- ttheme_default(colhead=list(fg_params = list(parse=TRUE)))
tbl <- tableGrob(SummaryTable, rows=NULL, theme=tt)
# Plot chart and table into one object
grid.arrange(plt, tbl,
nrow=2,
as.table=TRUE,
heights=c(3,1))
... %>%
select_if(~ !all(is.na(.)))
# OR equivalent
select_if(function(.) !all(is.na(.)))
janitor::remove_empty_cols()
df %>% replace(is.na(.), 0)
df %>% %>% mutate_all(coalesce, 0)
df %>% arrange(var1, var2) %>% mutate(my_rank = 1: n())
df %>% arrange(var1, var2) %>% mutate(my_rank = row_number())
https://twitter.com/robinson_es/status/953432465514876928
rlang::set_names() = purrr::set_names()
rlang::set_names(), tibble::rowid_to_column(), modelr::seq_range(), the .data pronoun, purrr::safely(), dplyr::pull(), stringr::str_replace_all() with a named vector
enframe, deframe, fct_reorder, fct_reorder2
https://stats.stackexchange.com/questions/8137/how-to-add-horizontal-lines-to-ggplot2-boxplot
bp <- ggplot(iris, aes(factor(Species), Sepal.Width, fill = Species)) + stat_boxplot(geom ='errorbar')
bp + geom_boxplot()
https://micahallen.org/blog-neuroconscience/
library(readr)
library(tidyr)
library(ggplot2)
library(Hmisc)
library(plyr)
library(RColorBrewer)
library(reshape2)
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
my_data<-read.csv(url("https://data.bris.ac.uk/datasets/112g2vkxomjoo1l26vjmvnlexj/2016.08.14_AnxietyPaper_Data%20Sheet.csv"))
head(X)
library(reshape2)
my_datal <- melt(my_data, id.vars = c("Participant"), measure.vars = c("AngerUH", "DisgustUH", "FearUH", "HappyUH"), variable.name = "EmotionCondition", value.name = "Sensitivity")
head(my_datal)
raincloud_theme = theme(
text = element_text(size = 10),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text = element_text(size = 14),
axis.text.x = element_text(angle = 45, vjust = 0.5),
legend.title=element_text(size=16),
legend.text=element_text(size=16),
legend.position = "right",
plot.title = element_text(lineheight=.8, face="bold", size = 16),
panel.border = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.line.x = element_line(colour = 'black', size=0.5, linetype='solid'),
axis.line.y = element_line(colour = 'black', size=0.5, linetype='solid'))
lb <- function(x) mean(x) - sd(x)
ub <- function(x) mean(x) + sd(x)
sumld<- ddply(my_datal, ~EmotionCondition, summarise, mean = mean(Sensitivity), median = median(Sensitivity), lower = lb(Sensitivity), upper = ub(Sensitivity))
head(sumld)
g <- ggplot(data = my_datal, aes(y = Sensitivity, x = EmotionCondition, fill = EmotionCondition)) +
geom_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .8) +
geom_point(aes(y = Sensitivity, color = EmotionCondition), position = position_jitter(width = .15), size = .5, alpha = 0.8) +
geom_boxplot(width = .1, guides = FALSE, outlier.shape = NA, alpha = 0.5) +
expand_limits(x = 5.25) +
guides(fill = FALSE) +
guides(color = FALSE) +
scale_color_brewer(palette = "Spectral") +
scale_fill_brewer(palette = "Spectral") +
# coord_flip() +
theme_bw() +
raincloud_theme
g
data<- data.frame(
x = 1:10,
y = rnorm(10),
name = c("Apple", "Banana", "Kiwi", "Orange", "Watermelon",
"Grapes", "Pear", "Cantelope", "Tomato", "Satusma")
)
my_data<- mutate(data, name_poor = case_when(
y < 0 ~ name,
TRUE ~ ""
))
ggplot(my_data, aes(x = x, y = y)) +
geom_point(size = 5) +
geom_text_repel(aes(label = name_poor), point.padding = 2)
library(tidyverse)
library(stringi)
n_patient = 2
n_samples = 3
n_readgroup = 4
n_mate = 2
df = data.frame(patient = rep(rep(LETTERS[1:n_patient], n_samples),2),
sample = rep(rep(seq(1:n_samples), each = n_patient),2),
readgroup = rep(stri_rand_strings(n_patient * n_samples * n_readgroup, 6, '[A-Z]'),2),
mate = rep(1:n_mate, each = n_patient * n_samples * n_readgroup)) %>%
mutate(file = sprintf("%s.%s.%s_%s", patient, sample, readgroup, mate)) %>%
arrange(file)
> head(df)
patient sample readgroup mate file
1 A 1 FCSDRJ 1 A.1.FCSDRJ_1
2 A 1 FCSDRJ 2 A.1.FCSDRJ_2
3 A 1 IAXDPR 1 A.1.IAXDPR_1
4 A 1 IAXDPR 2 A.1.IAXDPR_2
5 A 1 MLDBKZ 1 A.1.MLDBKZ_1
6 A 1 MLDBKZ 2 A.1.MLDBKZ_2
json2 <- df %>% nest(-(1:2),.key=readgroups) %>% nest(-1,.key=samples)
json3 <- df %>% nest(-(1:3),.key=mate) %>% nest(-(1:2),.key=readgroups) %>% nest(-1,.key=samples)
jsonlite::toJSON(json3,pretty=T)
# output
[
{
"patient": "A",
"samples": [
{
"sample": 1,
"readgroups": [
{
"readgroup": "FUPEYR",
"mate": [
{
"mate": 1,
"file": "A.1.FUPEYR_1"
},
{
"mate": 2,
"file": "A.1.FUPEYR_2"
}
...
And if necessary, generalize it:
vars <- names(df)[-1] # or whatever variables you want to nest, order matters!
var_pairs <- map((length(vars)-1):1,~vars[.x:(.x+1)])
json4 <- reduce(var_pairs,~{nm<-.y[1];nest(.x,.y,.key=!!enquo(nm))},.init=df)
jsonlite::toJSON(json4,pretty=T)
[
{
"patient": "A",
"sample": [
{
"sample": 1,
"readgroup": [
{
"readgroup": "FUPEYR",
"mate": [
{
"mate": 1,
"file": "A.1.FUPEYR_1"
},
{
"mate": 2,
"file": "A.1.FUPEYR_2"
}
...
https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R
#' Reorder an x or y axis within facets
#'
#' Reorder a column before plotting with faceting, such that the values are ordered
#' within each facet. This requires two functions: \code{reorder_within} applied to
#' the column, then either \code{scale_x_reordered} or \code{scale_y_reordered} added
#' to the plot.
#' This is implemented as a bit of a hack: it appends ___ and then the facet
#' at the end of each string.
#'
#' @param x Vector to reorder.
#' @param by Vector of the same length, to use for reordering.
#' @param within Vector of the same length that will later be used for faceting
#' @param fun Function to perform within each subset to determine the resulting
#' ordering. By default, mean.
#' @param sep Separator to distinguish the two. You may want to set this manually
#' if ___ can exist within one of your labels.
#' @param ... In \code{reorder_within} arguments passed on to \code{\link{reorder}}.
#' In the scale functions, extra arguments passed on to
#' \code{\link[ggplot2]{scale_x_discrete}} or \code{\link[ggplot2]{scale_y_discrete}}.
#'
#' @source "Ordering categories within ggplot2 Facets" by Tyler Rinker:
#' \url{https://trinkerrstuff.wordpress.com/2016/12/23/ordering-categories-within-ggplot2-facets/}
#'
#' @examples
#'
#' library(tidyr)
#' library(ggplot2)
#'
#' iris_gathered <- gather(iris, metric, value, -Species)
#'
#' # reordering doesn't work within each facet (see Sepal.Width):
#' ggplot(iris_gathered, aes(reorder(Species, value), value)) +
#' geom_boxplot() +
#' facet_wrap(~ metric)
#'
#' # reorder_within and scale_x_reordered work.
#' # (Note that you need to set scales = "free_x" in the facet)
#' ggplot(iris_gathered, aes(reorder_within(Species, value, metric), value)) +
#' geom_boxplot() +
#' scale_x_reordered() +
#' facet_wrap(~ metric, scales = "free_x")
#'
#' @export
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
new_x <- paste(x, within, sep = sep)
stats::reorder(new_x, by, FUN = fun)
}
#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}
#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., sep = "___") {
reg <- paste0(sep, ".+$")
ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}
library(tidyverse)
> test_scores<- data_frame(student = c("Amy", "Belle", "Candice"),
+ score= c("75-81-86","87-89-90","92-93-99"))
> test_scores
# A tibble: 3 x 2
student score
<chr> <chr>
1 Amy 75-81-86
2 Belle 87-89-90
3 Candice 92-93-99
> test_scores %>% separate(score, c("s1", "s2", "s3")) %>%
+ gather(key, score, -student) %>% select(-key)
# A tibble: 9 x 2
student score
<chr> <chr>
1 Amy 75
2 Belle 87
3 Candice 92
4 Amy 81
5 Belle 89
6 Candice 93
7 Amy 86
8 Belle 90
9 Candice 99
>
> separate_rows(test_scores, score)
# A tibble: 9 x 2
student score
<chr> <chr>
1 Amy 75
2 Amy 81
3 Amy 86
4 Belle 87
5 Belle 89
6 Belle 90
7 Candice 92
8 Candice 93
9 Candice 99
from https://twitter.com/tjmahr/status/1083094031826124800?s=12
library(ggplot2)
ggpreview <- function (..., device = "png") {
fname <- tempfile(fileext = paste0(".", device))
ggplot2::ggsave(filename = fname, device = device, ...)
system2("open", fname)
invisible(NULL)
}
g<- ggplot(mtcars, aes(x = hp, y = mpg)) + geom_point()
ggpreview(g, width = 5, height = 6, device = "pdf")
dplyr >= 0.8.0 see this post https://www.johnmackintosh.com/2019-02-28-first-look-at-mapping-and-splitting-in-dplyr/ and this tweethttps://twitter.com/coolbutuseless/status/1101447111978205184?s=12
(a) group_split() + walk() (b) group_by() + group_walk()
library(tidyverse)
> mtcars %>% group_split(cyl) %>% walk(~print(head(.x,2)))
# A tibble: 2 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
2 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
# A tibble: 2 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
# A tibble: 2 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
2 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## the cyl variable is not in the dataframe
> mtcars %>% group_by(cyl) %>% group_walk(~print(head(.x,2)))
# A tibble: 2 x 10
mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22.8 108 93 3.85 2.32 18.6 1 1 4 1
2 24.4 147. 62 3.69 3.19 20 1 0 4 2
# A tibble: 2 x 10
mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 160 110 3.9 2.62 16.5 0 1 4 4
2 21 160 110 3.9 2.88 17.0 0 1 4 4
# A tibble: 2 x 10
mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 18.7 360 175 3.15 3.44 17.0 0 0 3 2
2 14.3 360 245 3.21 3.57 15.8 0 0 3 4
https://milesmcbain.xyz/hacking-r-library-paths/
set_lib_paths <- function(lib_vec) {
lib_vec <- normalizePath(lib_vec, mustWork = TRUE)
shim_fun <- .libPaths
shim_env <- new.env(parent = environment(shim_fun))
shim_env$.Library <- character()
shim_env$.Library.site <- character()
environment(shim_fun) <- shim_env
shim_fun(lib_vec)
}
> .libPaths()
[1] "/home/miles/R/x86_64-pc-linux-gnu-library/3.6"
[2] "/usr/local/lib/R/site-library"
[3] "/usr/lib/R/site-library"
[4] "/usr/lib/R/library"
> set_lib_paths("~/code/library")
> .libPaths()
[1] "/home/miles/code/library"
https://jennybc.github.io/purrr-tutorial/ls12_different-sized-samples.html
iris %>%
group_by(Species) %>%
nest() %>%
mutate(n = c(2, 5, 3)) %>%
mutate(samp = map2(data, n, sample_n)) %>%
select(Species, samp) %>%
unnest()
#> # A tibble: 10 x 5
#> Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 setosa 5.4 3.4 1.7 0.2
#> 2 setosa 5.5 3.5 1.3 0.2
#> 3 versicolor 6.6 2.9 4.6 1.3
#> 4 versicolor 6.9 3.1 4.9 1.5
#> 5 versicolor 5.8 2.7 3.9 1.2
#> 6 versicolor 6 2.7 5.1 1.6
#> 7 versicolor 6.2 2.9 4.3 1.3
#> 8 virginica 6.4 3.2 5.3 2.3
#> 9 virginica 6.5 3 5.5 1.8
#> 10 virginica 6.1 3 4.9 1.8
also check dplyr::sample_n()
and dplyr::sample_frac()
https://juliasilge.com/blog/reorder-within/
library(tidyverse)
library(babynames)
top_names <- babynames %>%
filter(year >= 1950,
year < 1990) %>%
mutate(decade = (year %/% 10) * 10) %>%
group_by(decade) %>%
count(name, wt = n, sort = TRUE) %>%
ungroup
top_names
top_names %>%
group_by(decade) %>%
top_n(15) %>%
ungroup %>%
mutate(decade = as.factor(decade),
name = reorder_within(name, n, decade)) %>%
ggplot(aes(name, n, fill = decade)) +
geom_col(show.legend = FALSE) +
facet_wrap(~decade, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(expand = c(0,0)) +
labs(y = "Number of babies per decade",
x = NULL,
title = "What were the most common baby names in each decade?",
subtitle = "Via US Social Security Administration")
https://indrajeetpatil.github.io/pairwiseComparisons/ and https://cran.r-project.org/web/packages/ggsignif/vignettes/intro.html
library(pairwiseComparisons)
library(ggsignif)
library(ggplot2)
mtcars$cyl<- as.factor(mtcars$cyl)
df<- pairwise_comparisons(mtcars, cyl, wt, type = "parametric") %>%
dplyr::mutate(.data = ., groups = purrr::pmap(.l = list(group1, group2), .f = c)) %>%
dplyr::arrange(.data = . , group1)
p<- ggplot(mtcars, aes(cyl, wt)) +geom_boxplot()
p + ggsignif::geom_signif(
comparisons = df$groups,
map_signif_level = TRUE,
y_position = c(5.5,5.75,6),
annotations = df$label,
test = NULL,
na.rm = TRUE,
parse = TRUE
)
Thanks Shila Ghazanfar for the tip!
library(ggplot2)
library(patchwork)
df = data.frame(x = c("yes", "no", "maybe"))
g1 = ggplot(df, aes(x = x, fill = x)) + geom_bar()
g2 = g1 + scale_fill_discrete(limits = rev(levels(df$x)))
g1 + g2
https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
## 4 colors
n = 4
cols = gg_color_hue(n)
> library(tidyr)
> library(dplyr)
> mydf
V1 V2
2 1 a,b,c
3 2 a,c
4 3 b,d
5 4 e,f
6 . .
> mydf %>%
mutate(V2 = strsplit(as.character(V2), ",")) %>%
unnest(V2)
V1 V2
1 1 a
2 1 b
3 1 c
4 2 a
5 2 c
6 3 b
7 3 d
8 4 e
9 4 f
or use seperate_rows
:
> head(mydf)
geneid chrom start end strand length gene_count
ENSG00000223972.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 11869;12010;12179;12613;12613;12975;13221;13221;13453 12227;12057;12227;12721;12697;13052;13374;14409;13670 +;+;+;+;+;+;+;+;+ 1735 11
ENSG00000227232.5 chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1 14404;15005;15796;16607;16858;17233;17606;17915;18268;24738;29534 14501;15038;15947;16765;17055;17368;17742;18061;18366;24891;29570 -;-;-;-;-;-;-;-;-;-;- 1351 380
ENSG00000278267.1 chr1 17369 17436 - 68 14
ENSG00000243485.4 chr1;chr1;chr1;chr1;chr1 29554;30267;30564;30976;30976 30039;30667;30667;31097;31109 +;+;+;+;+ 1021 22
ENSG00000237613.2 chr1;chr1;chr1 34554;35277;35721 35174;35481;36081 -;-;- 1187 24
ENSG00000268020.3 chr1 52473 53312 + 840 14
> mydf %>% separate_rows(strand, chrom, gene_start, gene_end)
geneid length gene_count strand chrom start end
ENSG00000223972.5 1735 11 + chr1 11869 12227
ENSG00000223972.5 1735 11 + chr1 12010 12057
ENSG00000223972.5 1735 11 + chr1 12179 12227
ENSG00000223972.5 1735 11 + chr1 12613 12721
ENSG00000223972.5 1735 11 + chr1 12613 12697
ENSG00000223972.5 1735 11 + chr1 12975 13052
ENSG00000223972.5 1735 11 + chr1 13221 13374
ENSG00000223972.5 1735 11 + chr1 13221 14409
ENSG00000223972.5 1735 11 + chr1 13453 13670
ENSG00000227232.5 1351 380 - chr1 14404 14501
ENSG00000227232.5 1351 380 - chr1 15005 15038
ENSG00000227232.5 1351 380 - chr1 15796 15947
ENSG00000227232.5 1351 380 - chr1 16607 16765
ENSG00000227232.5 1351 380 - chr1 16858 17055
ENSG00000227232.5 1351 380 - chr1 17233 17368
ENSG00000227232.5 1351 380 - chr1 17606 17742
ENSG00000227232.5 1351 380 - chr1 17915 18061
ENSG00000227232.5 1351 380 - chr1 18268 18366
ENSG00000227232.5 1351 380 - chr1 24738 24891
ENSG00000227232.5 1351 380 - chr1 29534 29570
ENSG00000278267.1 68 5 - chr1 17369 17436
ENSG00000243485.4 1021 8 + chr1 29554 30039
ENSG00000243485.4 1021 8 + chr1 30267 30667
ENSG00000243485.4 1021 8 + chr1 30564 30667
ENSG00000243485.4 1021 8 + chr1 30976 31097
ENSG00000243485.4 1021 8 + chr1 30976 31109
ENSG00000237613.2 1187 24 - chr1 34554 35174
ENSG00000237613.2 1187 24 - chr1 35277 35481
ENSG00000237613.2 1187 24 - chr1 35721 36081
ENSG00000268020.3 840 0 + chr1 52473 53312
If you concern about the speed see data.table
solutions.
https://stackoverflow.com/questions/13773770/split-comma-separated-strings-in-a-column-into-separate-rows
library(data.table)
# method 1 (preferred)
setDT(v)[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed=TRUE))), by = AB
][!is.na(director)]
# method 2
setDT(v)[, strsplit(as.character(director), ",", fixed=TRUE), by = .(AB, director)
][,.(director = V1, AB)]
or even base R
# if 'director' is a character-column:
stack(setNames(strsplit(df$director,','), df$AB))
# if 'director' is a factor-column:
stack(setNames(strsplit(as.character(df$director),','), df$AB))
library(tidyverse) #dplyr version >=0.8.99.9000
world_total_pop<- world_bank_pop %>%
filter(indicator == "SP.POP.TOTL")
head(world_total_pop)
country indicator `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007`
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ABW SP.POP.T… 9.09e4 9.29e4 9.50e4 9.70e4 9.87e4 1.00e5 1.01e5 1.01e5
2 AFG SP.POP.T… 2.01e7 2.10e7 2.20e7 2.31e7 2.41e7 2.51e7 2.59e7 2.66e7
3 AGO SP.POP.T… 1.64e7 1.70e7 1.76e7 1.82e7 1.89e7 1.96e7 2.03e7 2.10e7
....
## calculate the mean of 2000 to 2007
tidyr_way<- world_total_pop %>%
pivot_longer(starts_with("20")) %>%
group_by(country) %>%
mutate(mean = mean(value, na.rm = TRUE)) %>%
pivot_wider(names_from = name)
purrr_across_way<- world_total_pop %>%
mutate(mean = pmap_dbl(across(starts_with("20")),
~mean(c(...), na.rm = TRUE)))
# or https://github.com/jennybc/row-oriented-workflows/blob/master/ex09_row-summaries.md
purrr_across_way<- world_total_pop %>%
mutate(mean = pmap_dbl(select(., starts_with("20")),
~mean(c(...), na.rm = TRUE)))
rowwise_flat_way<- world_total_pop %>%
rowwise() %>%
mutate(mean = mean(flatten_dbl(across(starts_with("20"))), na.rm =TRUE))
tidybase_way<- world_total_pop %>%
mutate(mean=rowMeans(across(starts_with("20")), na.rm = TRUE))
check https://github.com/jennybc/row-oriented-workflows as well. https://github.com/jennybc/row-oriented-workflows/blob/master/ex09_row-summaries.md
https://tladeras.shinyapps.io/learning_rowwise/
Manual facets: that base-R layout()
goodness coming to ggplot2: https://teunbrand.github.io/ggh4x/articles/Facets.html#manual-facets-1