I have a data frame as below:
> dfnew
C1 C2 C3 C4 C5 C6
1 A A G A G A
2 A T T T G G
3 T A G A T A
4 C A A A A G
5 C A T T T C
6 C A A A T A
7 T C T G A A
8 G A G C T A
9 C T A T G A
10 G A A A G G
11 G G T T T A
12 G A C T T A
13 T T C T T T
14 A T A G C T
15 A C A A A A
16 A A C A A A
17 T G G A A T
18 A A A A G T
19 G T G G <NA> <NA>
I want to get the result below (the count of each letter in each column, C1 to C6) in one line of R code, without looping:
A 6 10 7 9 5 10
C 4 2 3 1 1 1
G 5 2 5 3 5 3
T 4 5 4 6 7 4
We can use `sapply` to loop over the columns, convert each one to a `factor` with its `levels` specified, and get the frequencies with `table`:
# Turn every column into a factor over the four letters, then tabulate each
# one; fixing the levels keeps a letter that is absent from a column as 0.
sapply(lapply(dfnew, factor, levels = c("A", "C", "G", "T")), table)
Or, using the tidyverse:
library(dplyr)
library(tidyr)
# Reshape to long form (one row per cell, NA cells dropped), count every
# column/letter pair, then spread the columns back out. Pairs that never
# occur are filled with 0 rather than NA, as a frequency table should be.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
dfnew %>%
  pivot_longer(everything(), names_to = "key", values_to = "val",
               values_drop_na = TRUE) %>%
  count(key, val) %>%
  pivot_wider(names_from = key, values_from = n, values_fill = 0L)
If you use `stack` to reshape everything to long form, you can call `table` on the result:
# Example data: one string of letters per column. C5 and C6 hold only 18
# letters, so every column is padded with NA up to the common length of 19.
letters_by_col <- c(
  C1 = "AATCCCTGCGGGTAAATAG",
  C2 = "ATAAAACATAGATTCAGAT",
  C3 = "GTGATATGAATCCAACGAG",
  C4 = "ATAATAGCTATTTGAAAAG",
  C5 = "GGTATTATGGTTTCAAAG",
  C6 = "AGAGCAAAAGAATTAATT"
)
# Split each string into single characters; `length<-` appends the NA padding.
col_values <- lapply(strsplit(letters_by_col, "", fixed = TRUE), `length<-`, 19L)
dfnew <- data.frame(col_values, stringsAsFactors = FALSE)

# stack() yields a long data frame (values, ind); table() cross-tabulates the
# two columns and leaves the NA cells out of the counts.
table(stack(dfnew))
#> ind
#> values C1 C2 C3 C4 C5 C6
#> A 6 10 7 9 5 10
#> C 4 2 3 1 1 1
#> G 5 2 5 3 5 3
#> T 4 5 4 6 7 4
Using `data.table` and its chained ("pipe") workflow with `[`:
library(data.table)
# Parse the example table from inline text. The last row carries two NA
# fields, which are dropped again by na.rm = TRUE below.
tab <- fread("
C1 C2 C3 C4 C5 C6
A A G A G A
A T T T G G
T A G A T A
C A A A A G
C A T T T C
C A A A T A
T C T G A A
G A G C T A
C T A T G A
G A A A G G
G G T T T A
G A C T T A
T T C T T T
A T A G C T
A C A A A A
A A C A A A
T G G A A T
A A A A G T
G T G G NA NA")
# Step 1: melt to long form (columns `variable` and `value`), dropping NAs.
# Step 2: cast back to wide form, counting the rows per letter/column pair.
# `fun.aggregate` and `value.var` are spelled out in full instead of relying
# on partial argument matching (`fun =`) and on guessing the value column.
tab[, melt(.SD, measure.vars = paste0("C", 1:6), na.rm = TRUE)][
  , dcast(.SD, value ~ variable, fun.aggregate = length, value.var = "value",
          drop = TRUE)
]
#> value C1 C2 C3 C4 C5 C6
#> 1: A 6 10 7 9 5 10
#> 2: C 4 2 3 1 1 1
#> 3: G 5 2 5 3 5 3
#> 4: T 4 5 4 6 7 4