Reshape Package in R

Reshape package is powerful tool for reshaping data and aggregation. Nice tutorial can be found at:
http://www.jstatsoft.org/v21/i12/paper

> require(reshape)

We’re going to use french_fries. But as it’s big, get some samples.

> ff <- french_fries[sample(nrow(french_fries), 20),]
> ff
    time treatment subject rep potato buttery grassy rancid painty
303    5         2      15   1    4.1     0.2    0.2    0.8    0.0
142    2         3      78   1    2.7     1.9    0.0    1.2    1.3
339    5         2      15   2    1.5     0.3    0.4    4.7    0.4
413    6         2      19   2   11.1     1.5    0.0   11.5   12.2
153    3         3      63   2    9.1     0.0    0.0    6.7    0.0
456    7         2      86   1    1.2     0.0    0.0   10.1    7.3
691   10         3      31   2   10.4     0.0    0.0    5.7    0.0
213    3         1      63   2   10.8     0.0    0.0    0.8    0.0
403    6         1      51   2    8.5     2.5    2.1    7.9    4.5
351    5         3      15   2    3.6     0.2    0.4    4.6    0.4
684   10         2      63   2    9.0     0.0    0.0    6.7   10.7
670   10         1      19   2   11.8     3.2    2.7    8.6    3.6
55     1         2      51   1   11.7     4.1    4.3    3.3    3.9
623    9         2      52   2    1.6     0.0    0.0    5.0    8.5
34     1         1      78   2    8.8     0.6    3.0    0.5    1.1
204    3         2      86   2    5.2     3.0    2.2    4.0    0.0
198    3         2      31   2   10.0     0.4    0.0    9.1    8.2
465    7         3      63   1    9.5     0.0    0.0    1.4    0.0
652   10         2      51   1   11.3     5.3    0.8    0.0    2.5
423    6         3      15   2    4.3     0.2    0.3    0.6    0.5

Melt it into id and variables.

> mff <- ff(id=1:4, ff)
> mff
    time treatment subject rep variable value
1      5         2      15   1   potato   4.1
2      2         3      78   1   potato   2.7
3      5         2      15   2   potato   1.5
4      6         2      19   2   potato  11.1
5      3         3      63   2   potato   9.1
6      7         2      86   1   potato   1.2
7     10         3      31   2   potato  10.4
8      3         1      63   2   potato  10.8
9      6         1      51   2   potato   8.5
10     5         3      15   2   potato   3.6
11    10         2      63   2   potato   9.0
12    10         1      19   2   potato  11.8
13     1         2      51   1   potato  11.7
14     9         2      52   2   potato   1.6
15     1         1      78   2   potato   8.8
16     3         2      86   2   potato   5.2
17     3         2      31   2   potato  10.0
18     7         3      63   1   potato   9.5
19    10         2      51   1   potato  11.3
20     6         3      15   2   potato   4.3
21     5         2      15   1  buttery   0.2
22     2         3      78   1  buttery   1.9
23     5         2      15   2  buttery   0.3
24     6         2      19   2  buttery   1.5
25     3         3      63   2  buttery   0.0
26     7         2      86   1  buttery   0.0
... omitted ...

Using cast, reshape the data to express (variable, repetition) as separate variables. In the formula, ‘…’ represents all the other variables that’s not specified in the formula.

> cast(mff, ... ~ variable + rep)
   time treatment subject potato_1 potato_2 buttery_1 buttery_2 grassy_1
1     1         1      78       NA      8.8        NA       0.6       NA
2     1         2      51     11.7       NA       4.1        NA      4.3
3     2         3      78      2.7       NA       1.9        NA      0.0
4     3         1      63       NA     10.8        NA       0.0       NA
5     3         2      31       NA     10.0        NA       0.4       NA
6     3         2      86       NA      5.2        NA       3.0       NA
7     3         3      63       NA      9.1        NA       0.0       NA
8     5         2      15      4.1      1.5       0.2       0.3      0.2
9     5         3      15       NA      3.6        NA       0.2       NA
10    6         1      51       NA      8.5        NA       2.5       NA
11    6         2      19       NA     11.1        NA       1.5       NA
12    6         3      15       NA      4.3        NA       0.2       NA
13    7         2      86      1.2       NA       0.0        NA      0.0
14    7         3      63      9.5       NA       0.0        NA      0.0
15    9         2      52       NA      1.6        NA       0.0       NA
16   10         1      19       NA     11.8        NA       3.2       NA
17   10         2      51     11.3       NA       5.3        NA      0.8
18   10         2      63       NA      9.0        NA       0.0       NA
19   10         3      31       NA     10.4        NA       0.0       NA
   grassy_2 rancid_1 rancid_2 painty_1 painty_2
1       3.0       NA      0.5       NA      1.1
2        NA      3.3       NA      3.9       NA
3        NA      1.2       NA      1.3       NA
4       0.0       NA      0.8       NA      0.0
5       0.0       NA      9.1       NA      8.2
6       2.2       NA      4.0       NA      0.0
7       0.0       NA      6.7       NA      0.0
8       0.4      0.8      4.7      0.0      0.4
9       0.4       NA      4.6       NA      0.4
10      2.1       NA      7.9       NA      4.5
11      0.0       NA     11.5       NA     12.2
12      0.3       NA      0.6       NA      0.5
13       NA     10.1       NA      7.3       NA
14       NA      1.4       NA      0.0       NA
15      0.0       NA      5.0       NA      8.5
16      2.7       NA      8.6       NA      3.6
17       NA      0.0       NA      2.5       NA
18      0.0       NA      6.7       NA     10.7
19      0.0       NA      5.7       NA      0.0

By having less than required variables in the forumla, we get aggregate. In the below, we have 20 data for treatment==1.

> cast(mff, treatment ~ .)
Aggregation requires fun.aggregate: length used as default
  treatment (all)
1         1    20
2         2    50
3         3    30 
> nrow(subset(mff, treatment==1))
[1] 20

Getting sum of values.

> cast(mff, treatment ~ ., sum)
  treatment (all)
1         1  81.0
2         2 198.3
3         3  65.0 

Cross tabulation.

> d <- data.frame(sex=c('male', 'female', 'female'),
+                 pregnant=c('no', 'no', 'yes'),
+                 value=c(10, 14, 4))
> d$sex <- factor(d$sex)
> d$pregnant <- factor(d$pregnant)
> d
     sex pregnant value
1   male       no    10
2 female       no    14
3 female      yes     4
> cast(d, sex ~ pregnant, sum)
     sex no yes
1 female 14   4
2   male 10   0
> xtabs(value ~ sex + pregnant, d)

Getting margins.

> cast(d, sex ~ pregnant, sum, margins=TRUE)
     sex no yes (all)
1 female 14   4    18
2   male 10   0    10
3  (all) 24   4    28

Figuring out missing values is done with length aggregation (which is default aggregation function of cast). In the data ‘d’, see that sex==male && pregnant==yes is missing.

> d
     sex pregnant value
1   male       no    10
2 female       no    14
3 female      yes     4
> cast(d, sex ~ pregnant)
     sex no yes
1 female 14   4
2   male 10  NA

Similar Posts:

Post a Comment

Your email is never published nor shared. Required fields are marked *