Rで統計的仮説検定:分散分析:二元配置分散分析:アンバランスケース

Rでデータサイエンス

分散分析:二元配置分散分析:アンバランスケース

サンプル

library(dplyr)
seed <- 20230906
set.seed(seed = seed)
A <- c(rep("a1", 8), rep("a2", 5), rep("a3", 7))
B <- c(rep(c("b1", "b2"), 10))
m_A <- ifelse(A == "a1", 10, ifelse(A == "a2", 13, 15))
m_B <- ifelse(B == "b1", 3, 5)
m <- m_A * m_B + rnorm(length(A))
y <- rnorm(n = length(m), mean = m)
(df0 <- data.frame(A, B, y))
    A  B        y
1  a1 b1 29.91343
2  a1 b2 50.00045
3  a1 b1 29.06207
4  a1 b2 50.84555
5  a1 b1 28.91922
6  a1 b2 50.96287
7  a1 b1 28.27495
8  a1 b2 49.98828
9  a2 b1 39.42510
10 a2 b2 64.91906
11 a2 b1 38.82944
12 a2 b2 65.18426
13 a2 b1 35.91582
14 a3 b2 74.30268
15 a3 b1 48.01233
16 a3 b2 75.39627
17 a3 b1 45.92087
18 a3 b2 76.64767
19 a3 b1 43.91362
20 a3 b2 73.68897
# 各群のサンプルサイズ
paste0(A, B) %>%
    table()
# アンバランス
.
a1b1 a1b2 a2b1 a2b2 a3b1 a3b2 
   4    4    3    2    3    4 

分散分析表の作成

library(knitr)
library(kableExtra)
options(knitr.kable.NA = "")
anova_tb <- matrix(nrow = 5, ncol = 5)  # 分散分析表
colnames(anova_tb) <- c("偏差平方和", "自由度", "分散", "分散比(F値)", "p値")
row.names(anova_tb) <- c("全体変動", "因子A", "因子B", "交互作用:A×B", "誤差変動")
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動
因子A
因子B
交互作用:A×B
誤差変動

計画行列の作成

(mat_des <- model.matrix(y ~ A * B, data = df0))
   (Intercept) Aa2 Aa3 Bb2 Aa2:Bb2 Aa3:Bb2
1            1   0   0   0       0       0
2            1   0   0   1       0       0
3            1   0   0   0       0       0
4            1   0   0   1       0       0
5            1   0   0   0       0       0
6            1   0   0   1       0       0
7            1   0   0   0       0       0
8            1   0   0   1       0       0
9            1   1   0   0       0       0
10           1   1   0   1       1       0
11           1   1   0   0       0       0
12           1   1   0   1       1       0
13           1   1   0   0       0       0
14           1   0   1   1       0       1
15           1   0   1   0       0       0
16           1   0   1   1       0       1
17           1   0   1   0       0       0
18           1   0   1   1       0       1
19           1   0   1   0       0       0
20           1   0   1   1       0       1
attr(,"assign")
[1] 0 1 1 2 3 3
attr(,"contrasts")
attr(,"contrasts")$A
[1] "contr.treatment"

attr(,"contrasts")$B
[1] "contr.treatment"

拡大係数行列の作成

(mat_aug <- cbind(mat_des, y = df0$y))
   (Intercept) Aa2 Aa3 Bb2 Aa2:Bb2 Aa3:Bb2        y
1            1   0   0   0       0       0 29.91343
2            1   0   0   1       0       0 50.00045
3            1   0   0   0       0       0 29.06207
4            1   0   0   1       0       0 50.84555
5            1   0   0   0       0       0 28.91922
6            1   0   0   1       0       0 50.96287
7            1   0   0   0       0       0 28.27495
8            1   0   0   1       0       0 49.98828
9            1   1   0   0       0       0 39.42510
10           1   1   0   1       1       0 64.91906
11           1   1   0   0       0       0 38.82944
12           1   1   0   1       1       0 65.18426
13           1   1   0   0       0       0 35.91582
14           1   0   1   1       0       1 74.30268
15           1   0   1   0       0       0 48.01233
16           1   0   1   1       0       1 75.39627
17           1   0   1   0       0       0 45.92087
18           1   0   1   1       0       1 76.64767
19           1   0   1   0       0       0 43.91362
20           1   0   1   1       0       1 73.68897

積和行列の作成

(mat_pro <- t(mat_aug) %*% mat_aug)
            (Intercept)      Aa2      Aa3      Bb2  Aa2:Bb2  Aa3:Bb2          y
(Intercept)      20.000   5.0000   7.0000  10.0000   2.0000   4.0000  1000.1229
Aa2               5.000   5.0000   0.0000   2.0000   2.0000   0.0000   244.2737
Aa3               7.000   0.0000   7.0000   4.0000   0.0000   4.0000   437.8824
Bb2              10.000   2.0000   4.0000  10.0000   2.0000   4.0000   631.9361
Aa2:Bb2           2.000   2.0000   0.0000   2.0000   2.0000   0.0000   130.1033
Aa3:Bb2           4.000   0.0000   4.0000   4.0000   0.0000   4.0000   300.0356
y              1000.123 244.2737 437.8824 631.9361 130.1033 300.0356 55224.7837

平方和の算出

n <- dim(mat_pro) %>%
    unique
(total_sum_of_squares <- mat_pro[n, n])  # 観測値の2乗和と同じ。
[1] 55224.78
# 観測値の2乗和。
sum((df0$y)^2)
[1] 55224.78

切片の平方和

  • 平均調整総平方和
obj_row <- mat_pro %>%
    row.names %>%
    grep("Intercept", ., invert = F)
observations <- mat_pro[obj_row, n]
(rss1 <- total_sum_of_squares - lm.fit(x = mat_pro[obj_row, obj_row, drop = F], y = observations)$coef * observations)
(Intercept) 
    5212.49 
anova_tb[1, 1] <- rss1
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49
因子A
因子B
交互作用:A×B
誤差変動

要因Aの平方和

要因Aと要因Aとの交互作用を除いたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep("A|y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss2a <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 1734.308
前項のモデルに要因Aを加えたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep("A.+:|y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss2b <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 80.66119
anova_tb[2, 1] <- rss2a - rss2b
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.490
因子A 1653.647
因子B
交互作用:A×B
誤差変動

要因Bの平方和

要因Bと要因Bとの交互作用を除いたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep("B|y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss3a <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 3261.421
前項のモデルに要因Bを加えたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep(":B|y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss3b <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 80.66119
anova_tb[3, 1] <- rss3a - rss3b
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.490
因子A 1653.647
因子B 3180.760
交互作用:A×B
誤差変動

要因Aと要因Bの交互作用の平方和

要因Aと要因Bの交互作用を除いたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep(":|y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss4a <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 80.66119
前項のモデルに要因A×要因Bを加えたモデル
obj_row <- mat_pro %>%
    row.names %>%
    grep("y", ., invert = T)
observations <- mat_pro[obj_row, n]
(rss4b <- total_sum_of_squares - sum(lm.fit(x = mat_pro[obj_row, obj_row], y = observations)$coef * observations))
[1] 22.76362
anova_tb[4, 1] <- rss4a - rss4b
anova_tb[5, 1] <- rss4b
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49047
因子A 1653.64705
因子B 3180.76024
交互作用:A×B 57.89756
誤差変動 22.76362
# 平均調整総平方和とは一致しない。
anova_tb[2:5, 1] %>%
    sum()

[1] 4915.068

自由度

anova_tb[1, 2] <- nrow(df0) - 1
anova_tb[2, 2] <- A %>%
    unique() %>%
    {
        length(.) - 1
    }
anova_tb[3, 2] <- B %>%
    unique() %>%
    {
        length(.) - 1
    }
anova_tb[4, 2] <- anova_tb[2, 2] * anova_tb[3, 2]
anova_tb[5, 2] <- anova_tb[1, 2] - sum(anova_tb[2:4, 2])
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49047 19
因子A 1653.64705 2
因子B 3180.76024 1
交互作用:A×B 57.89756 2
誤差変動 22.76362 14

不変分散

anova_tb[2, 3] <- anova_tb[2, 1]/anova_tb[2, 2]
anova_tb[3, 3] <- anova_tb[3, 1]/anova_tb[3, 2]
anova_tb[4, 3] <- anova_tb[4, 1]/anova_tb[4, 2]
anova_tb[5, 3] <- anova_tb[5, 1]/anova_tb[5, 2]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49047 19
因子A 1653.64705 2 826.823524
因子B 3180.76024 1 3180.760237
交互作用:A×B 57.89756 2 28.948782
誤差変動 22.76362 14 1.625973

分散比

anova_tb[2, 4] <- anova_tb[2, 3]/anova_tb[5, 3]
anova_tb[3, 4] <- anova_tb[3, 3]/anova_tb[5, 3]
anova_tb[4, 4] <- anova_tb[4, 3]/anova_tb[5, 3]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49047 19
因子A 1653.64705 2 826.823524 508.50996
因子B 3180.76024 1 3180.760237 1956.21944
交互作用:A×B 57.89756 2 28.948782 17.80397
誤差変動 22.76362 14 1.625973

p値

anova_tb[2, 5] <- pf(q = anova_tb[2, 4], df1 = anova_tb[2, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb[3, 5] <- pf(q = anova_tb[3, 4], df1 = anova_tb[3, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb[4, 5] <- pf(q = anova_tb[4, 4], df1 = anova_tb[4, 2], df2 = anova_tb[5, 2], lower.tail = FALSE)
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値) p値
全体変動 5212.49047 19
因子A 1653.64705 2 826.823524 508.50996 0.0000000
因子B 3180.76024 1 3180.760237 1956.21944 0.0000000
交互作用:A×B 57.89756 2 28.948782 17.80397 0.0001426
誤差変動 22.76362 14 1.625973

(参考) Anova {car}による結果

car::Anova(lm(formula = y ~ A * B, data = df0), type = 2)
Anova Table (Type II tests)

Response: y
          Sum Sq Df  F value                Pr(>F)    
A         1653.6  2  508.510   0.00000000000008512 ***
B         3180.8  1 1956.219 < 0.00000000000000022 ***
A:B         57.9  2   17.804             0.0001426 ***
Residuals   22.8 14                                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(lm(formula = y ~ B * A, data = df0), type = 2)
Anova Table (Type II tests)

Response: y
          Sum Sq Df  F value                Pr(>F)    
B         3180.8  1 1956.219 < 0.00000000000000022 ***
A         1653.6  2  508.510   0.00000000000008512 ***
B:A         57.9  2   17.804             0.0001426 ***
Residuals   22.8 14                                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

参考引用資料

最終更新

Sys.time()
[1] "2024-04-11 15:18:54 JST"

R、Quarto、Package

R.Version()$version.string
[1] "R version 4.3.3 (2024-02-29 ucrt)"
quarto::quarto_version()
[1] '1.4.542'
packageVersion(pkg = "tidyverse")
[1] '2.0.0'

著者