Rで統計的仮説検定:分散分析:一元配置分散分析

Rでデータサイエンス

分散分析:一元配置分散分析

検定のコンセプト

  • 3群以上のそれぞれの群の母平均の差の検定。
  • \(H_0\):各群の母平均は等しい。
  • それぞれの群が「正規分布に従い」かつ「母分散が等しい」場合に帰無仮説の下で分散比はF分布に従う。
  • 全体平方和 = 群間平方和 + 群内平方和
  • 郡内平方和:各群の偏差平方和の和。
  • 群間平方和:各群の平均値の全体平均からの偏差平方和。
  • 「データ全体のバラツキを群間のバラツキと群内のバラツキに分解し、群間のバラツキが群内のバラツキに比べて有意に大きければ群間の平均値に差があると導きます」(https://bellcurve.jp/statistics/blog/15556.html)

サンプル:母平均が異なる場合

library(dplyr)
library(knitr)
library(kableExtra)
seed <- 20230823
set.seed(seed = seed)
# 各群のサンプルサイズ
n1 <- 15
n2 <- 20
n3 <- 30
# 各群の母平均値
m1 <- 5
m2 <- 1
m3 <- 2
# 各群の母分散(標準偏差)
sd1 <- sd2 <- sd3 <- 1
# 各群の変数
g1 <- rnorm(n = n1, mean = m1, sd = sd1)
g2 <- rnorm(n = n2, mean = m2, sd = sd2)
g3 <- rnorm(n = n3, mean = m3, sd = sd3)
(df <- data.frame(group = rep(x = c("g1", "g2", "g3"), times = c(n1, n2, n3)), value = c(g1, g2, g3)))
   group      value
1     g1  5.3684270
2     g1  4.6179074
3     g1  4.9877861
4     g1  1.5018181
5     g1  6.5867877
6     g1  4.6096702
7     g1  4.6906882
8     g1  5.4528394
9     g1  5.8444347
10    g1  6.9562095
11    g1  3.7810045
12    g1  4.8595054
13    g1  5.3964281
14    g1  5.2214576
15    g1  4.7497939
16    g2  2.0485258
17    g2  2.2013865
18    g2  0.3297786
19    g2 -0.1177708
20    g2  2.2049402
21    g2 -0.1269150
22    g2  1.2014856
23    g2  0.3599929
24    g2  2.8469938
25    g2  1.0423259
26    g2  2.7743601
27    g2  2.0474405
28    g2  1.4218076
29    g2  1.9641822
30    g2  0.1260984
31    g2 -0.8393017
32    g2  1.8206394
33    g2  1.1272640
34    g2  1.1755174
35    g2 -0.9301331
36    g3  1.7325398
37    g3  2.9558005
38    g3  2.7040243
39    g3  3.7917801
40    g3  2.4755809
41    g3  1.2343055
42    g3  0.8166677
43    g3  2.6116241
44    g3  1.4483605
45    g3  2.2323040
46    g3  2.2483419
47    g3  2.0539699
48    g3  2.6389746
49    g3  1.2613489
50    g3 -0.5254186
51    g3  2.2930887
52    g3  0.9756143
53    g3  1.1712612
54    g3  1.7737473
55    g3  1.4247293
56    g3  2.9652550
57    g3  1.8377868
58    g3  2.5035726
59    g3  2.1387183
60    g3  2.5785218
61    g3 -0.0456603
62    g3  1.3532960
63    g3  1.6007492
64    g3  3.0275605
65    g3  2.9147624

分散分析表の作成

options(knitr.kable.NA = "")
anova_tb <- matrix(nrow = df$group %>%
    unique() %>%
    length(), ncol = 4)  # 分散分析表
colnames(anova_tb) <- c("偏差平方和", "自由度", "分散", "分散比(F値)")
row.names(anova_tb) <- c("全体変動", "群間変動", "郡内変動(残差)")
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動
群間変動
郡内変動(残差)

偏差平方和の算出

全体偏差平方和
(mu <- df$value %>%
    mean())
[1] 2.392255
anova_tb[1, 1] <- sum((df$value - mu)^2)
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.4241
群間変動
郡内変動(残差)
群間偏差平方和
(df_aggregate <- aggregate(x = df$value, by = list(df$group), FUN = mean) %>%
    {
        colnames(.) <- c("group", "mean")
        .
    })
  group     mean
1    g1 4.974984
2    g2 1.133931
3    g3 1.939774
anova_tb[2, 1] <- sum((df_aggregate$mean - mu)^2 * c(n1, n2, n3))
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.4241
群間変動 137.8671
郡内変動(残差)
郡内偏差平方和
(df_aggregate <- aggregate(x = df$value, by = list(df$group), FUN = function(x) sum((x - mean(x))^2)))
  Group.1        x
1      g1 21.79508
2      g2 24.41345
3      g3 25.34849
anova_tb[3, 1] <- df_aggregate$x %>%
    sum()
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.42411
群間変動 137.86710
郡内変動(残差) 71.55701
# 結果は同一である。
anova_tb[1, 1] - anova_tb[2, 1]
[1] 71.55701

自由度

# 全体変動
anova_tb[1, 2] <- nrow(df) - 1  # n -1
# 群間変動
anova_tb[2, 2] <- length(unique(df$group)) - 1  # 群雛 - 1
# 郡内変動
anova_tb[3, 2] <- nrow(df) - length(unique(df$group))  # n - 群雛
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.42411 64
群間変動 137.86710 2
郡内変動(残差) 71.55701 62

分散

anova_tb[1, 3] <- anova_tb[1, 1]/anova_tb[1, 2]
anova_tb[2, 3] <- anova_tb[2, 1]/anova_tb[2, 2]
anova_tb[3, 3] <- anova_tb[3, 1]/anova_tb[3, 2]
anova_tb %>%
    kable(format = "html") %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.42411 64 3.272252
群間変動 137.86710 2 68.933549
郡内変動(残差) 71.55701 62 1.154145

分散比(F値)

# 群間分散 / 郡内分散
(anova_tb[2, 4] <- anova_tb[2, 3]/anova_tb[3, 3])
[1] 59.72692

分散分析表

knitr::kable(x = anova_tb, format = "html", align = c(rep("r", 4))) %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 209.42411 64 3.272252
群間変動 137.86710 2 68.933549 59.72692
郡内変動(残差) 71.55701 62 1.154145

p値

pf(q = anova_tb[2, 4], df1 = anova_tb[2, 2], df2 = anova_tb[3, 2], lower.tail = FALSE)
[1] 0.000000000000003486513
  • 3群の母平均は等しい、との帰無仮説は棄却される。

(参考) anova {stats}による結果

anova(object = aov(formula = value ~ group, data = df))
Analysis of Variance Table

Response: value
          Df  Sum Sq Mean Sq F value               Pr(>F)    
group      2 137.867  68.934  59.727 0.000000000000003487 ***
Residuals 62  71.557   1.154                                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

サンプル:母平均が同一の場合

set.seed(seed = seed)
n1 <- 15
n2 <- 20
n3 <- 30
m1 <- 5
m2 <- 5
m3 <- 5
sd1 <- sd2 <- sd3 <- 1
g1 <- rnorm(n = n1, mean = m1, sd = sd1)
g2 <- rnorm(n = n2, mean = m2, sd = sd2)
g3 <- rnorm(n = n3, mean = m3, sd = sd3)
(df <- data.frame(group = rep(x = c("g1", "g2", "g3"), times = c(n1, n2, n3)), value = c(g1, g2, g3)))
   group    value
1     g1 5.368427
2     g1 4.617907
3     g1 4.987786
4     g1 1.501818
5     g1 6.586788
6     g1 4.609670
7     g1 4.690688
8     g1 5.452839
9     g1 5.844435
10    g1 6.956209
11    g1 3.781005
12    g1 4.859505
13    g1 5.396428
14    g1 5.221458
15    g1 4.749794
16    g2 6.048526
17    g2 6.201387
18    g2 4.329779
19    g2 3.882229
20    g2 6.204940
21    g2 3.873085
22    g2 5.201486
23    g2 4.359993
24    g2 6.846994
25    g2 5.042326
26    g2 6.774360
27    g2 6.047441
28    g2 5.421808
29    g2 5.964182
30    g2 4.126098
31    g2 3.160698
32    g2 5.820639
33    g2 5.127264
34    g2 5.175517
35    g2 3.069867
36    g3 4.732540
37    g3 5.955800
38    g3 5.704024
39    g3 6.791780
40    g3 5.475581
41    g3 4.234305
42    g3 3.816668
43    g3 5.611624
44    g3 4.448361
45    g3 5.232304
46    g3 5.248342
47    g3 5.053970
48    g3 5.638975
49    g3 4.261349
50    g3 2.474581
51    g3 5.293089
52    g3 3.975614
53    g3 4.171261
54    g3 4.773747
55    g3 4.424729
56    g3 5.965255
57    g3 4.837787
58    g3 5.503573
59    g3 5.138718
60    g3 5.578522
61    g3 2.954340
62    g3 4.353296
63    g3 4.600749
64    g3 6.027561
65    g3 5.914762
anova_tb[] <- NA
mu <- df$value %>%
    mean()
anova_tb[1, 1] <- sum((df$value - mu)^2)
df_aggregate <- aggregate(x = df$value, by = list(df$group), FUN = mean) %>%
    {
        colnames(.) <- c("group", "mean")
        .
    }
anova_tb[2, 1] <- sum((df_aggregate$mean - mu)^2 * c(n1, n2, n3))
df_aggregate <- aggregate(x = df$value, by = list(df$group), FUN = function(x) sum((x - mean(x))^2))
anova_tb[3, 1] <- df_aggregate$x %>%
    sum()
anova_tb[1, 2] <- nrow(df) - 1
anova_tb[2, 2] <- length(unique(df$group)) - 1
anova_tb[3, 2] <- nrow(df) - length(unique(df$group))
anova_tb[1, 3] <- anova_tb[1, 1]/anova_tb[1, 2]
anova_tb[2, 3] <- anova_tb[2, 1]/anova_tb[2, 2]
anova_tb[3, 3] <- anova_tb[3, 1]/anova_tb[3, 2]
anova_tb[2, 4] <- anova_tb[2, 3]/anova_tb[3, 3]

分散分析表

kable(x = anova_tb, format = "html", align = c(rep("r", 4))) %>%
    kable_styling()
偏差平方和 自由度 分散 分散比(F値)
全体変動 72.0301738 64 1.1254715
群間変動 0.4731598 2 0.2365799 0.2049828
郡内変動(残差) 71.5570140 62 1.1541454

p値

pf(q = anova_tb[2, 4], df1 = anova_tb[2, 2], df2 = anova_tb[3, 2], lower.tail = FALSE)
[1] 0.8152112
  • 3群の母平均は等しい、との帰無仮説は棄却されない。

(参考) anova {stats}による結果

anova(object = aov(formula = value ~ group, data = df))
Analysis of Variance Table

Response: value
          Df Sum Sq Mean Sq F value Pr(>F)
group      2  0.473 0.23658   0.205 0.8152
Residuals 62 71.557 1.15415               

参考引用資料

最終更新

Sys.time()
[1] "2024-04-10 04:42:18 JST"

R、Quarto、Package

R.Version()$version.string
[1] "R version 4.3.3 (2024-02-29 ucrt)"
quarto::quarto_version()
[1] '1.4.542'
packageVersion(pkg = "tidyverse")
[1] '2.0.0'

著者