データの読み込みと前処理

データの読み込み

# CSVファイルの読み込み
df <- read_csv("datasheet.csv")
## New names:
## Rows: 1510 Columns: 14
## ── Column specification
## ────────────────────────────────────────────────────────────
## Delimiter: "," chr (1): HbA1c dbl (12): ...1, ID, age, sex, race, weight, height,
## smoke, drink, regular_e... date (1): visit_date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify
## the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# データ構造の確認
glimpse(df)
## Rows: 1,510
## Columns: 14
## $ ...1             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ ID               <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,…
## $ visit_date       <date> 2020-01-14, 2020-03-19, 2020-07-07, 2020-08-15, 2020…
## $ age              <dbl> 64, 64, 64, 64, 60, 60, 60, 60, 54, 54, 54, 54, 54, 5…
## $ sex              <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ race             <dbl> 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ weight           <dbl> 93.1, 93.1, 93.1, 93.1, 68.4, 68.4, 68.4, 68.4, 77.7,…
## $ height           <dbl> 173.3, 173.3, 173.3, 173.3, 168.1, 168.1, 168.1, 168.…
## $ smoke            <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ drink            <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,…
## $ regular_exercise <dbl> 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ interv           <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,…
## $ HbA1c            <chr> "5.94126651006355", NA, "6.42409394851312", "6.643686…
## $ CRE              <dbl> 0.6574349, 0.1556760, 0.2058530, 0.8931811, 0.5259037…
# 欠損値の確認
colSums(is.na(df))
##             ...1               ID       visit_date              age 
##                0                0                0                0 
##              sex             race           weight           height 
##                0                0                0                0 
##            smoke            drink regular_exercise           interv 
##                0                0                0                0 
##            HbA1c              CRE 
##              287                0

データの前処理

# 日付型への変換
df <- df %>%
  mutate(
    visit_date = ymd(visit_date),
    # regular_exerciseを因子型に変換
    regular_exercise = factor(regular_exercise),
    # sexを因子型に変換
    sex = factor(sex, levels = c(1, 2), labels = c("男性", "女性")),
    # raceを因子型に変換
    race = factor(race, levels = c(1, 2, 3), labels = c("白人", "黒人", "その他")),
    # smoke, drinkを因子型に変換
    smoke = factor(smoke, levels = c(0, 1), labels = c("非喫煙", "喫煙")),
    drink = factor(drink, levels = c(0, 1), labels = c("飲酒なし", "飲酒あり")),
    # intervを因子型に変換
    interv = factor(interv, levels = c(0, 1), labels = c("対照群", "介入群"))
  )

# BMIの計算
df <- df %>%
  mutate(bmi = weight / ((height/100)^2))

# クレアチニンクリアランスの計算(Cockcroft-Gault式)
df <- df %>%
  mutate(
    CrCL = case_when(
      sex == "男性" ~ (140 - age) * weight / (72 * CRE),
      sex == "女性" ~ (140 - age) * weight / (72 * CRE) * 0.85
    )
  )

# 患者ごとの最初と最後のvisit_date
df <- df %>%
  group_by(ID) %>%
  mutate(
    first_visit_date = min(visit_date),
    last_visit_date = max(visit_date)
  ) %>%
  ungroup()

# HbA1cが7以上の場合のイベント作成
df <- df %>%
  mutate(event = ifelse(HbA1c >= 7, 1, 0))

除外基準の適用

# 初回来院時のHbA1c >= 7または欠測の患者を除外
excluded_ids <- df %>%
  group_by(ID) %>%
  filter(visit_date == first_visit_date) %>%
  filter(HbA1c >= 7 | is.na(HbA1c)) %>%
  pull(ID)

df_clean <- df %>%
  filter(!ID %in% excluded_ids)

解析用データセットの作成

# baselineデータフレーム
baseline <- df_clean %>%
  group_by(ID) %>%
  filter(visit_date == first_visit_date) %>%
  ungroup()

# event_dateデータフレーム
event_date <- df_clean %>%
  group_by(ID) %>%
  filter(event == 1) %>%
  filter(visit_date == min(visit_date)) %>%
  select(ID, visit_date, event) %>%
  rename(
    event_date = visit_date,
    status = event
  ) %>%
  ungroup()

# df_mergedデータフレーム
df_merged <- baseline %>%
  left_join(event_date, by = "ID") %>%
  mutate(
    status = replace_na(status, 0),
    time = case_when(
      status == 0 ~ as.numeric(last_visit_date - first_visit_date),
      status == 1 ~ as.numeric(event_date - first_visit_date)
    )
  )

解析結果

患者背景表

# 年齢カテゴリーの作成
baseline <- baseline %>%
  mutate(
    age_cat = cut(age, 
                  breaks = c(-Inf, 40, 60, 80, Inf),
                  labels = c("40歳以下", "41-60歳", "61-80歳", "81歳以上")),
    CrCL_cat = cut(CrCL,
                   breaks = c(-Inf, 30, 60, Inf),
                   labels = c("30未満", "30-60", "60以上"))
  )

# 背景表の作成
vars <- c("age", "age_cat", "sex", "smoke", "drink", "CrCL", "CrCL_cat", 
          "bmi", "regular_exercise")

baseline_table <- CreateTableOne(
  vars = vars,
  strata = "interv",
  data = baseline,
  test = TRUE
)

print(baseline_table, 
      nonnormal = c("age", "CrCL", "bmi"),
      showAllLevels = TRUE)
##                       Stratified by interv
##                        level    対照群                  介入群                 
##   n                                117                     105                 
##   age (median [IQR])             51.00 [44.00, 57.00]    51.00 [44.00, 56.00]  
##   age_cat (%)          40歳以下     20 (17.1)               17 (16.2)          
##                        41-60歳      82 (70.1)               73 (69.5)          
##                        61-80歳      15 (12.8)               15 (14.3)          
##                        81歳以上      0 ( 0.0)                0 ( 0.0)          
##   sex (%)              男性         57 (48.7)               61 (58.1)          
##                        女性         60 (51.3)               44 (41.9)          
##   smoke (%)            非喫煙       82 (70.1)               67 (63.8)          
##                        喫煙         35 (29.9)               38 (36.2)          
##   drink (%)            飲酒なし     68 (58.1)               65 (61.9)          
##                        飲酒あり     49 (41.9)               40 (38.1)          
##   CrCL (median [IQR])           182.21 [105.64, 257.95] 162.42 [103.45, 291.23]
##   CrCL_cat (%)         30未満        0 ( 0.0)                1 ( 1.0)          
##                        30-60         8 ( 6.8)                4 ( 3.8)          
##                        60以上      109 (93.2)              100 (95.2)          
##   bmi (median [IQR])             25.87 [21.76, 28.84]    23.80 [20.78, 28.37]  
##   regular_exercise (%) 0            28 (23.9)               28 (26.7)          
##                        1            32 (27.4)               35 (33.3)          
##                        2            42 (35.9)               33 (31.4)          
##                        3            15 (12.8)                9 ( 8.6)          
##                       Stratified by interv
##                        p      test   
##   n                                  
##   age (median [IQR])    0.926 nonnorm
##   age_cat (%)             NaN        
##                                      
##                                      
##                                      
##   sex (%)               0.207        
##                                      
##   smoke (%)             0.395        
##                                      
##   drink (%)             0.662        
##                                      
##   CrCL (median [IQR])   0.846 nonnorm
##   CrCL_cat (%)          0.354        
##                                      
##                                      
##   bmi (median [IQR])    0.102 nonnorm
##   regular_exercise (%)  0.558        
##                                      
##                                      
## 

生存時間解析

カプランマイヤー曲線

# 生存曲線の作成
fit <- survfit(Surv(time, status) ~ interv, data = df_merged)

# プロット
ggsurvplot(
  fit,
  data = df_merged,
  risk.table = TRUE,
  conf.int = TRUE,
  xlab = "観察期間(日)",
  ylab = "無イベント生存率",
  risk.table.height = 0.25,
  ggtheme = theme_bw(),
  legend.labs = c("対照群", "介入群"),
  legend.title = ""
)

Cox比例ハザードモデル

# Coxモデルの作成
cox_model <- coxph(
  Surv(time, status) ~ interv + age + sex + race + smoke + drink + 
    CrCL + bmi + regular_exercise + interv:regular_exercise,
  data = df_merged
)

# 結果の表示
tbl_regression(
  cox_model,
  exponentiate = TRUE
) %>%
  add_global_p()
Characteristic HR 95% CI p-value
interv

<0.001
    対照群
    介入群 0.14 0.05, 0.39
age 0.99 0.97, 1.02 0.6
sex

0.6
    男性
    女性 0.87 0.54, 1.42
race

0.7
    白人
    黒人 0.78 0.44, 1.40
    その他 0.91 0.49, 1.69
smoke

0.8
    非喫煙
    喫煙 0.94 0.55, 1.62
drink

0.4
    飲酒なし
    飲酒あり 0.82 0.51, 1.33
CrCL 1.00 1.00, 1.00 0.5
bmi 1.02 0.98, 1.07 0.4
regular_exercise

0.7
    0
    1 1.35 0.66, 2.78
    2 0.98 0.49, 1.97
    3 0.90 0.40, 2.03
interv * regular_exercise

0.11
    介入群 * 1 0.11 0.01, 1.09
    介入群 * 2 0.48 0.10, 2.41
    介入群 * 3 1.29 0.20, 8.30
Abbreviations: CI = Confidence Interval, HR = Hazard Ratio

サブグループ解析

# regular_exerciseごとのカプランマイヤー曲線
fit_subgroup <- survfit(Surv(time, status) ~ interv + regular_exercise, 
                       data = df_merged)

# プロット
ggsurvplot(
  fit_subgroup,
  data = df_merged,
  risk.table = TRUE,
  conf.int = TRUE,
  xlab = "観察期間(日)",
  ylab = "無イベント生存率",
  risk.table.height = 0.25,
  ggtheme = theme_bw(),
  facet.by = "regular_exercise",
  legend.labs = c("対照群", "介入群"),
  legend.title = ""
)
## Warning in (function (survsummary, times, survtable = c("cumevents",
## "risk.table", : The length of legend.labs should be 8