# Load the data set (tab-separated text file with a header row)
death <- read.table("data/synthetic_data_ReCoDe.txt", header = TRUE, sep = "\t")
# Months need to be two-digit numbers stored as text (e.g. "01"), not month names
death$monthofbirth <- as.character(death$monthofbirth)
# month.name is the base R vector of English month names in calendar order, so
# the matched position is the month number. Values that are not a full English
# month name (including NA) are left untouched.
is_month_name <- death$monthofbirth %in% month.name
death$monthofbirth[is_month_name] <- sprintf(
  "%02d",
  match(death$monthofbirth[is_month_name], month.name)
)
# Build the date of birth as "YYYY-MM-DD" text and convert it to a Date.
# The day is fixed to the 1st of the month of birth.
death$dateofbirth <- paste0(death$yearofbirth, "-", death$monthofbirth, "-01")
death$dateofbirth <- as.Date(death$dateofbirth)
death$date_of_death <- as.Date(death$date_of_death)
# Change to the "end of study" censoring date
death$now_cens <- as.Date("2023-01-01")
Data curation
The UK Biobank (synthetic) dataset
Throughout this project, we will use a synthetic version of the UK Biobank data. The UK Biobank is a large cohort data set composed of over 500,000 volunteers recruited across the United Kingdom between 2006-2010. For this project, we will use data from 252,877 participants who have high quality lung function data to select participants with airflow obstruction (AO) (defined as FEV1/FVC < LLN). We will use AO as our outcome to determine the mortality risk in people with AO vs people without AO.
Essential Variables
Event (death) variables | Description | Coding | Outcome variable | Description | Coding |
---|---|---|---|---|---|
cofdmain | Main cause of death coded by International Classification of Disease v10 (ICD10) This variable will be used later in the course to stratify the analysis by type of death | 0 = No event (alive/censored) 1 = Event (dead) | AO.fev1fvc | Airflow obstruction derived as: forced expiratory volume at 1 s/ forced vital capacity below the lower limit of normal (from NHANES equations) (FEV1/FVC<LLN) | 0 = No airflow obstruction 1 = Airflow obstruction |
Date of death | Date of participant’s death obtained from the National Death Registry | Stored as character; must be converted to Date format |
Additional variables used in the multivariable (adjusted model)
Covariates name | Description | Coding |
---|---|---|
eid | Unique participant Identification Number for anonymised data | Integer |
Age | Age of participant at recruitment (when they entered the study) | Integer |
Sex | Sex of participant | Male, Female |
bmi | Body Mass Index of participant | Numeric |
UKB_centre | UK Biobank Centre at which the participant was recruited | Character |
Ethnicity | Genetically-derived ethnicity of the participant | White vs Other |
Smoking_status | Smoking Status of the participant at recruitment | Never, current, previous |
Data Curation
Note: the `Surv()` function in the {survival} package, which will be used in the Survival analysis chapter, accepts different formatting of the event variable, e.g. `TRUE/FALSE`, where `TRUE` is event and `FALSE` is censored; `1/0`, where `1` is event and `0` is censored; or `2/1`, where `2` is event and `1` is censored.
Please make sure that the event is properly formatted. In this example we will be using 0/1.
Now that the dates are formatted, we need to calculate the difference between start and end dates in some units, in this example the time will be calculated in years.
As participants could enter the study at any point between 2006-2010, there is a delayed entry into the study. This type of data is known as left-truncated, and as we will define an “end date” the data will also be right-censored. We therefore use age as the time-scale to account for these.
# Age at death and age at the censoring date, both measured from date of birth.
# Subtracting two Date columns gives a difftime in days.
death$timetoevent <- death$date_of_death - death$dateofbirth
death$timetocens <- death$now_cens - death$dateofbirth

# Select final time: if not dead (no date of death) use the censoring date.
# ifelse() drops the difftime class, so `time` is a plain number of days.
death$time <- ifelse(is.na(death$date_of_death), death$timetocens, death$timetoevent)

death$time_years <- death$time / 365.25 # To have time in years
## Censoring of participants - do we know when they died? If so = 1 (event occurs),
## otherwise = 0 as we do not know when this could happen.
# NOTE(review): the column is built from as.character(), so it holds the text
# values "0"/"1" and the `> 0` test below is a string comparison - confirm that
# downstream code converts it to numeric before passing it to Surv().
death$allcause_death <- as.character(death$cofdmain)
death$allcause_death[is.na(death$allcause_death)] <- 0 # No event occurred (ALIVE)
death$allcause_death[death$allcause_death > 0] <- 1 # Event occurs (Death)
Overview of the data
library(knitr)
# Keep the first 15 rows of the columns used to illustrate the survival data
head_data <- head(death[, c("Sex", "Smoking_status", "allcause_death", "time_years")], 15)

# Create a table
kable(head_data, caption = "First 15 Observations of Survival Analysis Data")
Sex | Smoking_status | allcause_death | time_years |
---|---|---|---|
Female | Never | 1 | 67.56742 |
Female | Never | 0 | 58.25051 |
Male | Never | 0 | 72.41889 |
Female | Never | 0 | 64.75291 |
Female | Never | 0 | 82.58453 |
Male | Previous | 0 | 65.41821 |
Female | Previous | 0 | 64.25188 |
Female | Never | 0 | 79.67146 |
Male | Previous | 0 | 81.50308 |
Female | Previous | 0 | 67.08556 |
Female | Never | 0 | 64.08487 |
Male | Never | 0 | 84.00000 |
Male | Current | 0 | 62.16564 |
Female | Never | 0 | 74.58453 |
Female | Never | 0 | 71.33470 |
How many participants have experienced the event? (e.g. dead)
library(dplyr)
library(tidyr)
library(knitr)
# Create a summary table: count participants per event status and sex, then
# spread sex into columns (missing combinations filled with 0) and add a total
summary_table <- death %>%
  group_by(Sex, allcause_death) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = Sex, values_from = count, values_fill = list(count = 0)) %>%
  mutate(Total = Male + Female)

# Display the summary table
kable(summary_table, caption = "Number of participants that have died by sex")
allcause_death | Female | Male | Total |
---|---|---|---|
0 | 142563 | 95574 | 238137 |
1 | 6986 | 7754 | 14740 |
How many participants have airflow obstruction (outcome)?
library(dplyr)
library(tidyr)
library(knitr)
# Create a summary table: count participants per airflow-obstruction status and
# sex, then spread sex into columns (missing combinations filled with 0) and
# add a total
summary_table <- death %>%
  group_by(Sex, AO.fev1fvc) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = Sex, values_from = count, values_fill = list(count = 0)) %>%
  mutate(Total = Male + Female)

# Display the summary table
kable(summary_table, caption = "Number of participants with airflow obstruction by sex")
AO.fev1fvc | Female | Male | Total |
---|---|---|---|
0 | 139183 | 91844 | 231027 |
1 | 10366 | 11484 | 21850 |
#save(death, file = "data/result.RData")