Skip to content

Commit 4df1dca

Browse files
authored
Merge pull request #152 from Sage-Bionetworks/GEN-1476-use-tier1a-code
[GEN-1476] use detailed columns to extract tier1a code
2 parents eb1dc27 + 577b299 commit 4df1dca

File tree

3 files changed

+154
-24
lines changed

3 files changed

+154
-24
lines changed

scripts/case_selection/export_bpc_selected_cases.R

+11-23
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ print("get clinical data")
8484
sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame()
8585
race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame()
8686
ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame()
87-
# sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
87+
#sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
8888

8989
# output setup
9090
phase_no_space <- sub(" ","_",sub(" ","",phase))
@@ -120,35 +120,23 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id]
120120
samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases]
121121

122122
print("map data for each instrument")
123-
# mapping data for each instrument
124123
# instrument - patient_characteristics
125-
patient_output <- data.frame("record_id" = existing_patients)
126-
patient_output$redcap_repeat_instrument <- rep("")
127-
patient_output$redcap_repeat_instance <- rep("")
128-
129-
patient_output$genie_patient_id <- patient_output$record_id
130-
patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)]
131-
patient_output$naaccr_ethnicity_code <- clinical$ethnicity[match(patient_output$genie_patient_id, clinical$patient_id)]
132-
patient_output$naaccr_race_code_primary <- clinical$primary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
133-
patient_output$naaccr_race_code_secondary <- clinical$secondary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
134-
patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
135-
patient_output$naaccr_sex_code <- clinical$sex[match(patient_output$genie_patient_id, clinical$patient_id)]
136-
137-
# mapping to code
138-
patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$CBIO_LABEL)]
139-
patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$CBIO_LABEL)]
140-
patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$CBIO_LABEL)]
141-
patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$CBIO_LABEL)]
142-
patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$CBIO_LABEL)]
124+
patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
125+
# check missing values
126+
# get naaccr code columns
127+
naaccr_col <- grep("naaccr", colnames(patient_output), value = TRUE)
128+
# error out if NAs or empty strings are detected in naaccr code columns
129+
check_for_missing_values(patient_output, naaccr_col)
130+
143131
print("recode")
144132
# recode
145133
# cannotReleaseHIPAA = NA
146134
patient_output$birth_year[which(patient_output$birth_year == "cannotReleaseHIPAA")] <- NA
147-
# -1 Not collected = 9 Unknown
135+
# -1 Not collected = 9 Unknown whether Spanish or not
148136
patient_output$naaccr_ethnicity_code[which(patient_output$naaccr_ethnicity_code == -1)] <- 9
149-
# -1 Not collected = 99 Unknown
137+
# -1 Not collected = 99 Unknown by patient
150138
patient_output$naaccr_race_code_primary[which(patient_output$naaccr_race_code_primary == -1)] <- 99
151-
# -1 Not collected = 88 according to NAACCR
139+
# -1 Not collected = 88 No further race documented according to NAACCR
152140
patient_output$naaccr_race_code_secondary[which(patient_output$naaccr_race_code_secondary == -1)] <- 88
153141
patient_output$naaccr_race_code_tertiary[which(patient_output$naaccr_race_code_tertiary == -1)] <- 88
154142

scripts/case_selection/shared_fxns.R

+56-1
Original file line numberDiff line numberDiff line change
@@ -255,4 +255,59 @@ get_main_genie_clinical_id <- function(release){
255255
}
256256
}
257257
return(NULL)
258-
}
258+
}
259+
260+
#' Mapping data for patient_characteristics
261+
#'
262+
#' @param clinical A data frame of released clinical data for selected cases
263+
#' @param existing_patients A data frame of available patient after case selection
264+
#' @param ethnicity_mapping The NAACCR_ETHNICITY_MAPPING data frame
265+
#' @param race_mapping The NAACCR_RACE_MAPPING data frame
266+
#' @param sex_mapping The NAACCR_SEX_MAPPING data frame
267+
#' @return A data frame with mapped code
268+
remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){
269+
270+
patient_df <- data.frame("record_id" = existing_patients)
271+
patient_df$redcap_repeat_instrument <- rep("")
272+
patient_df$redcap_repeat_instance <- rep("")
273+
274+
patient_df$genie_patient_id <- patient_df$record_id
275+
patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)]
276+
patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
277+
patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
278+
patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
279+
patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
280+
patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
281+
282+
# mapping to code
283+
patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
284+
patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
285+
patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
286+
patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
287+
patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]
288+
289+
return(patient_df)
290+
}
291+
292+
#' Check for missing values in naaccr columns
293+
#'
294+
#' @param data The data frame to check against
295+
#' @param columns The target columns
296+
check_for_missing_values <- function(data, columns) {
297+
# Check for NA values or empty strings
298+
missingness_col <- c()
299+
for (col in columns) {
300+
if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) {
301+
# filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns
302+
relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ]
303+
} else{
304+
relevant_rows <- data
305+
}
306+
if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){
307+
missingness_col <- c(col, missingness_col)
308+
}
309+
}
310+
if (length(missingness_col) > 0) {
311+
warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", ")))
312+
}
313+
}

scripts/case_selection/tests/test_shared_fxns.R

+87
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,90 @@ test_that("get_main_genie_clinical_id returns NULL when data_clinical.txt does n
5252
result <- get_main_genie_clinical_id(release)
5353
expect_null(result)
5454
})
55+
56+
test_that("remap_patient_characteristics works as expected", {
57+
58+
# Mock input data
59+
clinical <- data.frame(
60+
patient_id = c(1, 2, 3),
61+
birth_year = c(1980, 1990, 2000),
62+
ethnicity_detailed = c("Hispanic", "Non-Hispanic", "Hispanic"),
63+
primary_race_detailed = c("White", "Black", "Asian"),
64+
secondary_race_detailed = c("Unknown", "White", "Black"),
65+
tertiary_race_detailed = c("Asian", "Unknown", "White"),
66+
sex_detailed = c("Male", "Female", "Male")
67+
)
68+
69+
existing_patients <- c(1, 2, 3)
70+
71+
ethnicity_mapping <- data.frame(
72+
DESCRIPTION = c("Hispanic", "Non-Hispanic"),
73+
CODE = c("1", "2")
74+
)
75+
76+
race_mapping <- data.frame(
77+
DESCRIPTION = c("White", "Black", "Asian", "Unknown"),
78+
CODE = c("1", "2", "3", "99")
79+
)
80+
81+
sex_mapping <- data.frame(
82+
DESCRIPTION = c("Male", "Female"),
83+
CODE = c("M", "F")
84+
)
85+
86+
# Expected output
87+
expected_output <- data.frame(
88+
record_id = c(1, 2, 3),
89+
redcap_repeat_instrument = c("", "", ""),
90+
redcap_repeat_instance = c("", "", ""),
91+
genie_patient_id = c(1, 2, 3),
92+
birth_year = c(1980, 1990, 2000),
93+
naaccr_ethnicity_code = c("1", "2", "1"),
94+
naaccr_race_code_primary = c("1", "2", "3"),
95+
naaccr_race_code_secondary = c("99", "1", "2"),
96+
naaccr_race_code_tertiary = c("3", "99", "1"),
97+
naaccr_sex_code = c("M", "F", "M")
98+
)
99+
100+
# Run the function
101+
result <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
102+
103+
# Test if the output is as expected
104+
expect_equal(result, expected_output)
105+
})
106+
107+
test_that("check_for_missing_values - no missing or empty values", {
108+
data <- data.frame(
109+
col1 = c(1, 2, 3),
110+
col2 = c("a", "b", "c"),
111+
genie_patient_id = c("a", "b", "CHOP123"),
112+
naaccr_race_code_tertiary = c("a", "b", "c"),
113+
naaccr_race_code_secondary = c("a", "b", "c")
114+
)
115+
expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")))
116+
117+
})
118+
119+
test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", {
120+
data <- data.frame(
121+
col1 = c(1, NA, ""),
122+
col2 = c("a", "b", "c"),
123+
genie_patient_id = c("CHOP123", "b", "PROV234"),
124+
naaccr_race_code_tertiary = c("a", "", "c"),
125+
naaccr_race_code_secondary = c("a", "b", "c")
126+
)
127+
expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")),
128+
"Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1")
129+
})
130+
131+
test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", {
132+
data <- data.frame(
133+
col1 = c(1, NA, ""),
134+
col2 = c("a", "", "c"),
135+
genie_patient_id = c("CHOP123", "b", "PROV234"),
136+
naaccr_race_code_tertiary = c("", "b", "c"),
137+
naaccr_race_code_secondary = c("a", "b", NA)
138+
)
139+
expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")),
140+
"Warning: Missing or empty values found in column\\(s\\): col2, col1")
141+
})

0 commit comments

Comments
 (0)