4 Tidying Data

library(knitr)
library(tidyverse)
#> Loading tidyverse: ggplot2
#> Loading tidyverse: tibble
#> Loading tidyverse: tidyr
#> Loading tidyverse: readr
#> Loading tidyverse: purrr
#> Loading tidyverse: dplyr
#> Conflicts with tidy packages ----------------------------------------------
#> filter(): dplyr, stats
#> lag():    dplyr, stats
library(PKPDmisc)
# Read the combined ETA/covariate table; column types are guessed by readr
# and echoed in the parse message that follows.
ebe_cov <- read_csv("../data/ebe_cov.csv")
#> Parsed with column specification:
#> cols(
#>   ID = col_integer(),
#>   ETA1 = col_double(),
#>   ETA2 = col_double(),
#>   ETA3 = col_integer(),
#>   ETA4 = col_double(),
#>   ETA5 = col_double(),
#>   ETA6 = col_double(),
#>   ETA7 = col_double(),
#>   ETA8 = col_double(),
#>   ETA9 = col_double(),
#>   BW = col_double(),
#>   BMI = col_double(),
#>   AGE = col_integer(),
#>   AST = col_integer(),
#>   ALT = col_integer(),
#>   CRCL = col_double(),
#>   SEX = col_integer(),
#>   RACE = col_integer()
#> )

Remove all of the ETA columns and name the resulting data frame covs

# Drop the ETA columns (ETA1..ETA9), keeping ID and the covariates.
# A case-sensitive prefix match is used instead of contains("ETA"): contains()
# is a case-insensitive substring match and would also drop any covariate
# whose name merely contains "eta" (e.g. "BETA" or "THETA").
covs <- ebe_cov %>%
  select(-starts_with("ETA", ignore.case = FALSE))

Find all columns with an NA value, and the associated ID

# Keep ID together with every column that holds at least one NA.
# Filter() retains the columns for which anyNA() is TRUE; union() puts ID
# first and avoids duplicating it should ID itself contain an NA.
has_missing <- covs[union("ID", names(Filter(anyNA, covs)))]
head(has_missing)
#> # A tibble: 6 x 3
#>      ID    BW   SEX
#>   <int> <dbl> <int>
#> 1     1 109.4     1
#> 2     4 120.2     0
#> 3     5  83.0     0
#> 4     6  64.2     0
#> 5     7  74.4     0
#> 6     8  68.4     0

# Stack the covariate columns into long form (one row per ID/covariate pair)
# and keep only the pairs whose value is missing.
missing_ids <- dplyr::filter(
  gather(has_missing, key = cov, value = values, -ID),
  is.na(values)
)
missing_ids
#> # A tibble: 2 x 3
#>      ID   cov values
#>   <int> <chr>  <dbl>
#> 1    69    BW     NA
#> 2    65   SEX     NA

Impute the NA values using the mean for any continuous variable, and the largest group for any categorical variable

# Summarise the columns that contain NAs; SEX is shown as a factor so that
# summary() reports counts per category instead of numeric quantiles.
has_missing %>%
  mutate(SEX = as.factor(SEX)) %>%
  select(-ID) %>%
  summary()
#>        BW          SEX    
#>  Min.   : 51.7   0   :48  
#>  1st Qu.: 75.2   1   :12  
#>  Median : 88.1   NA's: 1  
#>  Mean   : 90.3            
#>  3rd Qu.:103.6            
#>  Max.   :159.2            
#>  NA's   :1
# Most frequent non-missing value of `x`, returned with the same type as `x`.
# Ties resolve to the smallest value.
most_common <- function(x) {
  observed <- sort(unique(x))  # sort() drops NA, so NA is never a candidate
  counts <- tabulate(match(x, observed), nbins = length(observed))
  observed[which.max(counts)]
}

# Replacement values: the mean for continuous BW, and the largest observed
# group for categorical SEX (derived from the data rather than hard-coded).
replacement_values <- has_missing %>%
  summarize(
    BW = mean(BW, na.rm = TRUE),
    SEX = most_common(SEX)  # integer in, integer out: SEX keeps its type
  )

# Fill each NA with the replacement value for its column.
covs <- covs %>%
  replace_na(replace = as.list(replacement_values))

# Check the rows that previously had missing values.
covs %>%
  filter(ID %in% missing_ids$ID)
#> # A tibble: 2 x 9
#>      ID    BW   BMI   AGE   AST   ALT  CRCL   SEX  RACE
#>   <int> <dbl> <dbl> <int> <int> <int> <dbl> <int> <int>
#> 1    65  97.5  32.7    47    18    23   116     0     0
#> 2    69  90.3  27.5    39    32    53   161     0     1

4.1 Data checkout for all covariates

Set all categorical covariates to factors with appropriate labels

# Recode the numeric SEX and RACE codes as labelled factors (SEXC, RACEC),
# then drop the original numeric columns.
covs_f <- covs %>%
  mutate(
    SEXC = factor(SEX, levels = 0:1, labels = c("FEMALE", "MALE")),
    RACEC = factor(RACE, levels = 0:2, labels = c("WHITE", "BLACK", "ASIAN"))
  ) %>%
  select(-SEX, -RACE)

# Long form of the columns BW through CRCL: one row per ID/covariate pair,
# with the covariate name in `cov` and its value in `value`.
g_cont_covs <- gather(covs_f, key = cov, value = value, BW:CRCL)

Plot a scatter plot of all continuous covariates versus ID to check for visual outliers

# One panel per covariate, each with its own axis scales, value against ID.
ggplot(g_cont_covs, aes(x = ID, y = value)) +
  facet_wrap(~cov, scales = "free") +
  geom_point()

Plot a violin/box plot of all continuous covariates versus SEX to check for visual trends

# Distribution of each covariate by sex: a violin outline with the individual
# observations jittered on top, one free-scaled panel per covariate.
ggplot(data = g_cont_covs, mapping = aes(x = SEXC, y = value)) +
  geom_violin() +
  geom_jitter(width = 0.1) +
  facet_wrap(~cov, scales = "free")

Plot a violin/box plot of all continuous covariates versus all categorical covariates to check for visual trends

# Stack RACEC and SEXC into long form as well, so that a single grid shows
# every covariate in `cov` (rows) against every categorical covariate (columns).
ggplot(
  data = gather(g_cont_covs, key = catcov, value = catvals, RACEC, SEXC),
  mapping = aes(x = catvals, y = value)
) +
  geom_violin() +
  geom_jitter(width = 0.1) +
  facet_grid(cov ~ catcov, scales = "free")
#> Warning: attributes are not identical across measure variables; they will
#> be dropped

# Record the R version and package versions used to render this chapter.
devtools::session_info()
#> Session info -------------------------------------------------------------
#>  setting  value                       
#>  version  R version 3.4.0 (2017-04-21)
#>  system   x86_64, mingw32             
#>  ui       RTerm                       
#>  language (EN)                        
#>  collate  English_United States.1252  
#>  tz       Europe/Prague               
#>  date     2017-06-05
#> Packages -----------------------------------------------------------------
#>  package    * version  date       source                            
#>  assertthat   0.2.0    2017-04-11 CRAN (R 3.4.0)                    
#>  backports    1.1.0    2017-05-22 CRAN (R 3.4.0)                    
#>  base       * 3.4.0    2017-04-21 local                             
#>  bindr        0.1      2016-11-13 CRAN (R 3.4.0)                    
#>  bindrcpp   * 0.1      2016-12-11 CRAN (R 3.4.0)                    
#>  bookdown     0.4      2017-05-20 CRAN (R 3.4.0)                    
#>  broom        0.4.2    2017-02-13 CRAN (R 3.4.0)                    
#>  cellranger   1.1.0    2016-07-27 CRAN (R 3.4.0)                    
#>  codetools    0.2-15   2016-10-05 CRAN (R 3.4.0)                    
#>  colorspace   1.3-2    2016-12-14 CRAN (R 3.4.0)                    
#>  compiler     3.4.0    2017-04-21 local                             
#>  datasets   * 3.4.0    2017-04-21 local                             
#>  devtools     1.13.1   2017-05-13 CRAN (R 3.4.0)                    
#>  digest       0.6.12   2017-01-27 CRAN (R 3.4.0)                    
#>  dplyr      * 0.6.0    2017-06-02 Github (tidyverse/dplyr@b064c4b)  
#>  evaluate     0.10     2016-10-11 CRAN (R 3.4.0)                    
#>  forcats      0.2.0    2017-01-23 CRAN (R 3.4.0)                    
#>  foreign      0.8-67   2016-09-13 CRAN (R 3.4.0)                    
#>  ggplot2    * 2.2.1    2016-12-30 CRAN (R 3.4.0)                    
#>  glue         1.0.0    2017-04-17 CRAN (R 3.4.0)                    
#>  graphics   * 3.4.0    2017-04-21 local                             
#>  grDevices  * 3.4.0    2017-04-21 local                             
#>  grid         3.4.0    2017-04-21 local                             
#>  gtable       0.2.0    2016-02-26 CRAN (R 3.4.0)                    
#>  haven        1.0.0    2016-09-23 CRAN (R 3.4.0)                    
#>  hms          0.3      2016-11-22 CRAN (R 3.4.0)                    
#>  htmltools    0.3.6    2017-04-28 CRAN (R 3.4.0)                    
#>  httr         1.2.1    2016-07-03 CRAN (R 3.4.0)                    
#>  jsonlite     1.5      2017-06-01 CRAN (R 3.4.0)                    
#>  knitr      * 1.16     2017-05-18 CRAN (R 3.4.0)                    
#>  labeling     0.3      2014-08-23 CRAN (R 3.4.0)                    
#>  lattice      0.20-35  2017-03-25 CRAN (R 3.4.0)                    
#>  lazyeval     0.2.0    2016-06-12 CRAN (R 3.4.0)                    
#>  lubridate    1.6.0    2016-09-13 CRAN (R 3.4.0)                    
#>  magrittr     1.5      2014-11-22 CRAN (R 3.4.0)                    
#>  memoise      1.1.0    2017-04-21 CRAN (R 3.4.0)                    
#>  methods      3.4.0    2017-04-21 local                             
#>  mnormt       1.5-5    2016-10-15 CRAN (R 3.4.0)                    
#>  modelr       0.1.0    2016-08-31 CRAN (R 3.4.0)                    
#>  munsell      0.4.3    2016-02-13 CRAN (R 3.4.0)                    
#>  nlme         3.1-131  2017-02-06 CRAN (R 3.4.0)                    
#>  parallel     3.4.0    2017-04-21 local                             
#>  PKPDmisc   * 1.0.0    2017-06-02 Github (dpastoor/PKPDmisc@23e1f49)
#>  plyr         1.8.4    2016-06-08 CRAN (R 3.4.0)                    
#>  psych        1.7.5    2017-05-03 CRAN (R 3.4.0)                    
#>  purrr      * 0.2.2.2  2017-05-11 CRAN (R 3.4.0)                    
#>  R6           2.2.1    2017-05-10 CRAN (R 3.4.0)                    
#>  Rcpp         0.12.11  2017-05-22 CRAN (R 3.4.0)                    
#>  readr      * 1.1.1    2017-05-16 CRAN (R 3.4.0)                    
#>  readxl       1.0.0    2017-04-18 CRAN (R 3.4.0)                    
#>  reshape2     1.4.2    2016-10-22 CRAN (R 3.4.0)                    
#>  rlang        0.1.1    2017-05-18 CRAN (R 3.4.0)                    
#>  rmarkdown    1.5.9000 2017-06-03 Github (rstudio/rmarkdown@ea515ef)
#>  rprojroot    1.2      2017-01-16 CRAN (R 3.4.0)                    
#>  rvest        0.3.2    2016-06-17 CRAN (R 3.4.0)                    
#>  scales       0.4.1    2016-11-09 CRAN (R 3.4.0)                    
#>  stats      * 3.4.0    2017-04-21 local                             
#>  stringi      1.1.5    2017-04-07 CRAN (R 3.4.0)                    
#>  stringr      1.2.0    2017-02-18 CRAN (R 3.4.0)                    
#>  tibble     * 1.3.3    2017-05-28 CRAN (R 3.4.0)                    
#>  tidyr      * 0.6.3    2017-05-15 CRAN (R 3.4.0)                    
#>  tidyverse  * 1.1.1    2017-01-27 CRAN (R 3.4.0)                    
#>  tools        3.4.0    2017-04-21 local                             
#>  utils      * 3.4.0    2017-04-21 local                             
#>  withr        1.0.2    2016-06-20 CRAN (R 3.4.0)                    
#>  xml2         1.1.1    2017-01-24 CRAN (R 3.4.0)                    
#>  yaml         2.1.14   2016-11-12 CRAN (R 3.4.0)