---
title: 'ILSE: simulated examples'
author: "Wei Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{ILSE: simulated examples}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Load ILSE package

The package can be loaded with the command:

```{r}
library("ILSE")
```

# ILSE can handle (non)missing data with continuous variables

First, we generate a small simulated data set.

```{r eval = FALSE}
set.seed(1)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,-1), times=3)
Y <- -2 + X %*% beta0 + rnorm(n, sd=1)
```

## A special case: without missing values

Then, we fit the linear regression model without missing values based on ILSE.

```{r eval = FALSE}
ilse1 <- ilse(Y~X)
print(ilse1)
```

We can also create a *data.frame* object as input for ILSE.

```{r eval = FALSE}
dat <- data.frame(Y=Y, X=X)
ilse1 <- ilse(Y~., data=dat)
print(ilse1)
Coef(ilse1)  # access the coefficients
Fitted.values(ilse1)[1:5]
Residuals(ilse1)[1:5]
```

Check the significant variables by bootstrap.

```{r eval = FALSE}
s1 <- summary(ilse1)
s1
```

## Handle data with missing values

First, we randomly remove some entries in X.

```{r eval = FALSE}
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
ncomp <- sum(complete.cases(Xmis))
message("Number of complete cases is ", ncomp, '\n')
```

Second, we use *lm* to fit the linear regression model based only on the complete cases, i.e., complete-case (CC) analysis. It fails to detect any significant covariates.

```{r eval = FALSE}
lm1 <- lm(Y~Xmis)
s_cc <- summary.lm(lm1)
s_cc
```

Third, we use *ILSE* to fit the linear regression model based on all of the data. A model without an intercept can be fitted by adjusting the formula:

```{r eval = FALSE}
ilse2 <- ilse(Y~Xmis+0, data=NULL, verbose=TRUE)
print(ilse2)
```

Then, we fit a linear regression model with an intercept by the following command:

```{r eval = FALSE}
ilse2 <- ilse(Y~Xmis, data=NULL, verbose=TRUE)
print(ilse2)
```

Fourth, the *bootstrap* is applied to evaluate the standard error and p-value of each coefficient estimated by ILSE. We observe four significant coefficients.

```{r eval = FALSE}
s2 <- summary(ilse2, Nbt=20)
s2
```

The *ILSE* package also provides full information maximum likelihood (FIML) for linear regression via *fimlreg*. We show how to use it to handle the above missing data.

```{r eval = FALSE}
fimllm <- fimlreg(Y~Xmis)
print(fimllm)
```

We also use the *bootstrap* to evaluate the standard error and p-value of each coefficient estimated by FIML. We observe only one significant coefficient.

```{r eval = FALSE}
s_fiml <- summary(fimllm, Nbt=20)
s_fiml
```

## Visualization

We visualize the p-values of each method, where the red line marks 0.05 on the y-axis and the blue line marks 0.1.

```{r eval = FALSE}
pMat <- cbind(CC=s_cc$coefficients[,4], ILSE=s2[,4], FIML=s_fiml[,4])
library(ggplot2)
df1 <- data.frame(Pval = as.vector(pMat[-1,]),
                  Method = factor(rep(c('CC', "ILSE", "FIML"), each=p)),
                  covariate = factor(rep(paste0("X", 1:p), times=3)))
ggplot(data=df1, aes(x=covariate, y=Pval, fill=Method)) +
  geom_bar(position = "dodge", stat="identity", width = 0.5) +
  geom_hline(yintercept = 0.05, color='red') +
  geom_hline(yintercept = 0.1, color='blue')
```
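As a quick numeric complement to the plot, one can also count how many covariates each method flags at the 0.05 level. The sketch below simply reuses the *pMat* matrix constructed above and assumes, as in that chunk, that its first row corresponds to the intercept.

```{r eval = FALSE}
# A minimal sketch reusing pMat from the chunk above (intercept in row 1).
sig <- pMat[-1, ] < 0.05   # logical matrix: covariates in rows, methods in columns
colSums(sig)               # number of covariates each method flags at the 0.05 level
apply(sig, 2, which)       # which covariates each method flags
```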
# ILSE can handle missing data with continuous and categorical variables

Based on the above data, we add a new column, a categorical variable (Sex), to the data.frame. This variable is not associated with the outcome variable.

```{r eval = FALSE}
dat <- data.frame(Y=Y, X=Xmis)
dat$Sex <- factor(rep(c('male', 'female'), times=n/2))
dat$Sex[sample(1:n, n*mis_rate)] <- NA
ilse1 <- ilse(Y~., data=dat, verbose = TRUE)
```

We can change the number of bootstrap replicates used to calculate the standard errors, Z values and p-values of the coefficients.

```{r eval = FALSE}
s3 <- summary(ilse1, Nbt=40)
s3
```

# ILSE can correctly identify the important variables

## Generate data

First, we generate data from a linear regression model with three important variables (1, 3, 5) and three unimportant variables (2, 4, 6).

```{r eval = FALSE}
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,0), times=3)
Y <- -2 + X %*% beta0 + rnorm(n, sd=1)
message("The true regression coefficients are: ", paste0(beta0, ' '))
```

We randomly assign missing values in the design matrix.

```{r eval = FALSE}
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
```

Next, we use ILSE to fit the model.

```{r eval = FALSE}
dat <- data.frame(Y=Y, X=Xmis)
ilse1 <- ilse(Y~., data=dat, verbose = TRUE)
s3 <- summary(ilse1)
s3
```

We also fit the model using *lm* (CC analysis) and FIML, and then compare ILSE with these two methods.

```{r eval = FALSE}
lm1 <- lm(Y~Xmis)
s_cc <- summary.lm(lm1)
fimllm <- fimlreg(Y~Xmis)
s_fiml <- summary(fimllm)
```

## Visualization

We visualize the p-values of each method, where the red line marks 0.05 on the y-axis. Under the significance level 0.05, both ILSE and FIML identify all important variables (X1, X3 and X5), while the CC method only identifies X1 and X5.

```{r eval = FALSE}
library(ggthemes)
pMat <- cbind(CC=s_cc$coefficients[,4], ILSE=s3[,4], FIML=s_fiml[,4])
df1 <- data.frame(Pval = as.vector(pMat[-1,]),
                  Method = factor(rep(c('CC', "ILSE", "FIML"), each=p)),
                  covariate = factor(rep(paste0("X", 1:p), times=3)))
ggplot(data=df1, aes(x=covariate, y=Pval, fill=Method)) +
  geom_bar(position = "dodge", stat="identity", width = 0.5) +
  geom_hline(yintercept = 0.05, color='red') +
  scale_fill_economist()
```

# ILSE can handle data with high missing rate

Here, we generate a data set with 80% missing values, then use ILSE to fit the model.

```{r eval = FALSE}
# generate data from the linear model
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,-1), times=3)
Y <- -2 + X %*% beta0 + rnorm(n, sd=1)
# generate missing values
mis_rate <- 0.8
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
# retain 4 complete cases
Xmis[1:4,] <- X[1:4, ]
sum(complete.cases(Xmis))
```

The CC method fails.

```{r eval = FALSE}
lm1 <- lm(Y~Xmis)
summary.lm(lm1)
```

However, ILSE still works.

```{r eval = FALSE}
ilse2 <- ilse(Y~Xmis, verbose = TRUE)
s2 <- summary(ilse2)
s2
```

# ILSE can handle large-scale data

We generate a large-scale data set with n=1000 and p=50.

```{r eval = FALSE}
n <- 1000
p <- 50
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,-1), length=p)
Y <- -2 + X %*% beta0 + rnorm(n, sd=1)
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
Xmis[1:10,] <- X[1:10,]
lm1 <- lm(Y~Xmis)
lm1
system.time(ilse2 <- ilse(Y~Xmis, data=NULL, verbose=TRUE))
```

# Session information

```{r}
sessionInfo()
```