--- title: 'SpaCOAP: mouse spleen dataset' author: "Wei Liu" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{SpaCOAP: mouse spleen dataset} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This vignette introduces the SpaCOAP workflow for the analysis of spatial multi-omics dataset. We demonstrate the use of SpaCOAP to one mouse spleen data collected from the SPOTS platform that are [here](https://github.com/feiyoung/SpaCOAP/tree/master/vignettes_data), which can be downloaded to the current working path by the following command: ```{r eval=FALSE} url1 <- "https://github.com/feiyoung/SpaCOAP/raw/master/vignettes_data/" rna_object <- "seu_rna_over_Spleen.RDS?raw=true" download.file(paste0(url1, rna_object),"seu_rna_over_Spleen.RDS",mode='wb') protein_object <- "seu_adt_over_Spleen.RDS?raw=true" download.file(paste0(url1,protein_object), 'seu_adt_over_Spleen.RDS', mode='wb') ## download annotation download.file(paste0(url1, "cell_clusters_anno.rds?raw=true"), "cell_clusters_anno.rds", mode="wb") ``` ### Data preparation Then we load the data to R. ```{r eval = FALSE} seu_rna_over <- readRDS("./seu_rna_over_Spleen.RDS") seu_adt_over <- readRDS("./seu_adt_over_Spleen.RDS") load("./cell_clusters_anno.rds") ``` The package can be loaded with the command: ```{r eval = FALSE} library(SpaCOAP) # ``` Define some functions for use. ```{r eval = FALSE} searchRadius <- function(pos, lower.med=8, upper.med=10, radius.upper= NULL){ if (!inherits(pos, "matrix")) stop("method is only for matrix object!") ## Automatically determine the upper radius n_spots <- nrow(pos) idx <- sample(n_spots, min(100, n_spots)) dis <- dist(pos[idx,]) if(is.null(radius.upper)){ #radius.upper <- max(dis) radius.upper <- sort(dis)[20] ## select the nearest 20 spots. } radius.lower <- min(dis[dis>0]) Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=radius.upper) Med <- summary(Matrix::rowSums(Adj_sp))['Median'] if(Med < lower.med) stop("The radius.upper is too smaller that cannot find median neighbors greater than 4.") start.radius <- 1 Med <- 0 message("Find the adjacency matrix by bisection method...") maxIter <- 30 k <- 1 while(!(Med >= lower.med && Med <=upper.med)){ # ensure that each spot has about 4~6 neighborhoods in median. Adj_sp <- DR.SC:::getneighborhood_fast(pos, radius=start.radius) Med <- summary(Matrix::rowSums(Adj_sp))['Median'] if(Med < lower.med){ radius.lower <- start.radius start.radius <- (radius.lower + radius.upper)/2 }else if(Med >upper.med){ radius.upper <- start.radius start.radius <- (radius.lower + radius.upper)/2 } message("Current radius is ", round(start.radius, 2)) message("Median of neighborhoods is ", Med) if(k > maxIter) { message("Reach the maximum iteration but can not find a proper radius!") break; } k <- k + 1 } return(start.radius) } acc_fun <- function(y1, y2){ n1 <- length(unique(y1)) n2 <- length(unique(y2)) if(n1 n2 a <- y1 y1 <- y2 y2 <- a n1 <- length(unique(y1)) n2 <- length(unique(y2)) } cm <- as.matrix(table(Actual = y1, Predicted = y2)) rnames <-row.names(cm) cnames <- colnames(cm) union_names <- union(rnames, cnames) n <- length(union_names) cm_new <- matrix(0, n, n) row.names(cm_new) <- colnames(cm_new) <- union_names for(r in 1:n2){ cm_new[rnames,cnames[r]] <- cm[rnames,cnames[r]] } sum(diag(cm_new)) / length(y1) } kappa_fun <- function(y1, y2){ require(irr) dat <- data.frame(y1, y2) k_res <- kappa2(dat) k_res$value } ``` ## Obtain data matrices and tunning parameters Wrap the data matrix from the SeuratObject, including the RNA expression count matrix `X_count`, covraite matrix `H` associated with protein markters, control variables (here only an intercept term) `Z`, and spatial coordinate matrix `pos`. ```{r eval = FALSE} X_count <- Matrix::t(seu_rna_over[["RNA"]][seu_rna_over[['RNA']]@var.features,]) X_count <- as.matrix(X_count) H <- t(as.matrix(seu_adt_over[["ADT"]]@data)) Z <- matrix(1, nrow(H), 1) pos <- cbind(seu_rna_over$X0, seu_rna_over$X1) radius_use <- searchRadius(pos, radius.upper = NULL) set.seed(1) n_spots <- nrow(pos) idx <- sample(n_spots, min(100, n_spots)) dis <- dist(pos[idx,]) Adj_sp <- SpaCOAP:::getneighbor_weightmat(pos, radius = radius_use, width=median(dis)) ``` ### Determine the structure dimension Next, we select the number of factors and the rank of regreesion coefficient matrix using the proposed criterion. ```{r eval = FALSE} q_max <- 20 d <- ncol(H) rank_max <- d tic <- proc.time() reslist_max <- SpaCOAP(X_count, Adj_sp, H, Z, rank_use = rank_max, q=q_max, epsELBO = 1e-8, maxIter = 30) toc <- proc.time() time_spacoap_max <- toc[3] - tic[3] ``` We apply the criterion, taking into account that the real data does not necessarily originate from our model. Therefore, we opt for a conservative approach that retains more information. ```{r eval = FALSE} threshold=c(1e-15, 1e-20) thre1 <- threshold[1] beta_svalues <- svd(reslist_max$bbeta)$d beta_svalues <- beta_svalues[beta_svalues>thre1] ratio1 <- beta_svalues[-length(beta_svalues)] / beta_svalues[-1] ratio1[1:10] ## Here, we set hr = 9 rather 1 to retain more information thre2 <- threshold[2] B_svalues <- svd(reslist_max$B)$d B_svalues <- B_svalues[B_svalues>thre2] ratio_fac <- B_svalues[-length(B_svalues)] / B_svalues[-1] ratio_fac[1:6] # [1] 6.123909e+00 2.749414e+00 1.376498e+00 1.314487e+00 3.310699e+14 # Here, we choose q=5 since huge decrease of singular values happen in q=5. hq <- 5 ``` ### Fitting SpaCOAP First, we run the proposed SpaCOAP method. ```{r eval = FALSE} hr <- 9;hq <- 5 featureList <- list() tic <- proc.time() reslist <- SpaCOAP(X_count, Adj_sp, H=H, Z= Z, rank_use = hr, q=hq, epsELBO = 1e-8) toc <- proc.time() time_spacoap <- toc[3] - tic[3] Matrix::rankMatrix(reslist$bbeta) svd_x <- svd(reslist$bbeta, nu = hr, nv=hr) H_spacoap <- H %*% svd_x$v (R2_spacoap <- ProFAST::get_r2_mcfadden(embeds= cbind(H_spacoap, reslist$F), y=as.factor(cell_clusters))) featureList[['SpaCOAP']] <- cbind(H_spacoap, reslist$F) ``` ### Compare SpaCOAP and other methods First, we run COAP and calculate the MacFadden's R-square. ```{r eval = FALSE} ##COAP library(COAP) tic <- proc.time() res_coap <- RR_COAP(X_count, Z = cbind(Z, H), rank_use= 2+hr, q=hq, epsELBO = 1e-7, maxIter = 30) toc <- proc.time() time_coap <- toc[3] - tic[3] svd_x <- svd(res_coap$bbeta[,-c(1)], nu = hr, nv=hr) H_coap <- H %*% svd_x$v (R2_coap <- ProFAST::get_r2_mcfadden(embeds= cbind(res_coap$H, H_coap), y=as.factor(cell_clusters))) featureList[['COAP']] <- cbind(res_coap$H, H_coap) ``` Second, we run MRRR and calculate the MacFadden's R-square. ```{r eval = FALSE} ## MRRR mrrr_run <- function(Y, X, rank0, q=NULL, family=list(poisson()), familygroup=rep(1,ncol(Y)), epsilon = 1e-4, sv.tol = 1e-2, maxIter = 2000, trace=TRUE, truncflag=FALSE, trunc=500){ # epsilon = 1e-4; sv.tol = 1e-2; maxIter = 30; trace=TRUE # Y <- X_count; X <- cbind(Z, H); rank0 = r + ncol(Z) require(rrpack) n <- nrow(Y); p <- ncol(Y) if(!is.null(q)){ rank0 <- rank0+q X <- cbind(X, diag(n)) } if(truncflag){ ## Trunction Y[Y>trunc] <- trunc } svdX0d1 <- svd(X)$d[1] init1 = list(kappaC0 = svdX0d1 * 5) offset = NULL control = list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter, trace = trace, gammaC0 = 1.1, plot.cv = TRUE, conv.obj = TRUE) fit.mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup, penstr = list(penaltySVD = "rankCon", lambdaSVD = 1), control = control, init = init1, maxrank = rank0) return(fit.mrrr) } tic <- proc.time() res_mrrr <- mrrr_run(X_count, cbind(Z,H), rank0=hr+ncol(Z), q=hq) str(res_mrrr) toc <- proc.time() time_mrrr <- toc[3] - tic[3] hbbeta_mrrr <-t(res_mrrr$coef[1:ncol(cbind(Z,H)), ]) svd_x <- svd(hbbeta_mrrr[,-c(1)], nu = hr, nv=hr) H_mrrr <- H %*% svd_x$v Theta_hb <- (res_mrrr$coef[(ncol(cbind(Z,H))+1): (nrow(cbind(Z,H))+ncol(cbind(Z,H))), ]) svdTheta <- svd(Theta_hb, nu=hq, nv=hq) F_mrrr <- svdTheta$u (R2_mrrr <- ProFAST::get_r2_mcfadden(embeds= cbind(H_mrrr, F_mrrr), y=as.factor(cell_clusters))) featureList[['MRRR']] <- cbind(H_mrrr, F_mrrr) ``` Finally, we execute FAST and compute the MacFadden's R-squared. It is worth noting that we refrain from comparing with PLNPCA in this demo due to its time-consuming nature. ```{r eval = FALSE} fast_run <- function(X_count, Adj_sp, q, verbose=TRUE, epsELBO=1e-8){ require(ProFAST) reslist <- FAST_run(XList = list(X_count), AdjList = list(Adj_sp), q = q, fit.model = 'poisson', verbose=verbose, epsLogLik=epsELBO) reslist$hV <- reslist$hV[[1]] return(reslist) } tic <- proc.time() res_fast <- fast_run(X_count, Adj_sp, q=hq, verbose=TRUE, epsELBO=1e-8) toc <- proc.time() time_fast <- toc - tic (R2_fast <- ProFAST::get_r2_mcfadden(embeds= res_fast$hV, y=as.factor(cell_clusters))) featureList[['FAST']] <- res_fast$hV ``` ### Summarize the metrics We evaluate the correlation between the low-dimensional representations learned by SpaCOAP and other methods with the annotated cell clusters using two metrics. The first metric is the adjusted MacFadden's $\mathrm{R}^2$, referred to as $\mathrm{MacR}^2$. This measure quantifies the amount of biological information captured by the representations, where a higher value signifies superior performance in representation learning. ```{r eval = FALSE} R2List <- list() cell_label <- cell_clusters for(im in 1: length(featureList)){ message("im = ", im) R2List[[im]] <- ProFAST::get_r2_mcfadden(embeds= featureList[[im]], y=cell_label) } names(R2List) <- names(featureList) ``` ```{r eval = FALSE} R2Vec <- unlist(R2List) names(R2Vec) <- names(R2List) barplot(R2Vec, ylim=c(0, 0.8)) ``` Finally, we employ a classification model trained through randomForest, leveraging the cell clusters and the learned low-dimensional representations by SpaCOAP and other methods. We then compare the prediction performance of this model. To achieve this, we randomly divide the samples into training and testing data with a ratio of $7:3$. We evaluate the model's performance using prediction accuracy (ACC) and Cohen's Kappa on the testing data. In contrast of ACC, Kappa offers a more robust measure of accuracy, as it takes into account the possibility that the agreement occurs by chance. We repeat this division process ten times to ensure the reliability of our findings. ```{r eval = FALSE} N <- 10 n <- length(cell_label) methodNames <- c("SpaCOAP", "COAP", "MRRR", "FAST") n_methods <- length(methodNames) metricList <- list(ACC = matrix(NA,N, n_methods), Kappa = matrix(NA,N, n_methods)) for(ii in 1: length(metricList)) colnames(metricList[[ii]]) <- methodNames library(randomForest) for(i in 1:N){ # i <- 1 message("i = ", i) set.seed(i) idx_train <- sort(sample(n, round(n*0.7))) idx_test <- sort(setdiff(1:n, idx_train)) rf_spacoap <- randomForest(featureList[['SpaCOAP']][idx_train,], y=cell_label[idx_train]) hy_spacoap <- predict(rf_spacoap, newdata=featureList[['SpaCOAP']][idx_test,]) metricList$ACC[i,1] <- acc_fun(hy_spacoap, cell_label[idx_test]) metricList$Kappa[i,1] <- kappa_fun(hy_spacoap, cell_label[idx_test]) rf_coap <- randomForest(featureList[['COAP']][idx_train,], y=cell_label[idx_train]) hy_coap <- predict(rf_coap, newdata=featureList[['COAP']][idx_test,]) metricList$ACC[i,2] <- acc_fun(hy_coap, cell_label[idx_test]) metricList$Kappa[i,2] <- kappa_fun(hy_coap, cell_label[idx_test]) colnames(featureList[['MRRR']]) <- paste0("MRRR", 1:ncol(featureList[['MRRR']])) rf_MRRR <- randomForest(featureList[['MRRR']][idx_train,], y=cell_label[idx_train]) hy_MRRR <- predict(rf_MRRR, newdata=featureList[['MRRR']][idx_test,]) metricList$ACC[i,3] <- acc_fun(hy_MRRR, cell_label[idx_test]) metricList$Kappa[i,3] <- kappa_fun(hy_MRRR, cell_label[idx_test]) colnames(featureList[['FAST']]) <- paste0("FAST", 1:ncol(featureList[['FAST']])) rf_FAST <- randomForest(featureList[['FAST']][idx_train,], y=cell_label[idx_train]) hy_FAST <- predict(rf_FAST, newdata=featureList[['FAST']][idx_test,]) metricList$ACC[i,4] <- acc_fun(hy_FAST, cell_label[idx_test]) metricList$Kappa[i,4] <- kappa_fun(hy_FAST, cell_label[idx_test]) } DT::datatable(t(round(sapply(metricList, colMeans, na.rm=T),2) )) ```
**Session Info** ```{r} sessionInfo() ```