## scivdp.R generates and analyzes nearly collinear data.
## The key concepts--singular value decomposition and variance decomposition
## proportions--are explained in D. Belsley, E. Kuh, and R. Welsch, Regression
## Diagnostics (1980) and D. Belsley, Conditioning Diagnostics (1991).

## Step 1: generate n x k design matrix X with two near dependencies ----

n <- 100
k <- 5
set.seed(676, kind = "default")  # Set seed to facilitate replication.

# The order of the rnorm() calls below determines the generated data; keep it.
xvec <- rnorm(n * k)
X <- array(xvec, c(n, k))
X[, 1] <- 1                                  # Intercept column.
X[, 3] <- X[, 1] + X[, 2] + rnorm(n) * 0.05  # Near dependency among 1, 2, 3.
X[, 5] <- X[, 4] + rnorm(n) * 0.05           # Near dependency between 4 and 5.

## Step 2: analyze design matrix ----

# Euclidean length of each column of the design matrix.
col_lengths <- sqrt(colSums(X^2))

# Scaled design matrix has columns of unit length.
scaledX <- sweep(X, 2, col_lengths, "/")

svdsX <- svd(scaledX)  # Singular value decomposition.
mu <- svdsX$d          # Singular values of the scaled X, largest first.

# Scaled condition indices of X. Each large condition index corresponds to a
# near linear dependence.
etatilde <- mu[1] / mu

# Right singular vectors, i.e. eigenvectors of scaledX'scaledX. Named V rather
# than T so that the built-in shorthand T (TRUE) is not masked.
V <- svdsX$v

# tom[j, i] = V[i, j]^2 / mu[j]^2: the part of coefficient i's variance that
# is associated with singular value j. Dividing a k x k matrix by a length-k
# vector recycles down the columns, so row j is divided by mu[j]^2.
tom <- t(V^2) / mu^2

# Variance inflation factors: column sums of tom, which equal the diagonal of
# solve(t(scaledX) %*% scaledX).
vif <- colSums(tom)

# Variance decomposition proportions: each column of tom as a share of its
# total, so row j / column i is the share of coefficient i's variance
# attributable to singular value j.
vdp <- sweep(tom, 2, vif, "/")

# Check that the proportions for each variable sum to one.
stopifnot(isTRUE(all.equal(colSums(vdp), rep(1, k))))
colSums(vdp)

# One row per singular value: condition index followed by the proportions for
# each of the k variables. Named vdp_table so base::table() is not masked.
vdp_table <- cbind(etatilde, vdp)
colnames(vdp_table) <- c("etatilde", paste0("X", seq_len(k)))
vdp_table

## A large element in the first column indicates a near linear dependence.
## Large elements to its right indicate which variables are involved in this
## near dependence.