# # PCA_scores.R (May 2013, by Xiaoling Dou) # ############################################################ ############################################################ ### find the influence score vectors by the PCA method ##### ############################################################ # This function requires a dataset, chromosome number and # # loci. # # The loci should be a vector consisting of integers no # # larger than the total number of locie. # # The dataset should contain ID, phenotype, sex and # # genotypes. # # The first row of the dataset is the description of each # # column, the second row contains the name of chromosomes, # # and the third row contains the locations of marker loci. # ############################################################ ############################################################ #rm(list=ls(all=TRUE)) ISVs<- function(X=dataset, chromosome, loci){ # if (length(loci) != length(cl)) stop("length(loci) should be equal to length(cl) !") ### QTL data arrangement ### NM<- dim(X) N<- NM[1]-3; M<- NM[2]-3 #N: sample size; M: number of loci# geno<- matrix(0, nrow=N, ncol=M) X2<- X[-c(1:3),-c(1:3)] for(i in 1:M){ x0<- as.vector(X2[,i]) x1<- replace(x0, which(x0=="A"), -1) x2<- replace(x1, which(x1=="B"), 1) x3<- replace(x2, which(x2=="H"), 0) geno[,i]<- as.numeric(x3) #genotypes, N*M matrix# } X23<- X[2:3,-c(1:3)] #chromosome number and locations# chr.location<- paste(X23[1,], X23[2,], sep="_") colnames(geno)<- chr.location #chromosome and the locations# pheno<- as.numeric(X[-c(1:3), 2]) #phenotype: pheno# sex<- as.numeric(X[-c(1:3), 3]) #sex of the mice# chro<- which(X23[1,]==chromosome) loca.chr<- as.numeric(X23[2, chro]) #locations on the given chromosome# x<- cbind(pheno, sex, geno) #phenotype, sex and genotypes: N*(M+2) matrix# ### Find Lod scores, parameters and EIFs ### Lod<- numeric(M); theta1<- matrix(0, 5, M); logf1<- matrix(0, N, M) y<- pheno #phenotype# ui<- rep(1, N) vi<- sex #sex# xi0<- cbind(ui,vi) bh0<- solve(t(xi0) %*% xi0) %*% t(xi0) %*% y #coefficients under H0# e0<- y- xi0 %*% bh0 L0<- t(e0) %*% e0 sig0<- as.vector(L0/N) #sig0 is the squared sigma0.# theta0<- c(bh0, sig0) #parameters under H0# logc0<- (-log(2 * pi * sig0))/2 ; loge0<- (y-as.vector(xi0 %*% bh0))^2 /(-2* sig0) logf0<- logc0+loge0 ei<- matrix(0, nrow=N, ncol=M) for (i in 1:M){ zi<- geno[,i] #genotypes:z# wi<- 2* (zi^2) -1 #w# xi1<- cbind(zi,wi,ui,vi) ## bh1<- solve(t(xi1) %*% xi1)%*% t(xi1) %*% y #coefficients under H1# ei[,i]<- y - xi1 %*% bh1 L1<- t(ei[,i])%*% ei[,i] sig1<- as.vector(L1/N) ; theta1[,i]<- c(bh1, sig1) #sig1 is the squared sigma1; theta1 is the parameters under H1.# Lod[i]<- log((L0 / L1)^(N / 2))/log(10) #This is the LOD score.# logc1<- (-log(2 * pi * sig1))/2 ; loge1<- (y-(xi1 %*% bh1))^2 /(-2* sig1) logf1[,i]<- logc1+loge1 } logD<- (logf1 - logf0)*N / log(10) #Using it to find EIF of the lod scores.# EIF<- t(t(logD)- Lod) #EIF(N*M) is the empirical influence matrix of all the individuals for all LOD scores.# ### Specify the chromosome and the loci ### m <- length(chro) if (sum(loci > m) >= 1) stop(paste("loci should be a vector containing some integers no larger than ", m, "!")) Eif <- EIF[,chro[loci]] Rifun<- function(i){ #i is the index of the loci; equation (18) in the paper.# u<- rep(1, N); v<- sex U<- cbind(u,v) zi<- geno[,i]; wi<- 2*((zi)^2) -1 ZiU <- cbind(zi, wi, U) Hi1 <- ZiU %*% solve(t(ZiU) %*% ZiU) %*% t(ZiU) H0 <- U %*% solve(t(U) %*% U) %*% t(U) Ri<- Hi1-H0 return(Ri) } k<- length(loci) #Find Cov,the equation (17), in the paper.# Cov<- matrix(0, nrow=k, ncol=k) R<- list(); r<- 0 for(i in chro[loci]){ r<- r+1 R[[r]]<- Rifun(i) } for(i in 1:k){ for(j in 1:k){ Cov[i,j]<- sum(diag( R[[i]] %*% R[[j]] )) * 2/ ( 2* log(10) )^2 } } ICov<- solve(Cov) #ICov is the inverse matrix of Cov.# vec1<- rep(1,k) #Use matrix J to remove the parallel shift.# J<- diag(k)- ( vec1 %*% t(vec1) %*% ICov ) /as.vector(t(vec1) %*% ICov %*% vec1) JCJ<- J %*% Cov %*% t(J) require(MASS) Gjcj<- ginv(JCJ) #Moore-Penrose generalized invers of the matrix JCJ # SgiS<- Eif %*% t(J) %*% Gjcj %*% J %*% t(Eif) eSgiS<- eigen(SgiS) #Find the eigenvalues and eigenvectors.# eval<- eSgiS$values evec<- eSgiS$vectors Eval<- which(eval/sum(eval) > 0.1) Evalue<- eval[Eval] ISV<- matrix(0, nrow=N, ncol=length(Eval)) for(l in Eval){ ISV[,l]<- sqrt(Evalue[l])* evec[,l] } return(list(Lod=Lod, EIF=EIF, evalue=eval, evector=evec, ISV=ISV, Eval=Eval)) #Lod scores, EIF and eigenvalues, eigenvectors and influence score vectors are returned.# } ############################################################ # Function for plotting influence score vectors. # # The figure shows the individual having the maximum absolute influence scores. # plot.isv<- function(ISV=isv, d=dimension, mx=1){ ds <- dim(ISV)[2] if (d > ds) stop(paste("The dimension d should not larger than ", ds, "!")) isvd<- ISV[,d] m.x <- which(rank(abs(isvd)) > length(isvd) - mx) m.isv<- max(abs(isv))+10 oldpar<- par(no.readonly=TRUE) on.exit(par(oldpar)) par(cex=1.2) plot(isvd, ylim=c(-m.isv, m.isv), xlab="Index", ylab=paste("Influence Score Vector", d)) text(m.x +8, isvd[m.x]-8, paste(m.x, " (", round(isvd[m.x],3),")"),col=1,cex=0.7) par(oldpar) } ############################################################ # Function for plotting two influence score vectors. # # The figure shows the individuals having the maximum influence scores. # plot2d.isv<- function(ISV=isv, d1=dimension, d2=dimension, mx=1){ ds <- dim(ISV)[2] if (d1 > ds) stop(paste("The dimension d1 should not larger than ", ds, "!")) if (d2 > ds) stop(paste("The dimension d2 should not larger than ", ds, "!")) isvd1<- ISV[,d1] isvd2<- ISV[,d2] m.x <- which(rank(isvd1^2+isvd2^2) > length(isvd1) - mx) m.isv<- max(abs(isv))+10 oldpar<- par(no.readonly=TRUE) on.exit(par(oldpar)) par(cex=1.2) plot(isvd1, isvd2, xlim=c(-m.isv, m.isv), ylim=c(-m.isv, m.isv), xlab=paste("Influence Score Vector", d1), ylab=paste("Influence Score Vector", d2)) text(isvd1[m.x]+8, isvd2[m.x]-8, paste(m.x),col=1,cex=0.7) par(oldpar) } ############################################################ ############################################################ ### dataset #### X1<-matrix(scan("Data_LogADI.txt", what="character"), 173, byrow=TRUE) ### Input the dataset, specify chromosome and marker loci ### isvs<- ISVs(X=X1, chromosome=3, loci=c(3,4,5)) isv <- isvs$ISV # Plot isv # # Indicate the number(s) d (d1, d2) for the influence score vector(s) # plot.isv(ISV=isv, d=2, mx=3); readline() plot2d.isv(ISV=isv, d1=1, d2=2, mx=3) ############################################################