#
# PCA_scores.R (May 2013, by Xiaoling Dou)
#
############################################################
############################################################
### find the influence score vectors by the PCA method #####
############################################################
# This function requires a dataset, chromosome number and  #
# loci.                                                    #
# The loci should be a vector consisting of integers no    #
# larger than the total number of locie.                   #
# The dataset should contain ID, phenotype, sex and        #
# genotypes.                                               #
# The first row of the dataset is the description of each  #
# column, the second row contains the name of chromosomes, #
# and the third row contains the locations of marker loci. #
############################################################
############################################################

#rm(list=ls(all=TRUE))

ISVs<- function(X=dataset, chromosome, loci){

# if (length(loci) != length(cl)) stop("length(loci) should be equal to length(cl) !")

### QTL data arrangement ###

        NM<- dim(X)
        N<- NM[1]-3; M<- NM[2]-3 #N: sample size; M: number of loci#
        geno<- matrix(0, nrow=N, ncol=M) 
        X2<- X[-c(1:3),-c(1:3)] 
        for(i in 1:M){
        x0<- as.vector(X2[,i])
        x1<- replace(x0, which(x0=="A"), -1)
        x2<- replace(x1, which(x1=="B"), 1)
        x3<- replace(x2, which(x2=="H"), 0)
        geno[,i]<- as.numeric(x3)               #genotypes, N*M matrix#
        }

        X23<- X[2:3,-c(1:3)] #chromosome number and locations#
        chr.location<- paste(X23[1,], X23[2,], sep="_")
        colnames(geno)<- chr.location           #chromosome and the locations#

        pheno<- as.numeric(X[-c(1:3), 2])       #phenotype: pheno#
        sex<- as.numeric(X[-c(1:3), 3])         #sex of the mice#

        chro<- which(X23[1,]==chromosome)
        loca.chr<- as.numeric(X23[2, chro])     #locations on the given chromosome#

        x<- cbind(pheno, sex, geno)             #phenotype, sex and genotypes: N*(M+2) matrix#

### Find Lod scores, parameters and EIFs ###
        Lod<- numeric(M); theta1<- matrix(0, 5, M); logf1<- matrix(0, N, M)
        y<- pheno                               #phenotype#
        ui<- rep(1, N)  
        vi<- sex  #sex#
        xi0<- cbind(ui,vi)
        bh0<- solve(t(xi0) %*% xi0) %*% t(xi0) %*% y  #coefficients under H0#
        e0<- y- xi0 %*% bh0
        L0<-  t(e0) %*% e0 
        sig0<- as.vector(L0/N)                  #sig0 is the squared sigma0.#
        theta0<- c(bh0, sig0)                   #parameters under H0#
        logc0<- (-log(2 * pi * sig0))/2 ; loge0<- (y-as.vector(xi0 %*% bh0))^2 /(-2* sig0)
        logf0<- logc0+loge0

        ei<- matrix(0, nrow=N, ncol=M)
        for (i in 1:M){
        zi<- geno[,i]                                                                     #genotypes:z#
        wi<- 2* (zi^2) -1                                                                 #w#
        xi1<- cbind(zi,wi,ui,vi) ##
        bh1<- solve(t(xi1) %*% xi1)%*% t(xi1) %*% y   #coefficients under H1#
        ei[,i]<- y - xi1 %*% bh1
        L1<- t(ei[,i])%*% ei[,i]
        sig1<- as.vector(L1/N) ; theta1[,i]<- c(bh1, sig1)  #sig1 is the squared sigma1; theta1 is the parameters under H1.#
        Lod[i]<- log((L0 / L1)^(N / 2))/log(10)       #This is the LOD score.#

        logc1<- (-log(2 * pi * sig1))/2 ; loge1<- (y-(xi1 %*% bh1))^2 /(-2* sig1)
        logf1[,i]<- logc1+loge1
        }

        logD<- (logf1 - logf0)*N / log(10)            #Using it to find EIF of the lod scores.#

        EIF<- t(t(logD)- Lod)                         #EIF(N*M) is the empirical influence matrix of all the individuals for all LOD scores.#

### Specify the chromosome and the loci ###
        m <- length(chro)

        if (sum(loci > m) >= 1) stop(paste("loci should be a vector containing some integers no larger than ", m, "!"))

        Eif <- EIF[,chro[loci]]

        Rifun<- function(i){  #i is the index of the loci; equation (18) in the paper.#
        u<- rep(1, N); v<- sex
        U<- cbind(u,v)
        zi<- geno[,i]; wi<- 2*((zi)^2) -1 
        ZiU <- cbind(zi, wi, U)
        Hi1 <- ZiU %*% solve(t(ZiU) %*% ZiU) %*% t(ZiU)
        H0 <- U %*% solve(t(U) %*% U) %*% t(U)
        Ri<- Hi1-H0
        return(Ri)
        }

        k<- length(loci)                              #Find Cov,the equation (17), in the paper.#
        Cov<- matrix(0, nrow=k, ncol=k)
        R<- list(); r<- 0
        for(i in chro[loci]){
                r<- r+1
                R[[r]]<- Rifun(i)
        }

for(i in 1:k){
        for(j in 1:k){
                Cov[i,j]<- sum(diag( R[[i]] %*% R[[j]] )) * 2/ ( 2* log(10) )^2 
        }
}

ICov<- solve(Cov)                                     #ICov is the inverse matrix of Cov.#

vec1<- rep(1,k)                                       #Use matrix J to remove the parallel shift.#
J<- diag(k)- ( vec1 %*% t(vec1) %*% ICov ) /as.vector(t(vec1) %*% ICov %*% vec1)

JCJ<- J %*% Cov %*% t(J)

require(MASS)
Gjcj<- ginv(JCJ)                                      #Moore-Penrose generalized invers of the matrix JCJ #

SgiS<- Eif %*% t(J) %*% Gjcj %*% J %*% t(Eif)
eSgiS<- eigen(SgiS)                                   #Find the eigenvalues and eigenvectors.#
eval<- eSgiS$values
evec<- eSgiS$vectors

Eval<- which(eval/sum(eval) > 0.1)
Evalue<- eval[Eval]
ISV<- matrix(0, nrow=N, ncol=length(Eval))
for(l in Eval){
        ISV[,l]<- sqrt(Evalue[l])* evec[,l] 
}

        return(list(Lod=Lod, EIF=EIF, evalue=eval, evector=evec, ISV=ISV, Eval=Eval))                   #Lod scores, EIF and eigenvalues, eigenvectors and influence score vectors are returned.#
        }

############################################################
# Function for plotting influence score vectors. #
# The figure shows the individual having the maximum absolute influence scores. #

plot.isv<- function(ISV=isv, d=dimension, mx=1){
        ds <- dim(ISV)[2]
        if (d > ds) stop(paste("The dimension d should not larger than ", ds, "!"))

        isvd<- ISV[,d]
        m.x <- which(rank(abs(isvd)) > length(isvd) - mx)
        m.isv<- max(abs(isv))+10
        oldpar<- par(no.readonly=TRUE)
        on.exit(par(oldpar))
        par(cex=1.2)
        plot(isvd, ylim=c(-m.isv, m.isv), xlab="Index", ylab=paste("Influence Score Vector", d))
        text(m.x +8, isvd[m.x]-8, paste(m.x, " (", round(isvd[m.x],3),")"),col=1,cex=0.7)
        par(oldpar)
}

############################################################
# Function for plotting two influence score vectors. #
# The figure shows the individuals having the maximum influence scores. #

plot2d.isv<- function(ISV=isv, d1=dimension, d2=dimension, mx=1){
        ds <- dim(ISV)[2]
        if (d1 > ds) stop(paste("The dimension d1 should not larger than ", ds, "!"))
        if (d2 > ds) stop(paste("The dimension d2 should not larger than ", ds, "!"))

        isvd1<- ISV[,d1]
        isvd2<- ISV[,d2]
        m.x <- which(rank(isvd1^2+isvd2^2) > length(isvd1) - mx)
        m.isv<- max(abs(isv))+10
        oldpar<- par(no.readonly=TRUE)
        on.exit(par(oldpar))
        par(cex=1.2)
        plot(isvd1, isvd2, xlim=c(-m.isv, m.isv), ylim=c(-m.isv, m.isv), xlab=paste("Influence Score Vector", d1), ylab=paste("Influence Score Vector", d2))
        text(isvd1[m.x]+8, isvd2[m.x]-8, paste(m.x),col=1,cex=0.7)
        par(oldpar)
}


############################################################
############################################################


### dataset  ####
X1<-matrix(scan("Data_LogADI.txt", what="character"), 173, byrow=TRUE)

### Input the dataset, specify chromosome and marker loci ###
isvs<- ISVs(X=X1, chromosome=3, loci=c(3,4,5))
isv <- isvs$ISV

# Plot isv #
# Indicate the number(s) d (d1, d2) for the influence score vector(s) #
plot.isv(ISV=isv, d=2, mx=3); readline()
plot2d.isv(ISV=isv, d1=1, d2=2, mx=3)
############################################################