##http://www.broadinstitute.org/collaboration/giant/images/1/15/SNP_gwas_mc_merge_nogc.tbl.uniq.gz #don't run, this is where the GWAS results originate
awk '{print $1, $2, $3, $7, $5}' ~/igss/SNP_gwas_mc_merge_nogc.tbl.uniq > ~/igss/bmi-gwas.result #bash, get select columns of the gwas file


make_pgs<-function(plink.file=NULL,gwas.file="/tmp/GWAS.result",wd="/tmp/grs/",out.name,clump=TRUE) {
    #plink.file self-explanatory, the plink file that will be used.
    #gwas.file the set of gwas results. they need to be in the right format when you feed them to make_pgs: marker name, allele 1, allele 2, p value, effect size
    #wd the working directory. need to change if using windows.
    #clump: do you want to compute a pgs based on "clumped" snps?
    #################################
    #0. preamble
    if (is.null(plink.file)) stop("need to pass plink file")
    options(stringsAsFactors=FALSE)
    tr<-list()
    getwd()->orig.dir
    system(paste("mkdir ",wd))
    setwd(wd)
    system("unlink gen.bim && unlink gen.bed && unlink gen.fam")
    ##i'm just creating symbolic links here so that i can work in one directory. not really imperative that you do something like this, but if you don't youll need to change directory stuff downstream.
    system(paste("ln -s ",plink.file,".bed ./gen.bed",sep=""))
    system(paste("ln -s ",plink.file,".bim ./gen.bim",sep=""))
    system(paste("ln -s ",plink.file,".fam ./gen.fam",sep=""))
    #################################
    #1. remove ambiguous snps from gwas files (strand issues). remember that we're only going to use SNPs which have a quickly identifiable strand.
    system(paste("awk '{ print $1, $2 $3, $4, $5}' ",gwas.file," > temp"))
    read.table("temp")->tmp
    toupper(tmp[,2])->tmp[,2]
    write.table(tmp,file="temp",quote=FALSE,row.names=FALSE,col.names=FALSE)
    #the below "ff" strings are inserted just to make life at the bash command line easier as otherwise one has too many levels of quotes to quickly parse.
    "awk '{ if ($2 == ffACff || $2 == ffAGff || $2 == ffCAff || $2 == ffCTff || $2 == ffGAff || $2== ffGTff || $2 == ffTCff || $2 == ffTGff ) print $0}' temp > GWAS.noambig"->txt
    gsub('ff','"',txt)->txt
    system(txt)
    system("awk '{print $1}' GWAS.noambig > GWAS.snps")
    #################################
    #2. clumping!
    if (clump) { #only gets run when you are clumping
        #this just adds a header to a file.
        system('echo "SNP Allele1Allele2 P W" > head.txt')
        system("cat head.txt GWAS.noambig > GWAS2.noambig")
        #Clump data in 2 rounds using plink2
        #1st clumping & extract tops snps for 2nd round
        fun<-function(i) {
            paste("plink --bfile gen --chr ",i,"  --clump GWAS2.noambig  --clump-p1 1 --clump-p2 1 --clump-r2 .5 --clump-kb 250 --out traitX",i,".round1 --silent",sep="")->cmd
            system(cmd)
            system(paste("awk '{print $3, $5}' traitX",i,".round1.clumped > traitX",i,".round2.input",sep=""))
            system(paste("awk '{print $3}' traitX",i,".round1.clumped > traitX",i,".extract2",sep=""))
            cmd
        }
        library(parallel)
        detectCores()->nw
        makeCluster(nw)->cl 
        clusterApply(cl,1:22,fun)->garbage
        garbage[[1]]->tr$clump1
        #2nd clumping & extract tops snps for profile
        fun<-function(i) {
            paste("plink  --bfile gen --chr ",i," --extract traitX",i,".extract2 --clump traitX",i,".round2.input --clump-p1 1 --clump-p2 1 --clump-r2 .2 --clump-kb 5000 --out traitX",i,".round2 --silent",sep="")->cmd
            system(cmd)
            system(paste("awk '{print $3}' traitX",i,".round2.clumped > traitX",i,".selected",sep=""))
            cmd
        }
        clusterApply(cl,1:22,fun)->garbage
        garbage[[1]]->tr$clump2
        stopCluster(cl)
        system("cat traitX1.selected traitX2.selected traitX3.selected traitX4.selected traitX5.selected traitX6.selected traitX7.selected traitX8.selected traitX9.selected traitX10.selected traitX11.selected traitX12.selected traitX13.selected traitX14.selected traitX15.selected traitX16.selected traitX17.selected traitX18.selected traitX19.selected traitX20.selected traitX21.selected traitX22.selected > traitX.selected")
    } else { #gets run if you don't clump. just making a traitX.selected file available for later work.
        system("echo 'SNP' > /tmp/head.txt")
        system("cat /tmp/head.txt GWAS.snps > traitX.selected")
    }
    #################################
    #3. Cleaning plink files
    #this is going to prune the gwas files down to just the (1) clumped & (2) non-ambig snps (the latter are those from the GWAS.snps file)
    read.table("traitX.selected",header=TRUE)->selected
    read.table(gwas.file,header=TRUE)->effects
    #this is just in case any snps are duplicated in gwas files.
    effects[!duplicated(effects[,1]),]->effects
    #first the clumped SNPs
    effects[,1] %in% selected$SNP -> index
    effects[index,]->effects
    effects[,2]<-toupper(effects[,2])
    effects[,3]<-toupper(effects[,3])
    #now the hard part: ambig snps
    #make sure strands are aligned
    bim <- read.table(file="gen.bim",header=FALSE)
    names(bim)<-c("chr","snp","a","b","a1.bim","a2.bim")
    names(effects)<-c("snp","a1.eff","a2.eff","pv","beta")
    NULL->bim$a->bim$b
    merge(bim,effects,by="snp")->test
    #table(test$a1.bim,test$a1.eff) #useful if you want to see what is going on.
    #get rid of ambig strands from bim (already yanked from gwas data)
    test$a1.bim=="T" & test$a2.bim=="A" -> i1
    test$a1.bim=="A" & test$a2.bim=="T" -> i2
    test[!(i1 | i2),]->test
    test$a1.bim=="C" & test$a2.bim=="G" -> i1
    test$a1.bim=="G" & test$a2.bim=="C" -> i2
    test[!(i1 | i2),]->test
    #now make everything a/c. this is so that we can make sense of the risk allele
    for (nm in c("a1.bim","a2.bim","a1.eff","a2.eff")) {
        ifelse(test[[nm]]=="T","A",test[[nm]])->test[[nm]]
        ifelse(test[[nm]]=="G","C",test[[nm]])->test[[nm]]
    }
    ifelse(test$a1.bim!=test$a1.eff,-1*test$beta,test$beta)->test$beta #flipping
    test[,c("snp","a1.eff","beta")]->z
    #now read the bim file back in and overwrite the old allele names
    read.table("gen.bim")->bim 
    bim[,c(2,5)]->bim
    names(bim)<-c("snp","a1.eff")
    NULL->z$a1.eff
    merge(z,bim)->z
    z[,c("snp","a1.eff","beta")]->z
    #
    write.table(z,file="score_file.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)
    nrow(z)->tr$final.n
    #################################
    #4. create score!
    setwd(orig.dir)
    system(paste("plink --bfile ",wd,"gen --score ",wd,"score_file.txt --silent --out ",out.name,sep=""))
    dump("tr",file=paste(out.name,".metadata",sep=""))
    #system(paste("rm -r ",wd))
    tr
}


setwd("~/igss/")
#first score without clumping
make_pgs(plink.file="~/igss/ceu-qc-small",gwas.file="~/igss/bmi-gwas.result",out.name="bmi-2",clump=FALSE,wd="/tmp/pgs-c2/")

#now compute a clumped score
#to do this, it will help to focus the gwas results on just those markers in our .bim file
read.table("~/igss/bmi-gwas.result",header=TRUE)->bmi.gwas
read.table("~/igss/ceu-qc-small.bim",header=FALSE)->bim
bmi.gwas[bmi.gwas[,1] %in% bim[,2],]->bmi.gwas
write.table(bmi.gwas,file="bmi-gwas-clump.result",quote=FALSE,row.names=FALSE)

setwd("~/igss/")
make_pgs(plink.file="~/igss/ceu-qc-small",gwas.file="~/igss/bmi-gwas-clump.result",out.name="bmi-2cl",clump=TRUE,wd="/tmp/pgs-c2-cl/")

read.table("~/igss/bmi.phen")->phen
phen[,c(2,3)]->phen
read.table("~/igss/bmi-2.profile")->prof
prof[,c(2,6)]->prof1
read.table("~/igss/bmi-2cl.profile")->prof
prof[,c(2,6)]->prof2
merge(phen,prof1,by=1)->tmp
merge(tmp,prof2,by=1)->tmp
tmp[,-1]->tmp
apply(tmp,2,as.numeric)->tmp
cor(tmp)


###########################################################
#For use of make_pgs later.

## #options(repos="http://cran.stat.ucla.edu/")
## #install.packages("devtools")
## #library(devtools)
## #install_github("ben-domingue/HeritHelper")
## #library(HeritHelper)
## #make_pgs(plink.file="~/igss/ceu-qc-small",gwas.file="~/igss/bmi-gwas.result",out.name="bmi-2",clump=FALSE,wd="/tmp/pgs-c2/")

#OR
## wget https://github.com/ben-domingue/HeritHelper/archive/master.zip
## unzip master.zip
## R CMD build HeritHelper-master 
## R CMD INSTALL HeritHelper_0.0.1.tar.gz