################################################################################
# MSAP_calc.r
# Functions to transform and analzyse MSAP data
# Walter Durka - October/November/December 2012     v. 1.0
#
# changes made on 26.03.2014 in MSAP_code           v. 1.1
# changes made on 05.09.2014 in MSAP_code           v. 1.2
# changes made on 07.02.2017 in MSAP_code to 
# correct errors introduced in v1.1 and 1.2         v. 1.3
################################################################################
# Input file format to be read by function Extract_MSAP_epigenotypes:
# column 1 = population ID
# column 2 = sample ID
# column 3 = restriction enzyme "H" or "M"
# columns 4 to n = 0/1 for presence/absence of marker bands
# row 1 = table head containing column names (pop, sample, HM, locusIDs)
# rows 2 to 2*N + 1 = data
# --::
# pop sample HM marker1 marker2
# 1   1      H  1       0
# 1   1      M  1       0
# 1   2      H  1       0
# 1   2      M  0       1
# 2   1      H  0       1
# 2   1      M  1       0
# 2   2      H  0       1
# 2   2      M  0       1
#
################################################################################


# Function to interpret the well sorted primary data table
MSAP_code <- function(MSAP.data, whichcode = "Mix2.unmet",whichloci,V_tag)
{                             # codes for H/M: 11 10 01 00
                              #                 2 1  -1  0
if (whichcode == "Herrera"       ) code <- c(0,1,1,NA)
if (whichcode == "Vergeer"       ) code <- c(0,1,1,NA)
if (whichcode == "Salmon"        ) code <- c(0,1,1,0)
if (whichcode == "Lira-M1"       ) code <- c(1,NA,0,0)
if (whichcode == "Lira-M2"       ) code <- c(1,0,0,0)
if (whichcode == "Paun.primary"  ) code <- c(2,1,-1,0)
if (whichcode == "Paun.unmet"    ) code <- c(1,0,0,0)
if (whichcode == "Paun.MeCpG"    ) code <- c(1,0,1,0)
if (whichcode == "Paun.hemi"     ) code <- c(1,1,0,0)
if (whichcode == "Mix1.unmet"    ) code <- c(1,0,0,0)
if (whichcode == "Mix1.meth"     ) code <- c(0,1,1,0)
if (whichcode == "Mix2.unmet"    ) code <- c(1,0,0,0)
if (whichcode == "Mix2.MeCpG"    ) code <- c(0,0,1,0)
if (whichcode == "Mix2.hemi"     ) code <- c(0,1,0,0)

# modified and then deactivated these lines as data should be sorted outside of this function! 07.02.2017
#MSAP.data <- MSAP.data[order(MSAP.data[,3]),] # sort by enzyme H-M
#MSAP.data <- MSAP.data[order(MSAP.data[,2]),] # sort by pop             
#MSAP.data <- MSAP.data[order(MSAP.data[,1]),] # sort by sampleID

o.<-MSAP.data[MSAP.data[,3]=="H",];o.<-o.[,-(3)];  # delete H/M-column
n_samples = nrow(o.);          #n_samples
n_markers = ncol(MSAP.data)-3; #n_markers
# Rename Variables: add V_tag
b <- colnames(o.)
for (i in  1: n_markers) b[i+2] <- paste(V_tag,b[i+2],sep="")
colnames(o.)<-b

for (nS in 1:n_samples){
   for (nM in 1:n_markers){
                                    o.[nS,nM+2]<-code[4]         # Paun: 0
   if (MSAP.data[nS*2-1,nM+3] == 1) o.[nS,nM+2]<-code[2]         # Paun: 1
   if (MSAP.data[nS*2,  nM+3] == 1) o.[nS,nM+2]<-code[3]         # Paun: -1
   if (MSAP.data[nS*2-1,nM+3] == 1 & MSAP.data[nS*2,nM+3] == 1)
                                    o.[nS,nM+2]<-code[1]         # Paun: 2
   }}

   # delete the loci that are not relevant########################################
   whichloci <- data.frame(whichloci)
   o.     <- data.frame(t(o.[,-c(1,2)]));
   o.$use <- whichloci[,1]
   o.     <- data.frame(t(o.[o.$use==TRUE,,drop=FALSE]));   # delete loci                       ##26.03.2014
   o.     <- o.[-c(n_samples+1),,drop=FALSE];               # delete the additional line again  ##26.03.2014
return (o.)
} # end of function MSAP_code
################################################################################


################################################################################
# count 2.1.-1,0 codes per marker   N_codes
################################################################################
Count_epigenotypes <- function (d.) {
#  d. <- d
  n_markers <- ncol(d.)-3
  loc.all <- data.frame(matrix(nrow=n_markers,ncol=1,TRUE)); colnames(loc.all)<-c("use")
  o.<- MSAP_code(d.,"Paun.primary",loc.all,"")
  N_codes.<- data.frame(matrix(nrow=n_markers,ncol=4)); #str(N_codes)
  colnames(N_codes.)<-c("c-1","c0","c1","c2");
  for (i in 1 : n_markers)
      N_codes.[i,]<-table(c(o.[,i],-1,0,1,2))-1    # count -1,0,0,2 states

  return(N_codes.)
} # end of function Count_codes
################################################################################

################################################################################
# delete monomorphic loci
################################################################################
delete.monomorphic.loci_go <- function(d.,N_leading_columns=2){
  IDs<-d.[,-(N_leading_columns+1:ncol(d.))];  # e.g. Samplename + Pop
  d. <-d.[,-(1:N_leading_columns)]
  n_marker  <-ncol(d.)
  n_samples <-nrow(d.)
  keep.loci <- data.frame(matrix(nrow=n_marker,ncol=1,TRUE)); colnames(keep.loci)<-c("keep")
  for (i in 1: n_marker)
      if (length(table(d.[i],useNA="always")) == 2) keep.loci[i,1] <- FALSE # if only 2 types are present, inlcuding "NA", then marker is monomorphic

  dt<-data.frame(t(d.));
  dt$use <-  keep.loci$keep                    # add line with information
  dt <- data.frame(t(dt[dt$use==TRUE,]));      # delete loci
  d. <- dt[-c(n_samples +1),,drop=FALSE];      # delete the additional line again
  return(cbind(IDs,d.))
 }# End of  delete.monomorphic.loci

################################################################################
# read data and extract epigenotypes according to various codes
################################################################################
Extract_MSAP_epigenotypes <- function(inputfile="MSAP_data.txt",Epicode = "Mix1",
                                      outputfile="MSAP_out.txt",MinPoly=1,
                                      delete.monomorphic.loci=TRUE) {
# READ primary data ############################################################
# expected input format:
# column 1 = population
# column 2 = sample ID
# column 3 = restriction enzyme "H" or "M"
# columns 4 to n = 0/1 for presence/absence of bands

d<-read.table(inputfile, header=T);# str(d);names(d); d[1:10,1:10]

d <- d[order(d[,3]),] # sort by enzyme H-M
d <- d[order(d[,2]),] # sort by sampleID
d <- d[order(d[,1]),] # sort by pop     # necessary if sample-IDs are repeated within pops    ## 05.09.2014
n_samples = nrow(d)/2;#n_samples
n_markers = ncol(d)-3;#n_markers

sampleIDs <- d[d[,3]=="H",];sampleIDs<-sampleIDs[,-(3:ncol(d))];#str(o.)  # Samplename + Pop
sampleIDs2<-d[,-(4:ncol(d))];  # Samplename + Pop  + H/M

#sampleIDs
################################################################################

################################################################################
# delete loci from primary data table with less than MinPoly polymorphisms
################################################################################

  N_codes <- Count_epigenotypes(d);#N_codes
  keep.loci <- data.frame(matrix(nrow=n_markers+3,ncol=1,TRUE)); colnames(keep.loci)<-c("keep")
  for (i in 1: n_markers) if ( max(N_codes[i,]) > n_samples-MinPoly) keep.loci[i,1] <- FALSE
  dt<-data.frame(t(d[,-c(1,2,3)]));
  for (i in 1: n_markers) if (keep.loci[i,1]) dt$use[i] <- 1 else dt$use[i] <-0
  dt <- data.frame(t(dt[dt$use==1,]));            # delete loci
  dt <- dt[-c(2*n_samples+1),];                   # delete the additional line again
  d<- cbind(sampleIDs2,dt)
  n_markers = ncol(d)-3;#n_markers                 # redefine n_markers
  N_codes <- Count_epigenotypes(d);#N_codes

################################################################################
# Define the different types of methylation-loci for each fragment als Boolean per locus
################################################################################
loci <- data.frame(matrix(nrow=n_markers,ncol=12));
colnames(loci)<-c("all","Herrera.epi","Paun.unmet","Paun.MeCpG","Paun.hemi",
                  "Lira_M1","Lira_M2",
                  "Mix1.unmet","Mix1.meth",
                  "Mix2.unmet","Mix2.MeCpG","Mix2.hemi");

for (i in 1 : n_markers){
loci$all <- TRUE

# Herrera: 5% error threshold
if ((N_codes[i,1] + N_codes[i,3])/sum(N_codes[i,]) > 0.05)
                                                   loci$Herrera.epi[i]<-TRUE else loci$Herrera.epi[i]<- FALSE

if (N_codes[i,4] > 0 & N_codes[i,4] < n_samples)   loci$Paun.unmet[i]<- TRUE else loci$Paun.unmet[i]<- FALSE
if (N_codes[i,1] >0
  & N_codes[i,1]+ N_codes[i,4] < n_samples)        loci$Paun.MeCpG[i]<- TRUE else loci$Paun.MeCpG[i]<- FALSE

if (N_codes[i,3] > 0
  & N_codes[i,3]+ N_codes[i,4] < n_samples)        loci$Paun.hemi[i]<-  TRUE else loci$Paun.hemi[i]<- FALSE

if (N_codes[i,3] > 0 | N_codes[i,4] == 0)          loci$Lira_M1[i]<- FALSE else loci$Lira_M1[i]<- TRUE
if (N_codes[i,4] == 0)                             loci$Lira_M2[i]<- FALSE else loci$Lira_M2[i]<- TRUE

if (N_codes[i,4] == 0)                             loci$Mix1.unmet[i]<- FALSE else loci$Mix1.unmet[i]<- TRUE
if (N_codes[i,1] == 0 & N_codes[i,3] == 0)         loci$Mix1.meth[i]<-  FALSE else loci$Mix1.meth[i]<- TRUE

if (N_codes[i,4] == 0)                             loci$Mix2.unmet[i]<- FALSE else loci$Mix2.unmet[i]<- TRUE
if (N_codes[i,1] == 0)                             loci$Mix2.MeCpG[i]<- FALSE else loci$Mix2.MeCpG[i]<- TRUE
if (N_codes[i,3] == 0)                             loci$Mix2.hemi[i]<-  FALSE else loci$Mix2.hemi[i] <- TRUE
} # end of i in 1:n_markers
###############################################################################

###############################################################################
# get tranfsormed matrices
# Paun primary   (2,1,-1,0)
if (Epicode == "Paun.0") { out <- MSAP_code(d,"Paun.primary", loci$all,"") }

# Paun
if (Epicode == "Paun") {
o2 <- MSAP_code(d,"Paun.unmet",loci$Paun.unmet,"u");
o3 <- MSAP_code(d,"Paun.MeCpG",loci$Paun.MeCpG,"m");
o4 <- MSAP_code(d,"Paun.hemi", loci$Paun.hemi,"h");
out <- cbind(sampleIDs,o2,o3,o4)}

# Mixed1
if (Epicode == "Mix1") {
o2 <- MSAP_code(d,"Mix1.unmet",loci$Mix1.unmet,"u");
o3 <- MSAP_code(d,"Mix1.meth",loci$Mix1.meth,"M");
out <- cbind(sampleIDs,o2,o3)}

# Mixed2
if (Epicode == "Mix2") {
o2 <- MSAP_code(d,"Mix2.unmet",loci$Mix2.unmet,"u")
o3 <- MSAP_code(d,"Mix2.MeCpG",loci$Mix2.MeCpG,"m")
o4 <- MSAP_code(d,"Mix2.hemi",loci$Mix2.hemi,"h")
out <- cbind(sampleIDs,o2,o3,o4)}

# Herrera
# assumign 5% error rate
if (Epicode == "Herrera") {
o2 <- MSAP_code(d,"Herrera",loci$Herrera.epi,"e");
out <- cbind(sampleIDs,o2)
str(out)
}

# Vergeer
if (Epicode == "Vergeer") {
o2 <- MSAP_code(d,"Vergeer",loci$all,"e");
out <- cbind(sampleIDs,o2)}

# Salmon
if (Epicode == "Salmon") {
o2 <- MSAP_code(d,"Salmon",loci$all,"e");
out <- cbind(sampleIDs,o2)}

# Lira-Medeiros
if (Epicode == "Lira-M1") {
o2 <- MSAP_code(d,"Lira-M1",loci$Lira_M1,"e");
out <- cbind(sampleIDs,o2)}

# Lira-Medeiros + HpaII
if (Epicode == "Lira-M2") {
o2 <- MSAP_code(d,"Lira-M2",loci$Lira_M2,"e");
out <- cbind(sampleIDs,o2)
}

if (delete.monomorphic.loci) out <-delete.monomorphic.loci_go(out,2)
  #str(out)

# Export formatted files
if (outputfile != "") write.table(out,outputfile,sep="\t",quote=F,row.names=F)
return(out)
} # End of Extract_MSAP_epigenotypes
################################################################################

################################################################################
descriptive_parameters <- function(inputfile="MSAP_Mix2.txt",outputfile="MSAP_out.txt",AppendOutput=F){
# inputfile is a file produced by Extract_MSAP_epigenotypes :
# column 1 = population ID
# column 2 = sample ID
# columns 3 to n = 0/1 for presence/absence of epiloci

################################################################################
 do_it_des_par<- function(d.)
{
 nonNAs<- function(x) {length(which(!is.na(x)))}

# prescript<-"x"#++++++++++++++++++++++++++++++++++++++++++++++++++
  n_pops <- length(levels(as.factor(d.[,1])));
  n_markers <- ncol(d.)-2
  n_samples <- nrow(d.)
  d1 <- data.frame(d.[,c(1)]); d1[,2]<-1; colnames(d1) <- c("pop","count");
  d. <-cbind(d1,d.[,-c(1:2)]);
  dagg   <-aggregate(d.[,-1], by=list(d.$pop),  FUN=sum, na.rm=TRUE ); # count number of "1"
  daggNA <-aggregate(d.[,-1], by=list(d.$pop),  FUN=nonNAs); #  count number of nonNA data

  param <- data.frame(matrix(nrow=n_pops,ncol=9,0));
  colnames(param)<-c("PopID",
                     "N_samples",
                     "N_markers_total",
                     "N_markers_pop",
                     "N_markers_poly",
                     "Pc_markers_poly",
                     "Mean_N_1scores",
                     "N_private_markers",
                     "Shannon_diversity");

  param$PopID                <- dagg[,1]
  param$N_samples            <- dagg[,2]  # n_samples per pop

  param$N_markers_total      <- n_markers # n_markers

  ########################################
  # Count the number of markers present per pop
  markerpresent <- data.frame(matrix(nrow=n_markers,ncol=n_pops,0));
  for (nP in 1:n_pops){
    for (nM in 1:n_markers) if (dagg[nP,nM+2]>0) markerpresent[nM,nP] <-1 }

  # Number polymorphic markers
  poly <- data.frame(matrix(nrow=n_markers,ncol=n_pops,0));
  for (nP in 1:n_pops){
    for (nM in 1:n_markers)
         if (dagg[nP,nM+2]>0 & dagg[nP,nM+2] < daggNA[nP,nM+2]) poly[nM,nP] <-1
    }
  param$N_markers_poly       <- round(colSums(poly),digits=0)

  # Percentage polymorphic markers
  param$Pc_markers_poly      <- round(colSums(poly)/n_markers*100,digits=2)

  # Number of markers present
  param$N_markers_pop        <- round(colMeans(markerpresent)*nrow(markerpresent),digits=0)

  #########################################
  # Mean number of markers per pop
  param$Mean_N_1scores       <- round(rowMeans(dagg[,-c(1:2),drop=FALSE]),digits=2)

  #########################################
  # Number of private markers per pop
  markerpresent$priv <- (rowSums(markerpresent)==1 )
  private <- data.frame(matrix(nrow=n_markers,ncol=n_pops,0));
  for (nP in 1:n_pops){
    for (nM in 1:n_markers)
         if (markerpresent$priv[nM] & markerpresent[nM,nP]>0) private[nM,nP]<-1
    }
  param$N_private_markers    <- round(colSums(private),digits=0)

  #####################################
  # Shannon  index of phenotypic diversity S = -SUM pi * log2(pi)
  # For a dominant locus: S = - (pi *log2(pi) +  (1-pi)*log2(1-pi)) ; with pi = marker frequency per pop
  # Mean across loci
  Shannon <- data.frame(matrix(nrow=n_markers,ncol=n_pops,0));

  for (nP in 1:n_pops){
    for (nM in 1:n_markers)
        { pi <- dagg[nP,nM+2]/daggNA[nP,nM+2]#dagg[nP,2]
          if (pi>0 & pi<1)  Shannon[nM,nP] <- -1 * (pi*log2(pi) + (1-pi) * log2(1-pi))}
    }
  param$Shannon_diversity   <- round(colMeans(Shannon),digits=4);#parameters   # Mean across pops
 return(param)
 } # End of do_it_des_par
################################################################################
# inputfile="MSAP_Mix1.txt"
# inputfile="MSAP_Herrera.txt"

 d.<-read.table(inputfile, header=T);
 d.IDs <- d.[,c(1:2)]
 d.1 <- d.[,-c(1:2)]
 d.1 <- d.1[,order(names(d.1))];
 loci_tab <- table(substr(colnames(d.1),1,1));#loci_tab
 N_epiloci_types <- length(loci_tab)

 loci_tab <- c(1,loci_tab);#loci_tab

 F_para <- do_it_des_par(cbind(d.IDs,d.1))

 if (N_epiloci_types > 1){
   v1<-0;v2<-0;
   for (epitype in 1:N_epiloci_types) {
     v1 <- v1 + loci_tab[epitype];
     v2 <- v2 + loci_tab[epitype+1];
     d.t <-d.1[c(v1:v2)];
     para <- do_it_des_par(cbind(d.IDs,d.t))
     prescript <- substr(colnames(d.1[v1]),1,1) ;
     b <- colnames(para);
     for (i in  3:9) b[i] <- paste(prescript,b[i],sep="_")
     colnames(para)<-b;
     F_para <- cbind(F_para, para[,-c(1:2)])
   } # End of for (epitype in
 }# End of if (N_epiloci_types > 1)
 sink(outputfile,append=AppendOutput);
 cat("Descriptive parameters of populations calculated from ",inputfile,"\n");
 sink()
 if (outputfile != "") write.table(F_para,outputfile,sep="\t",quote=F,row.names=F,append=T)

 return(F_para)
} # End of function descriptive_parameters
##################################################################################


