# To run this script, make sure you have the following:
#
# 1) R installed.  R is available at
#
#    http://www.r-project.org/
#
# 2) R in your execution path.
# 3) The randomForest package installed in R.  The package can be installed
#    from the package manager in R, or it is also available at whichever
#    download mirror you choose in the Packages section.
#
# Then set your current directory to the directory containing both this
# R script and all the requisite data files, and type the following on
# the command line:
#
# R --no-restore --no-save < RFPhotoZ.R
#
# The following files will be output:
#
# 1) testzPhot10k.txt containing photo-z estimates and associated sigmas
# 2) errDist10k.txt containing resulting standardized true errors
# 3) errHistVsPhi10k.eps containing a postscript plot of the standardized true
#    errors vs. a standard normal curve

library(randomForest)

computePhotoZ <- function(trainingColorsInputFilename,
                          trainingzSpecInputFilename,
                          testColorsInputFilename,
                          testzSpecInputFilename,
                          testzPhotOutputFilename)
{
  trainingColors <- read.table(trainingColorsInputFilename, header = TRUE)
  trainingzSpec <- read.table(trainingzSpecInputFilename, header = TRUE)[,1]
  testColors <- read.table(testColorsInputFilename, header = TRUE)
  testzSpec <- read.table(testzSpecInputFilename, header = TRUE)[,1]

  # CHANGE ntree BELOW TO INCREASE OR DECREASE FOREST SIZE
  rf <- randomForest(x = trainingColors, y = trainingzSpec,
                     ntree = 50, keep.forest = TRUE)
  prediction <- predict(rf, testColors, predict.all = TRUE)

  sigmas <- sqrt(rowMeans((prediction$individual - prediction$aggregate)^2))

  write.table(x = cbind(zPhot = prediction$aggregate, sigma = sigmas),
              file = testzPhotOutputFilename,
              quote = FALSE, row.names = FALSE, col.names = TRUE);

  # return value
  rmsError <- sqrt(mean((prediction$aggregate - testzSpec)^2))
}

computeAggregateErrorDistribution <- function(testzSpecInputFilename,
                                              testzPhotInputFilename,
                                              errDistOutputFilename)
{
  zSpec <- read.table(testzSpecInputFilename, header = TRUE)[,1]
  zPhot <- read.table(testzPhotInputFilename, header = TRUE)[,1]
  sigma <- read.table(testzPhotInputFilename, header = TRUE)[,2]

  write.table(x = cbind(stdErr = ((zPhot - zSpec) / sigma)),
              file = errDistOutputFilename,
              quote = FALSE, row.names = FALSE, col.names = TRUE)
}

plotHistVsPhi <- function(distInputFilename,
                          plotOutputFilename,
                          main = "", xlab = "x", ylab = "f(x)",
                          binWidth = 0.5,
                          width = 4 * 1.25, height = 3 * 1.25)
{
  x <- read.table(distInputFilename, header = TRUE)[,1]
  cnts <- c()
  ticks <- 11
  bins <- seq(-5, 5, binWidth)
  total <- 0
  for(i in bins)
  {
    cnts[sprintf("%f", i)] <-
      sum(x > (i - (binWidth / 2.0)) & x <= (i + (binWidth / 2.0))) / length(x)
    total <- total + cnts[sprintf("%f", i)]
  }

  postscript(plotOutputFilename, width = width, height = height,
             horizontal = FALSE, onefile = FALSE, paper = "special")
  plot(-50:50 / 10, dnorm(-50:50 / 10), type = "l",
       main = main, xlab = xlab, ylab = ylab)

  points(bins, cnts / binWidth)
  legend("topright", .415,
         c(expression("f(x) = " * phi * "(x)"), "f(x) = freq(x)"),
         lty = c(1, 0), pch = c(21, 21), pt.cex = c(0, 1))
  dev.off()
}

doAll <- function(trainingColorsInputFilename,
                  trainingzSpecInputFilename,
                  testColorsInputFilename,
                  testzSpecInputFilename,
                  testzPhotOutputFilename,
                  errDistOutputFilename,
                  plotOutputFilename)
{
  print("Training and regressing...")
  rmsError <- computePhotoZ(trainingColorsInputFilename,
                            trainingzSpecInputFilename,
                            testColorsInputFilename,
                            testzSpecInputFilename,
                            testzPhotOutputFilename)
  print(sprintf("Done.  Wrote zPhot to %s.  RMS error = %f",
                testzPhotOutputFilename, rmsError))

  print("Generating standardized error distribution...")
  computeAggregateErrorDistribution(testzSpecInputFilename,
                                    testzPhotOutputFilename,
                                    errDistOutputFilename)
  print(sprintf("Done.  Wrote standardized errors to %s.",
                errDistOutputFilename))

  print("Generating standardized error distribution plot...")
  plotHistVsPhi(errDistOutputFilename,
                plotOutputFilename,
                binWidth = 0.5)
  print(sprintf("Done.  Wrote plot to %s.", plotOutputFilename))
}

# CHANGE FILENAMES WITHIN QUOTES BELOW APPROPRIATELY TO LOAD OTHER DATA.
# MAKE SURE THE INPUT FILES ARE FORMATTED LIKE THE SAMPLE FILES.
doAll(trainingColorsInputFilename = "trainColors10k.txt",
      trainingzSpecInputFilename = "trainzSpec10k.txt",
      testColorsInputFilename = "testColors10k.txt",
      testzSpecInputFilename = "testzSpec10k.txt",
      testzPhotOutputFilename = "testzPhot10k.txt",
      errDistOutputFilename = "errDist10k.txt",
      plotOutputFilename = "errHistVsPhi10k.eps")