# To run this script, make sure you have the following: # # 1) R installed. R is available at # # http://www.r-project.org/ # # 2) R in your execution path. # 3) The randomForest package installed in R. The package can be installed # from the package manager in R, or it is also available at whichever # download mirror you choose in the Packages section. # # Then set your current directory to the directory containing both this # R script and all the requisite data files, and type the following on # the command line: # # R --no-restore --no-save < RFPhotoZ.R # # The following files will be output: # # 1) testzPhot10k.txt containing photo-z estimates and associated sigmas # 2) errDist10k.txt containing resulting standardized true errors # 3) errHistVsPhi10k.eps containing a postscript plot of the standardized true # errors vs. a standard normal curve library(randomForest) computePhotoZ <- function(trainingColorsInputFilename, trainingzSpecInputFilename, testColorsInputFilename, testzSpecInputFilename, testzPhotOutputFilename) { trainingColors <- read.table(trainingColorsInputFilename, header = TRUE) trainingzSpec <- read.table(trainingzSpecInputFilename, header = TRUE)[,1] testColors <- read.table(testColorsInputFilename, header = TRUE) testzSpec <- read.table(testzSpecInputFilename, header = TRUE)[,1] # CHANGE ntree BELOW TO INCREASE OR DECREASE FOREST SIZE rf <- randomForest(x = trainingColors, y = trainingzSpec, ntree = 50, keep.forest = TRUE) prediction <- predict(rf, testColors, predict.all = TRUE) sigmas <- sqrt(rowMeans((prediction$individual - prediction$aggregate)^2)) write.table(x = cbind(zPhot = prediction$aggregate, sigma = sigmas), file = testzPhotOutputFilename, quote = FALSE, row.names = FALSE, col.names = TRUE); # return value rmsError <- sqrt(mean((prediction$aggregate - testzSpec)^2)) } computeAggregateErrorDistribution <- function(testzSpecInputFilename, testzPhotInputFilename, errDistOutputFilename) { zSpec <- read.table(testzSpecInputFilename, header = TRUE)[,1] zPhot <- read.table(testzPhotInputFilename, header = TRUE)[,1] sigma <- read.table(testzPhotInputFilename, header = TRUE)[,2] write.table(x = cbind(stdErr = ((zPhot - zSpec) / sigma)), file = errDistOutputFilename, quote = FALSE, row.names = FALSE, col.names = TRUE) } plotHistVsPhi <- function(distInputFilename, plotOutputFilename, main = "", xlab = "x", ylab = "f(x)", binWidth = 0.5, width = 4 * 1.25, height = 3 * 1.25) { x <- read.table(distInputFilename, header = TRUE)[,1] cnts <- c() ticks <- 11 bins <- seq(-5, 5, binWidth) total <- 0 for(i in bins) { cnts[sprintf("%f", i)] <- sum(x > (i - (binWidth / 2.0)) & x <= (i + (binWidth / 2.0))) / length(x) total <- total + cnts[sprintf("%f", i)] } postscript(plotOutputFilename, width = width, height = height, horizontal = FALSE, onefile = FALSE, paper = "special") plot(-50:50 / 10, dnorm(-50:50 / 10), type = "l", main = main, xlab = xlab, ylab = ylab) points(bins, cnts / binWidth) legend("topright", .415, c(expression("f(x) = " * phi * "(x)"), "f(x) = freq(x)"), lty = c(1, 0), pch = c(21, 21), pt.cex = c(0, 1)) dev.off() } doAll <- function(trainingColorsInputFilename, trainingzSpecInputFilename, testColorsInputFilename, testzSpecInputFilename, testzPhotOutputFilename, errDistOutputFilename, plotOutputFilename) { print("Training and regressing...") rmsError <- computePhotoZ(trainingColorsInputFilename, trainingzSpecInputFilename, testColorsInputFilename, testzSpecInputFilename, testzPhotOutputFilename) print(sprintf("Done. Wrote zPhot to %s. RMS error = %f", testzPhotOutputFilename, rmsError)) print("Generating standardized error distribution...") computeAggregateErrorDistribution(testzSpecInputFilename, testzPhotOutputFilename, errDistOutputFilename) print(sprintf("Done. Wrote standardized errors to %s.", errDistOutputFilename)) print("Generating standardized error distribution plot...") plotHistVsPhi(errDistOutputFilename, plotOutputFilename, binWidth = 0.5) print(sprintf("Done. Wrote plot to %s.", plotOutputFilename)) } # CHANGE FILENAMES WITHIN QUOTES BELOW APPROPRIATELY TO LOAD OTHER DATA. # MAKE SURE THE INPUT FILES ARE FORMATTED LIKE THE SAMPLE FILES. doAll(trainingColorsInputFilename = "trainColors10k.txt", trainingzSpecInputFilename = "trainzSpec10k.txt", testColorsInputFilename = "testColors10k.txt", testzSpecInputFilename = "testzSpec10k.txt", testzPhotOutputFilename = "testzPhot10k.txt", errDistOutputFilename = "errDist10k.txt", plotOutputFilename = "errHistVsPhi10k.eps")