source("Propositionalize.r")
source("KNN.r")
source("wNB.r")
library(dplyr)
library(caret)
library(rpart)
library(doParallel)
library(flock) # install.packages("flock", repos="http://R-Forge.R-project.org")

getReproducibleSeeds <- function(seed, folds, repeats, tuneLength) {
    set.seed(seed)
    resamples <- folds*repeats
    seeds <- vector(mode = "list", length = resamples + 1)
    
    for(i in 1:resamples) {
        seeds[[i]] <- sample.int(1000, tuneLength)
    }
    
    seeds[[resamples+1]] <- sample.int(1000, 1)
    seeds
}

removePreviousResults <- function(file){
    if(file.exists(file)){
        invisible(file.remove(file))
    }
}

saveResultsToFile <- function(dataset.name, model, fitResults, weights.name, parameters, file) {
    if (length(parameters) == 0) {
        parameters = ""
    } else {
        parameters = paste(names(parameters), parameters, sep="=", collapse = ", ")
    }
    
    results = data.frame(Dataset=dataset.name, Classifier=model$modelInfo$label,
                         Library=ifelse(is.null(model$modelInfo$library), "custom", model$modelInfo$library),
                         Weight=weights.name, Parameters=parameters)
    results = merge(results, fitResults[(ncol(fitResults)-1):ncol(fitResults)])
    
    ll = lock(paste(file, ".lock", sep=""))
    if (file.exists(file)){
        write.table(results, file, append = T, sep = ",", col.names = F, row.names = F)
    } else {
        write.table(results, file, sep = ",", row.names = F)
    }
    unlock(ll)
}

printResults <- function(dataset.name, model, results, weights.name, parameters){
    cat(paste(dataset.name, ": ", ifelse(is.null(model$modelInfo$label), model, model$modelInfo$label), sep=""))
    cat(paste(" with", weights.name, "weights\n"))
    cat(paste("Parameters: ", paste(names(parameters), parameters, sep="=", collapse = ", "), "\n\n", sep=""))
    cat("Results:\n")
    print(results)
    cat("\n")
}

getDataset <- function(name, type, dsList, responseName="Class",
                       pathData="../datasets/UCI_ML_DataFolders/",
                       pathDescription="../datasets//UCI_ML_DataDescription") {
    if (type == "package") {
        data(list=name)   
        get(name)
    } else if (type == "UCI") {
        if (is.null(dsList)){
            dsList = prepareDSList(pathData, pathDescription)
        }
        dsRead(dsList, name, responseName)
    } else {
        stop(paste("Unknown dataset type: ", type))
    }
}

summarizeDatasets <- function(datasets, file) {
    Name=character()
    Examples=integer()
    Attributes=integer()
    Classes=integer()
    
    for (d in 1:nrow(datasets)) {
        dataInfo = datasets[d,]
        dataset = getDataset(dataInfo$name, dataInfo$type, dsList)
        classAttr = ifelse(is.na(dataInfo$class), names(df)[-1], dataInfo$class)
        
        Name[d] = dataInfo$name
        Examples[d] = nrow(dataset)
        Attributes[d] = ncol(dataset) - 1
        Classes[d] = length(unique(dataset[,classAttr]))
    }
    
    summary <- data.frame(Name, Examples, Attributes, Classes)
    write.table(summary, file, sep = ",", row.names = F)
}

evaluateClassifier <- function(train, test, classAttr, selectedModel, seed = NA, weights = NULL) {
    set.seed(seed)
    
    fitControl <- trainControl(method = "none", allowParallel = F)
    
    model = selectedModel$modelInfo
    tunedParams = selectedModel$bestTune
    # additionalParams = selectedModel$dots[[1]]
    
    fit = train(train[, names(train) != classAttr],
                as.character(train[, classAttr]),
                model,
                weights = weights,
                trControl = fitControl,
                tuneGrid = tunedParams)
    
    testPred = predict(fit, test[, names(test) != classAttr])
    evalResults = postResample(testPred, test[, classAttr])
    
    performance = as.data.frame(t(evalResults))
    performance = cbind(tunedParams, performance)
    performance
}

evaluateWeights <- function(train, test, classAttr, model, weights,
                            datasetName, resultsFile, parameters=NULL,
                            seed = NA, verbose=TRUE){
    for (weight in names(weights)) {
        tryCatch({
            results = evaluateClassifier(train, test, classAttr, model, weights = weights[, weight], seed = seed)
            if (verbose) printResults(datasetName, model, results, weight, parameters)
            saveResultsToFile(datasetName, model, results, weight, parameters, resultsFile)
        },
        error = function(err) {
            print(paste("Trouble evaluating", model, "on", datasetName, "with", weight, "weights.", err))
        })
    }
}

selectBestModel <- function(train, classAttr, classifier, datasetName, repeatNum, foldNum, seed) {
    cat(paste("Selecting best model for", classifier, "\n"))
    set.seed(seed)
    
    tuneGrid = NULL
    tuneLength = 10
    classifierParams = NULL
    performance = NULL
    
    if (classifier == "wkNN") {
        classifier = wkNN
    }
    else if (classifier == "wNB") {
        classifier = wNB
        tuneLength = 1
    } else {
        if (classifier == "gbm") {
            tuneGrid = rbind(expand.grid(n.trees = c(100), interaction.depth = c(2, 5, 10), 
                                         shrinkage = 0.001, n.minobsinnode = c(2, 5, 10)),
                             data.frame(n.trees = 10, interaction.depth = 5, 
                                        shrinkage = 0.001, n.minobsinnode = 2))
            classifierParams = list(verbose = FALSE)
        }
        classifier = getModelInfo(classifier, regex = FALSE)[[1]]
    }
    
    if (is.null(tuneGrid)) {
        tuneGrid = classifier$grid(x = train[, names(train) != classAttr],
                                   y = as.factor(train[, classAttr]),
                                   len = tuneLength,
                                   search = "random")
    }
    
    fitControl <- trainControl(method = "repeatedcv",
                               number = foldNum,
                               repeats = repeatNum,
                               allowParallel = F)
    
    fit = train(x = train[, names(train) != classAttr],
                y = as.factor(train[, classAttr]),
                method = classifier,
                preProcess = NULL,
                classifierParams,
                trControl = fitControl,
                tuneGrid = tuneGrid)
    
    fit
}

getBestPropParams <- function(parameterSelectionFile, dataset, classifier, folds=2, reps=5) {
    results <- read.csv(parameterSelectionFile) %>%
        mutate(Parameters = as.factor(Parameters)) %>%
        filter(Dataset == dataset, Classifier == classifier) %>%
        group_by(Dataset, Classifier, Library, Weight, Parameters) %>%
        summarise(MeanKappa=mean(Kappa), Count=n()) %>%
        filter(Count == folds*reps) %>%
        separate(Parameters, c("tmp_0", "k", "tmp_1", "tmp_2", "threshold"), sep="[ =,]", remove=F) %>%
        mutate(k=as.numeric(k), threshold=as.numeric(threshold)) %>%
        select(-tmp_0, -tmp_1, -tmp_2) %>%
        top_n(1, MeanKappa) %>%
        top_n(-1, Parameters) %>%
        as.data.frame()
    
    results
}

evaluatePropositionalization <- function(datasets, classifiers, params, resultsFile,
                                         verbose, dsList, allowParallel, repeatNum=5,
                                         foldNum=2) {
    if (allowParallel) {
        cl <- makeCluster(detectCores(), type='PSOCK', outfile="")
        registerDoParallel(cl)
    }
    
    # foreach(d=1:nrow(datasets), .export=ls(envir=globalenv()),
    # .packages=c("dplyr", "caret", "infotheo", "igraph", "arules",
    #            "discretization", "readMLData", "flock", "hash")) %dopar% {
    for (d in 1:nrow(datasets)) {
        tryCatch({
            set.seed(23)
            seed = 123
            
            dataInfo = datasets[d,]
            dataset = getDataset(dataInfo$name, dataInfo$type, dsList)
            classAttr = ifelse(is.na(dataInfo$class), names(df)[-1], dataInfo$class)
            inTraining = createDataPartition(as.factor(dataset[, classAttr]), p=0.5, list=F)
            train = dataset[inTraining, ]
            test = dataset[-inTraining, ]
            
            cat("-------------------------------------------------------------------\n")
            cat(paste("Evaluating", paste(classifiers, collapse = ", "), "on the", dataInfo$name, "dataset\n"))
            cat(paste("Start", Sys.time(), "\n\n"))
            
            models = list()
            
            # Find best model for each classifier
            for (classifier in classifiers) {
                tryCatch({
                    model = selectBestModel(train, classAttr, classifier, dataInfo$name, repeatNum, foldNum, seed)
                    
                    weights = data.frame(uniform = rep(1, nrow(train)),
                                         #random = runif(nrow(train), 0.0000001, 1),
                                         distance = "distance")
                    evaluateWeights(train, test, classAttr, model, weights, dataInfo$name,
                                    resultsFile, NULL, seed, verbose)
                    
                    models[[length(models)+1]] <- model
                },
                error = function(err) {
                    print(paste("Trouble with classifier", classifier, "on dataset", dataInfo$name, ":", err))
                })
            }
            
            validation <- models[[1]]$control$index
            
            # Calculate different propositionalizations for each model
            for (pIdx in 1:nrow(params)) {
                for (fold in validation) {
                    tryCatch({
                        weights = Propositionalize(train[fold, ], classAttr, k = params[pIdx, "k"], threshold = params[pIdx, "threshold"])

                        for (mIdx in length(models)) {
                            evaluateWeights(train[fold, ], train[-fold, ], classAttr, models[[mIdx]], weights, dataInfo$name,
                                            paste0(resultsFile, "_prop.csv"), params[pIdx,], seed, verbose)
                        }
                    },
                    error = function(err) {
                        print(paste("Trouble with propositionalization on dataset", dataInfo$name, ":", err))
                    })
                }
            }

            # Evaluate best propositionalization for each model
            for (mIdx in length(models)) {
                bestParams <- getBestPropParams(paste0(resultsFile, "_prop.csv"), dataInfo$name, models[[mIdx]]$modelInfo$label)

                for (pIdx in 1:nrow(bestParams))  {
                    weights = Propositionalize(train, classAttr, k = bestParams[pIdx, "k"],
                                               threshold = bestParams[pIdx, "threshold"],
                                               selectedWeights = bestParams[pIdx, "Weight"])
                    evaluateWeights(train, test, classAttr, models[[mIdx]], weights, dataInfo$name,
                                    resultsFile, bestParams[pIdx, c("k", "threshold")], seed, verbose)
                }
            }
            
            cat(paste("Stop", Sys.time(), "\n"))
            cat("-------------------------------------------------------------------\n")}, 
            error = function(err) {
                print(paste("Trouble with preparing dataset", dataInfo$name, ":", err))
            })
    }
    
    if (allowParallel) {
        stopCluster(cl)
        closeAllConnections()
    }
}
