source("Experiments.Util.r")
library(readMLData)
library(dplyr)
library(tidyr)
library(PMCMR)
library(scmamp)

# Decisions:
# - propositionalization parameters are selected on the best model for each classifier
# -- classifier parameters are tuned under the Uniform weighting, using 5x2 cv
# -- propositionalization parameters are selected per measure, per model
# -- a holdout set serves as the test set for evaluating the results
# -- finally, a Friedman test split by algorithm (+ possibly without the split)

# download external datasets (requires wget installed on the OS)
pathData <- "../datasets/UCI_ML_DataFolders/"
pathDescription <- "../datasets/UCI_ML_DataDescription"
dsList <- prepareDSList(pathData, pathDescription)
selectedDatasets <- c("cmc", "credit-screening", "dermatology",
                      "ecoli", "glass", "haberman", "hepatitis",
                      "house-votes-84", "spect-SPECT", "vowel-context",
                      "wine", "yeast", "zoo", "breast-cancer-wisconsin",
                      "car-evaluation", "image-segmentation",
                      "heart-disease-cleveland", "ionosphere",
                      "iris", "monks-1", "monks-2", "monks-3",
                      "pima-indians-diabetes", "tae", "tic-tac-toe",
                      "promoters", "balance-scale", "sonar.all",
                      "statlog-australian", "statlog-heart"
                      )

for (id in c(selectedDatasets)) {
    dsDownload(dsList, id, "wget", "links.txt")
}
dsList <- prepareDSList(pathData, pathDescription)


# experiment parameters
datasets = data.frame(name=selectedDatasets, class=rep("Class", length(selectedDatasets)),
                      type=rep("UCI", length(selectedDatasets)), stringsAsFactors = F)
classifiers = c("wkNN")#, "wNB", "wkNN", "rpart", "multinom", "pda", "gbm", "bayesglm")
proposParameters = data.frame(k = c(1:10, 20, 0.01, 0.05, 0.1, 0.15, 0.2),
                                      threshold = c(rep(NA, 16)))
resultsFile = "../results/evaluation_wknn2.csv"
allowParallel = FALSE


# # test parameters
# classifiers = c("rpart")
# proposParameters = data.frame(k = c(1:10, 20, 0.01, 0.05, 0.1, 0.15, 0.2), threshold = c(rep(NA, 16)))
# selectedDatasets <- c("cmc")
# datasets = data.frame(name=selectedDatasets, class=rep("Class", length(selectedDatasets)),
#                       type=rep("UCI", length(selectedDatasets)), stringsAsFactors = F)
# allowParallel = FALSE

################################################
# Run experiment
################################################
try({
    removePreviousResults(file = resultsFile)
    removePreviousResults(file = paste0(resultsFile, "_prop.csv"))
    removePreviousResults(file = paste0(resultsFile, "_datasets.csv"))
    summarizeDatasets(datasets, file = paste0(resultsFile, "_datasets.csv"))
    evaluatePropositionalization(datasets, classifiers, proposParameters,
                                 resultsFile, verbose = T, dsList = dsList,
                                 allowParallel = allowParallel)
}) 

################################################
# Freidman: Uniform vs Random vs Best measure
################################################
allResultsFile <- paste0(resultsFile, "_prop.csv")
allResults <- read.csv(allResultsFile) %>%
    mutate(Parameters = as.factor(Parameters), Weight = as.factor(Weight)) %>%
    group_by(Dataset, Classifier, Weight, Parameters) %>%
    summarise(MeanKappa = mean(Kappa)) %>%
    separate(Parameters, c("tmp_0", "k", "tmp_1", "tmp_2", "threshold"), sep="[ =,]", remove=F) %>%
    mutate(k=as.numeric(k), threshold=as.numeric(threshold)) %>%
    select(-tmp_0, -tmp_1, -tmp_2, -threshold, -Parameters) %>%
    group_by(Dataset, Classifier) %>%
    top_n(1, MeanKappa) %>%
    select(-MeanKappa) %>%
    slice(which.max(k)) %>%
    as.data.frame()

################################################
# Freidman: Uniform vs Random vs each measure
################################################
results <- read.csv(resultsFile) %>% 
    dplyr::select(Dataset, Classifier, Weight, Kappa) %>%
    filter(Weight != "distance") %>%
    # full_join(allResults, by = c("Dataset" = "Dataset", "Classifier" = "Classifier", "Weight" = "Weight")) %>%
    # filter(Weight %in% c("uniform", "random", "distance") | !is.na(k)) %>%
    # dplyr::select(-k) %>%
    # mutate(Weight = ifelse(Weight == "uniform", "uniform", ifelse(Weight == "random", "random", ifelse(Weight == "distance", "distance", "prop")))) %>%
    spread(Weight, Kappa) %>%
    ungroup() %>%
    dplyr::select(-Dataset, -Classifier) %>%
    data.matrix()
results <- t(apply(results, 1, rank))
results <- results[, order(colMeans(results, na.rm=TRUE))]
print(colMeans(results, na.rm=TRUE))
boxplot(results)
friedman.test(results)
posthoc.friedman.nemenyi.test(results)
setEPS()
postscript(paste0(resultsFile, ".eps"), width = 7, height = 3.4)
plotCD(results.matrix = results, alpha = 0.05)
dev.off()
