#include <R_ext/RS.h>
#include <R_ext/Boolean.h>
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
#include <R_ext/Rdynload.h>
#include <sys/time.h>


#include <stdbool.h>
#include <math.h>
#include "CCubes.h"

#ifdef _OPENMP
    #undef match
    #include <omp.h>
#endif

#include "real.h"
#include "cl_setup.h"

#include "clccubes.h"

#include "config.h"
#include "logging.h"


SEXP CCubes(SEXP tt) {

    // $ export BITS_PER_WORD=32 in the Terminal, before running R

    // 32 bits per word, in bit shifting representation
    char *bits_per_word = getenv("BITS_PER_WORD");  // Read from the PATH
    int BITS_PER_WORD = bits_per_word ? atoi(bits_per_word) : 32;
    if (BITS_PER_WORD != 8 && BITS_PER_WORD != 16 && BITS_PER_WORD != 32 && BITS_PER_WORD != 64) {
        BITS_PER_WORD = 32;  // Default to 32
    }

    // $ export PRINT_INFO=1 in the Terminal, before running R
    char *print_info = getenv("PRINT_INFO");  // Read from the PATH
    Rboolean PRINT_INFO = print_info && print_info[0] == '1';

    int multiplier = 0;
    struct timeval start, end;
    double elapsed_time;

    config_set_int("log", LOG_LEVEL_WARN);
    config_set_int("log:clccubes", LOG_LEVEL_WARN);
    config_set_int("log:ccubes", LOG_LEVEL_DEBUG);
    config_set_int("log:cl", LOG_LEVEL_DEBUG);

    char log[100] = {0};
    sprintf(log, "log-ccubes");
    config_set_string("out", log);


    if (PRINT_INFO) {
        Rprintf("--- START ---\n");
        gettimeofday(&start, NULL); // Start time
    }

    int *p_tt = INTEGER(tt);
    int ttrows = nrows(tt); // number of rows in the data matrix
    int ninputs = ncols(tt) - 1; // number of inputs (columns - 1, the last one is the outcome)

    // calculate the number of positive output rows (the ON set)
    int posrows = 0;
    for (int r = 0; r < ttrows; r++) {
        posrows += p_tt[ninputs * ttrows + r];
    }

    // calculate the number of negative output rows (the OFF set)
    int negrows = ttrows - posrows;

    if (negrows == 0) {
        // if there are no negative output rows, no PIs can be found
        // all inputs will be completely minimized
        return(R_NilValue);
    }

    // split the minterms in the ON and OFF set matrices
    int ON_set[posrows * ninputs];
    int OFF_set[ninputs * negrows];
    int rowpos = 0, rowneg = 0;
    int max_value = 0;

    for (int r = 0; r < ttrows; r++) {
        if (p_tt[ninputs * ttrows + r] == 1) { // positive
            for (int c = 0; c < ninputs; c++) {
                int value = p_tt[c * ttrows + r];
                ON_set[c * posrows + rowpos] = value;
                if (value > max_value) {
                    max_value = value;
                }
            }
            rowpos += 1;
        }
        else { // negative
            for (int c = 0; c < ninputs; c++) {
                int value = p_tt[c * ttrows + r];
                OFF_set[c * negrows + rowneg] = value;
                if (value > max_value) {
                    max_value = value;
                }
            }
            rowneg += 1;
        }
    }

    int value_bit_width = 0;
    while (max_value > 0) {
        max_value >>= 1; // Shift right until no bits remain
        value_bit_width++;
    }

    // calculate the number of values (biggest number) for each input
    int nofvalues[ninputs];
    int nofpi[ninputs];

    for (int i = 0; i < ninputs; i++) {
        nofvalues[i] = 0; // initialize
        nofpi[i] = 0; // initialize

        for (int r = 0; r < ttrows; r++) {
            if (nofvalues[i] < p_tt[i * ttrows + r]) {
                nofvalues[i] = p_tt[i * ttrows + r];
            }
        }

        // add 1 because if the biggest number is 2 then it has three levels: 0, 1 and 2
        nofvalues[i] += 1;

        if (nofvalues[i] == 1) {
            // no input ever has less than two values
            nofvalues[i] = 2;
        }
    }

    // preallocating for an estimated large number of 10000 found PIs
    // this number will be iteratively increased when the found PIs reach the upper limit
    int estimPI = 250000;

    // the index of the PIs, in descending order of their number of covered ON-set minterms
    int *p_covered = R_Calloc(estimPI, int);

    // many PIs will have the same coverage, but they don't necessarily cover the same minterms
    // to employ row dominance when solving the PI chart, we need to compare the coverage of the
    // current PI with the coverage of the previous PIs. If this PI survives the comparison, its
    // coverage has to be added in the p_covered vector, and its order in the p_covered
    // vector, at the last index of the PI coverage with the same number of minterms
    int last_index[posrows]; // descending order

    // p_pichart = malloc(estimPI * posrows * sizeof(int));
    // memset(p_pichart, false, estimPI * posrows * sizeof(int));
    int *p_pichart = (int *) R_Calloc(estimPI * posrows, int);
    // prefixing (int *) prefills in all values with 0s

    int pichart_words = (posrows + BITS_PER_WORD - 1) / BITS_PER_WORD; // Words needed per PI chart columns
    unsigned int *p_pichart_pos = (unsigned int *) R_Calloc(estimPI * pichart_words, unsigned int);
    int implicant_words = (ninputs + BITS_PER_WORD - 1) / BITS_PER_WORD; // Words needed per PI representation
    unsigned int *p_implicants_pos = (unsigned int *) R_Calloc(estimPI * implicant_words, unsigned int);
    unsigned int *p_implicants_val = (unsigned int *) R_Calloc(estimPI * implicant_words, unsigned int);

    int prevfoundPI = 0;  // the number of previously found PIs
    int foundPI = 0;
    int prevsolmin = 0;     // the minimum number of PIs that solve the PI chart
    int solmin = 0;

    // the positions of the PIs solving the PI chart
    // a vector which can never be lengthier than the number of ON minterms (posrows)
    int previndices[posrows];
    int indices[posrows];


    for (int i = 0; i < posrows; i++) {
        previndices[i] = 0;
        indices[i] = 0;
        last_index[i] = 0;
    }

    Rboolean ON_set_covered = false;
    if (PRINT_INFO) {
        Rprintf("ON-set minterms: %d\n", posrows);
        #ifdef _OPENMP
            Rprintf("OpenMP enabled, %d workers\n", omp_get_max_threads());
        #endif
    }


    int stop_counter = 0; // to stop if two consecutive levels of complexity yield no PIs
    int k;
    for (k = 1; k <= ninputs; k++) {
        if (PRINT_INFO) {
            Rprintf("---k: %d\n", k);
        }

	int n_tasks = 512;
	struct ccubes_context *ctx = NULL;
        for (int task = 0; task < nchoosek(ninputs, k); task+=n_tasks) {
		bool *coverage;
		unsigned int *fixed_bits;
		unsigned int *value_bits;
		unsigned int *pichart_values;
		ctx = ccubes_do_tasks(n_tasks,
		    task,
		    k,
		    ninputs,
		    posrows,
		    negrows,
		    implicant_words,
		    value_bit_width,
		    pichart_words,
		    estimPI,
		    nofvalues,
		    ON_set,
		    OFF_set,
		    p_implicants_pos,
		    p_implicants_val,
		    last_index,
		    p_covered,
		    p_pichart_pos,
		    coverage,
		    fixed_bits,
		    value_bits,
		    pichart_values
			);
		if (ctx == NULL) {
			log_error("ccubes", "ccubes_do_tasks failed");
		}

		for (int i = 0; i < n_tasks; i++) {
			log_debug("ccubes", "Task %d", i);

			log_debug_raw("ccubes", "coverage[%d]:", i);
			for (int j = 0; j < posrows; j++) {
				log_debug_raw("ccubes", " %d",
				    ctx->h_coverage[i * posrows + j]);
			}
			log_debug_raw("ccubes", "\n");

			log_debug_raw("ccubes", "fixed_bits[%d]:", i);
			for (int j = 0; j < implicant_words; j++) {
				log_debug_raw("ccubes", " %d", 
				    ctx->h_fixed_bits[i * implicant_words + j]);
			}
			log_debug_raw("ccubes", "\n");

			log_debug_raw("ccubes", "value_bits[%d]:", i);
			for (int j = 0; j < implicant_words; j++) {
				log_debug_raw("ccubes", " %d", 
				    ctx->h_value_bits[i * implicant_words + j]);
			}
			log_debug_raw("ccubes", "\n");

			log_debug_raw("ccubes", "pichart_values[%d]:", i);
			for (int j = 0; j < pichart_words; j++) {
				log_debug_raw("ccubes", " %d", 
				    ctx->h_pichart_values[i * pichart_words + j]);
			}
			log_debug_raw("ccubes", "\n");
		}
	}

        #ifdef _OPENMP
            #pragma omp parallel for schedule(dynamic)
        #endif

        for (int task = 0; task < nchoosek(ninputs, k); task++) {
            int tempk[k];
            int x = 0;
            int start_point = task;

            // fill the combination for the current task
            for (int i = 0; i < k; i++) {
                while (nchoosek(ninputs - (x + 1), k - (i + 1)) <= start_point) {
                    start_point -= nchoosek(ninputs - (x + 1), k - (i + 1));
                    x++;
                }
                tempk[i] = x;
                x++;
            }

            // allocate vectors of decimal row numbers for the positive and negative rows
            int decpos[posrows];
            int decneg[negrows];

            // create the vector of multiple bases, useful when calculating the decimal representation
            // of a particular combination of columns, for each row
            int mbase[k];
            mbase[0] = 1; // the first number is _always_ equal to 1, irrespective of the number of values in a certain input

            // calculate the vector of multiple bases, for example if we have k = 3 (three inputs) with
            // 2, 3 and 2 values then mbase will be [1, 2, 6] from: 1, 1 * 2 = 2, 2 * 3 = 6
            for (int i = 1; i < k; i++) {
                mbase[i] = mbase[i - 1] * nofvalues[tempk[i - 1]];
            }

            // calculate decimal numbers, using mbase, fills in decpos and decneg
            for (int r = 0; r < posrows; r++) {
                decpos[r] = 0;
                for (int c = 0; c < k; c++) {
                    decpos[r] += ON_set[tempk[c] * posrows + r] * mbase[c];
                }
            }

            for (int r = 0; r < negrows; r++) {
                decneg[r] = 0;
                for (int c = 0; c < k; c++) {
                    decneg[r] += OFF_set[tempk[c] * negrows + r] * mbase[c];
                }
            }


            int possible_rows[posrows];

            Rboolean possible_cover[posrows];
            possible_cover[0] = true; // Rboolean flag, to be set with false if found among the OFF set

            int found = 0;

            // identifies all unique decimal rows, for the selected combination of k inputs
            for (int r = 0; r < posrows; r++) {
                int prev = 0;
                Rboolean unique = true; // Rboolean flag, assume the row is unique
                while (prev < found && unique) {
                    unique = decpos[possible_rows[prev]] != decpos[r];
                    prev++;
                }

                if (unique) {
                    possible_rows[found] = r;
                    possible_cover[found] = true;
                    found++;
                }
            }

            if (found > 0) {
                // some of the ON set numbers are possible PIs (not found in the OFF set)
                int frows[found];

                // verify if this is a possible PI
                // (if the same decimal number is not found in the OFF set)
                for (int i = found - 1; i >= 0; i--) {
                    int j = 0;
                    while (j < negrows && possible_cover[i]) {
                        if (decpos[possible_rows[i]] == decneg[j]) {
                            possible_cover[i] = false;
                            found--;
                        }
                        j++;
                    }

                    if (possible_cover[i]) {
                        frows[found - i - 1] = possible_rows[i];
                    }
                }
                // Rprintf("task: %d; rows: %d\n", task, found);

                for (int f = 0; f < found; f++) {


                    // create a temporary vector of length k, containing the values from the initial ON set
                    // plus 1 (because 0 now signals a minimization, it becomes 1, and 1 becomes 2 etc.
                    int tempc[k];

                    // using bit shifting, store the fixed bits and value bits
                    unsigned int fixed_bits[implicant_words];
                    unsigned int value_bits[implicant_words];

                    for (int i = 0; i < implicant_words; i++) {
                        fixed_bits[i] = 0U;
                        value_bits[i] = 0U;
                    }

                    for (int c = 0; c < k; c++) {
                        int value = ON_set[tempk[c] * posrows + frows[f]];
                        tempc[c] = value + 1;

                        int word_index = tempk[c] / BITS_PER_WORD;
                        int bit_index = tempk[c] % BITS_PER_WORD;

                        fixed_bits[word_index] |= 1U << bit_index;
                        value_bits[word_index] |= (unsigned int)value << (bit_index * value_bit_width);
                    }

                    // check if the current PI is not redundant
                    Rboolean redundant = false;

                    int i = 0;
                    while (i < prevfoundPI && !redundant) {
                        // /*
                        // - ck contains the complexity level for each of the previously found non-redundant PIs
                        // - indx is a matrix containing the indexes of the columns where the values were stored
                        // - a redundant PI is one for which all values from a previous PI are exactly the same:
                        // 0 0 1 2 0, let's say previously found PI
                        // which means a corresponding ck = 2 and a corresponding indx = [3, 4]
                        // 0 0 1 2 1 is redundant because on both columns 3 and 4 the values are equal
                        // therefore sumeq = 2 and it will be equal to v = 2 when reaching the complexity level ck = 2
                        // */

                        Rboolean is_subset = true; // Assume it's a subset unless proven otherwise

                        for (int w = 0; w < implicant_words; w++) {
                            // If the new PI has values on positions outside the existing PI’s fixed positions, it’s not a subset
                            if ((fixed_bits[w] & p_implicants_pos[i * implicant_words + w]) != p_implicants_pos[i * implicant_words + w]) {
                                is_subset = false;
                                break;
                            }

                            // then compare the value bits, if one or more values on those positions are different, it’s not a subset
                            if ((value_bits[w] & p_implicants_val[i * implicant_words + w]) != p_implicants_val[i * implicant_words + w]) {
                                is_subset = false;
                                break;
                            }
                        }

                        redundant = is_subset;

                        i++;
                    }

                    if (redundant) continue;

                    Rboolean coverage[posrows];
                    int covsum = 0;
                    unsigned int pichart_values[pichart_words];
                    for (int w = 0; w < pichart_words; w++) {
                        pichart_values[w] = 0U;
                    }

                    for (int r = 0; r < posrows; r++) {
                        coverage[r] = decpos[r] == decpos[frows[f]];
                        if (coverage[r]) {
                            int word_index = r / BITS_PER_WORD;
                            int bit_index = r % BITS_PER_WORD;
                            pichart_values[word_index] |= (1U << bit_index);
                        }
                        covsum += coverage[r];
                    }

                    // verify row dominance
                    int rd = 0;
                    while (rd < last_index[covsum - 1] && !redundant) {

                        bool dominated = true;
                        for (int w = 0; w < pichart_words; w++) {
                            if ((pichart_values[w] & p_pichart_pos[p_covered[rd] * pichart_words + w]) != pichart_values[w]) {
                                dominated = false;
                                break;
                            }
                        }

                        redundant = dominated;
                        rd++;
                    }

                    if (redundant) continue;


                    // Rprintf("It is a prime implicant\n");
                    // This operation first gets a new index to push in the global array in a concurrent way
                    // Then adds the result there.
                    // We could synchronize only the index and let the copy operation happen in parallel BUT this
                    // creates a false sharing problem and the performance is down by several factors.

                    #ifdef _OPENMP
                        #pragma omp critical
                    #endif
                    {

                        // push the PI information to the global arrays

                        for (int i = foundPI; i > last_index[covsum - 1]; i--) {
                            p_covered[i] = p_covered[i - 1];
                        }

                        p_covered[last_index[covsum - 1]] = foundPI;

                        for (int l = 1; l < covsum; l++) {
                            last_index[l - 1] += 1;
                        }

                        for (int w = 0; w < implicant_words; w++) {
                            p_implicants_pos[implicant_words * foundPI + w] = fixed_bits[w];
                            p_implicants_val[implicant_words * foundPI + w] = value_bits[w];
                        }

                        // populate the PI chart
                        for (int r = 0; r < posrows; r++) {
                            for (int w = 0; w < pichart_words; w++) {
                                p_pichart_pos[foundPI * pichart_words + w] = pichart_values[w];
                            }

                            p_pichart[posrows * foundPI + r] = coverage[r];
                        }

                        ++foundPI;

                        // when needed, increase allocated memory
                        if (foundPI / estimPI > 0.9) {
                            int old_size = estimPI;
                            estimPI += 100000;
                            p_pichart =        R_Realloc(p_pichart,        posrows * estimPI, int);
                            p_pichart_pos =    R_Realloc(p_pichart_pos,    estimPI,           unsigned int);
                            p_implicants_val = R_Realloc(p_implicants_val, ninputs * estimPI, unsigned int);
                            p_implicants_pos = R_Realloc(p_implicants_pos, ninputs * estimPI, unsigned int);
                            p_covered =        R_Realloc(p_covered,        estimPI,           int);

                            for (unsigned int i = old_size; i < posrows * estimPI; i++) {
                                p_pichart[i] = 0;
                            }
                            for (unsigned int i = old_size; i < estimPI; i++) {
                                p_pichart_pos[i] = 0U;
                            }
                            for (unsigned int i = old_size; i < ninputs * estimPI; i++) {
                                p_implicants_val[i] = 0U;
                                p_implicants_pos[i] = 0U;
                            }

                            if (PRINT_INFO) {
                                multiplier++;
                                Rprintf("%dx ", multiplier);
                            }
                        }
                    }
                }
            }
        }

        nofpi[k - 1] = foundPI;

        if (foundPI > 0 && !ON_set_covered) {
            Rboolean test_coverage = true;

            int r = 0;
            while (r < posrows && test_coverage) {

                Rboolean minterm_covered = false;
                int c = 0;

                while (c < foundPI && !minterm_covered) {
                    minterm_covered = p_pichart[c * posrows + r];
                    c++;
                }

                test_coverage = minterm_covered;
                r++;
            }

            ON_set_covered = test_coverage;
        }

        if (ON_set_covered) {
            // Rprintf("posrows: %d; foundPI: %d\n", posrows, foundPI);


            if (PRINT_INFO) {
                gettimeofday(&end, NULL); // End time
                elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1e6;
                Rprintf("Time taken finding %d PIs: %f sec.\n", foundPI, elapsed_time);
                gettimeofday(&start, NULL);
            }


            SEXP pic = PROTECT(allocMatrix(INTSXP, posrows, foundPI));
            for (long long unsigned int i = 0; i < posrows * foundPI; i++) {
                INTEGER(pic)[i] = p_pichart[i];
            }

            SEXP PIlayers = PROTECT(allocVector(INTSXP, ninputs));
            for (int i = 0; i < ninputs; i++) {
                INTEGER(PIlayers)[i] = nofpi[i];
            }
            setAttrib(pic, install("PIlayers"), PIlayers);

            // if this file is run directly using SHLIB, the following line is needed
            // R_ParseEvalString("library(IEEE)", R_GlobalEnv);

            SEXP pkg_env = PROTECT(R_FindNamespace(mkString("IEEE")));
            SEXP solvechart = PROTECT(Rf_findVarInFrame(pkg_env, Rf_install("solvechart")));
            SEXP evalinR = PROTECT(R_tryEval(Rf_lang2(solvechart, pic), pkg_env, NULL));

            solmin = length(evalinR);
            for (int i = 0; i < solmin; i++) {
                indices[i] = INTEGER(evalinR)[i] - 1; // R is 1-based
            }

            UNPROTECT(5);
            // Rprintf("solution minima: %d\n", solmin);


            if (PRINT_INFO) {
                gettimeofday(&end, NULL);
                elapsed_time = (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1e6;
                Rprintf("Time spent solving the PI chart: %f sec.\n", elapsed_time);
                gettimeofday(&start, NULL);
            }

            if (solmin == prevsolmin) {
                // the minimum number of PIs did not change in the current level of complexity
                // we can safely retain the less complex PIs from the previous level
                for (int i = 0; i < solmin; i++) {
                    indices[i] = previndices[i];
                }
                stop_counter += 1;
            }
            else {
                // this means solmin is in fact smaller than the previously found solmin
                // or it is the very first time a solmin was found
                // only here it makes sense to overwrite prevsolmin and previndices,
                // otherwise they are just as good as the ones from the previous level

                prevsolmin = solmin;
                for (int i = 0; i < solmin; i++) {
                    previndices[i] = indices[i];
                }

                stop_counter = 0;
            }
        }

        prevfoundPI = foundPI;

        // printf("stop_counter: %d\n", stop_counter);

        // One level of complexity up, and the solution minima does not change
        if (stop_counter > 0) {
            // the search can stop
            break;
        }
    }

    // printf("solmin: %d\n", solmin);
    if (PRINT_INFO) {
        Rprintf("--- END ---\n");
    }

    SEXP sol = PROTECT(allocMatrix(INTSXP, solmin, ninputs));
    int *p_sol = INTEGER(sol);

    for (int c = 0; c < solmin; c++) {
        for (int r = 0; r < ninputs; r++) {
            unsigned int value = 0;
            int word_index = r / BITS_PER_WORD;  // Word index within the implicant
            int bit_index = r % BITS_PER_WORD;  // Bit position within the word

            if (p_implicants_pos[indices[c] * implicant_words + word_index] & (1U << bit_index)) {
                value = 1U + ((p_implicants_val[indices[c] * implicant_words + word_index] >> (bit_index * value_bit_width)) & ((1U << value_bit_width) - 1U));
            }

            p_sol[r * solmin + c] = value; // transposed
        }
    }

    R_Free(p_pichart);
    R_Free(p_implicants_val);
    R_Free(p_implicants_pos);
    R_Free(p_pichart_pos);
    R_Free(p_covered);

    UNPROTECT(1);
    return (sol);
}


long long unsigned int nchoosek(
    int n,
    int k
) {
    if (k == 0 || k == n) return 1;
    if (k == 1) return n;

    long long unsigned int result = 1;

    if (k > n - k) {
        k = n - k;
    }

    for (int i = 0; i < k; i++) {
        result = result * (n - i) / (i + 1);
    }

    return result;
}