// from std
#include <array>
#include <algorithm>
#include <iostream>
#include <chrono>
#include <complex>
#include <tuple>
#include <vector>
#include <stdexcept>

// from options
#include <options.hpp>

// from utils
#include "../utils/pxgemm_utils.hpp"

using namespace cosma;

int main(int argc, char **argv) {
    // **************************************
    //   setup MPI and command-line parser
    // **************************************
    options::initialize(argc, argv);

    MPI_Init(&argc, &argv);

    int rank, P;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &P);

    // **************************************
    //   readout the command line arguments
    // **************************************
    // matrix dimensions
    // dim(A) = mxk, dim(B) = kxn, dim(C) = mxn
    auto m = options::next_int("-m", "--m_dim", "number of rows of A and C.", 1000);
    auto n = options::next_int("-n", "--n_dim", "number of columns of B and C.", 1000);
    auto k = options::next_int("-k", "--k_dim", "number of columns of A and rows of B.", 1000);

    // block sizes
    auto block_a = options::next_int_pair("-ba", "--block_a", "block size for the number of rows of A.", 128);
    auto block_b = options::next_int_pair("-bb", "--block_b", "block size for the number of rows of B.", 128);
    auto block_c = options::next_int_pair("-bc", "--block_c", "block size for the number of rows of C.", 128);

    // transpose flags
    bool trans_a = options::flag_exists("-ta", "--trans_a");
    bool trans_b = options::flag_exists("-tb", "--trans_b");
    char ta = trans_a ? 'T' : 'N';
    char tb = trans_b ? 'T' : 'N';

    // processor grid decomposition
    auto p = options::next_int("-p", "--p_row", "number of rows in a processor grid.", 1);
    auto q = options::next_int("-q", "--q_row", "number of columns in a processor grid.", P);

    if (p * q != P) {
        std::runtime_error("Number of processors in a grid has to match the number of available ranks.");
    }

    double alpha = 1.0;
    double beta = 0.0;

    pxgemm_params<double> params(m, n, k, 
                                 block_a.first, block_a.second,
                                 block_b.first, block_b.second,
                                 block_c.first, block_c.second,
                                 p, q,
                                 ta, tb,
                                 alpha, beta);

    // **************************************
    //    output the problem description
    // **************************************
    if (rank == 0) {
        std::cout << "Running PDGEMM on the following problem:" << std::endl;
        std::cout << params << std::endl;
    }

    // *******************************
    //   multiply and validate
    // *******************************
    bool ok = test_pdgemm(params, MPI_COMM_WORLD);

    int result = ok ? 0 : 1;
    int global_result = 0;

    MPI_Reduce(&result, &global_result, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        std::string yes_no = global_result == 0 ? "" : " NOT";
        std::cout << "Result is" << yes_no << " CORRECT!" << std::endl;
    }

    MPI_Finalize();

    return 0;
}
