#define _POSIX_C_SOURCE 199309L
#include "fast_bot.h"
#include "fast_transposition.h"
#include "fast_move_order.h"
#include "fast_zobrist.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>

// Forward declaration for format_move (defined in main.c)
void format_move(FastMove move, char* buffer, size_t buffer_size);

// Constants
#define DEFAULT_TT_SIZE_MB 2048


static _Thread_local FastBoardEvaluation tls_evaluation;
static _Thread_local FastMoveGenerator  tls_movegen;
static _Thread_local bool               tls_inited = false;
static _Thread_local int                lme_counter = 0;

// Read CLOCK_MONOTONIC and express the reading in whole milliseconds.
// Intended for measuring elapsed time by subtracting two readings.
static inline int64_t now_ms(void) {
    struct timespec mono;
    clock_gettime(CLOCK_MONOTONIC, &mono);

    const int64_t ms_from_seconds = (int64_t)mono.tv_sec * 1000;
    const int64_t ms_from_nanos = mono.tv_nsec / 1000000;
    return ms_from_seconds + ms_from_nanos;
}


// Transposition table functions live in fast_transposition.c

// Bot implementation
// Fill in a bot's configuration, reset its evaluation counter, then set up
// the transposition table (ttsize_mb is forwarded to tt_init) and call
// zobrist_init().
void fast_bot_init(FastFlangBot* bot, int min_depth, int max_depth, int threads, int ttsize_mb, bool use_lme, int lme_max_extension) {
    // Iterative-deepening range
    bot->max_depth = max_depth;
    bot->min_depth = min_depth;

    // Parallelism and late-move-extension settings
    bot->threads = threads;
    bot->lme_max_extension = lme_max_extension;
    bot->use_lme = use_lme;

    // The C version has no opening database
    bot->use_opening_database = false;

    // Statistics start from zero
    bot->total_evaluations = 0;

    tt_init(&bot->tt, ttsize_mb);
    zobrist_init();
}

// Tear down the bot's transposition table via tt_destroy(). The bot struct
// itself is not freed here.
void fast_bot_destroy(FastFlangBot* bot) {
    tt_destroy(&(bot->tt));
}

// Make the calling thread's evaluator and move generator usable for `board`.
// The thread-local objects are initialised the first time a thread gets here;
// on every call their board pointers are redirected to `board`.
static inline void ensure_tls_search_state(FastBoard* board) {
    const bool first_use_on_this_thread = !tls_inited;
    if (first_use_on_this_thread) {
        fast_move_generator_init(&tls_movegen, board, false, false, 1);
        fast_evaluation_init(&tls_evaluation, board);
        tls_inited = true;
    }

    // Retarget all three board references at the node being searched.
    tls_evaluation.move_generator.board = board;
    tls_evaluation.board = board;
    tls_movegen.board = board;
}

// Decide whether `move` earns an LME search extension.
// An extension is granted only when all of the following hold:
//   - LME is enabled on the bot and depth > 1,
//   - this thread's current extension count is below bot->lme_max_extension,
//   - the moving piece is a king that advances toward its goal row
//     (row 7 for white, row 0 for black) and lands within 3 rows of it.
static bool should_extend_move(FastFlangBot* bot, FastMove move, int depth) {
    if (!bot->use_lme) return false;
    if (depth <= 1) return false;
    if (lme_counter >= bot->lme_max_extension) return false;

    const FastPieceState mover = move.from_piece;
    if (mover.type != FAST_KING) return false;

    const int start_row = get_y(move.from);
    const int end_row = get_y(move.to);
    const int goal_row = (mover.color == FAST_WHITE) ? 7 : 0;

    // Positive when the king moved in its own forward direction.
    const int advance = (end_row - start_row) * get_evaluation_number(mover.color);
    if (advance <= 0) return false;

    return abs(end_row - goal_row) <= 3;
}

// Order the root move list in `src` into `out` (at most `out_cap` entries)
// and return the value reported by mo_order_moves(). The transposition table
// is probed once for the root position so that any stored best move can be
// handed to the move orderer; the probed value itself is not used.
static int order_root_moves(FastFlangBot* bot, FastBoard* board, MoveBuffer* src, FastMove* out, int out_cap) {
    const uint64_t root_hash = zobrist_hash(board);

    FastMove hash_move = NULL_MOVE;
    double ignored_value;
    (void)tt_probe(&bot->tt, root_hash, bot->max_depth, -MATE_EVAL, MATE_EVAL, &ignored_value, &hash_move);

    // Reset this thread's move-ordering heuristics before ordering.
    mo_clear();

    const int ordered_count = mo_order_moves(src, hash_move, /*ply=*/0, out, out_cap);
    return ordered_count;
}

// Thread entry point for the parallel root search.
//
// `arg` is a RootSearchCtx*. Each worker repeatedly claims the next unsearched
// root move through the atomic ctx->next_idx counter, plays it on a private
// copy of the root board, evaluates the resulting position with a full window
// and records the result. The loop ends when the moves run out or ctx->stop
// is set. Always returns NULL.
static void* root_worker(void* arg) {
    RootSearchCtx* ctx = (RootSearchCtx*)arg;

    // Move-ordering heuristics are per-thread; start from a clean slate.
    mo_clear();

    // fast_bot_evaluate_move() takes its depth as uint8_t, so the value must
    // be clamped before the call: a root depth <= 0 would otherwise wrap to
    // 255 and a very large one would be reduced modulo 256.
    int child_depth = ctx->root_depth > 0 ? ctx->root_depth - 1 : 0;
    if (child_depth > UINT8_MAX) child_depth = UINT8_MAX;

    for (;;) {
        if (atomic_load_explicit(&ctx->stop, memory_order_relaxed))
            break;

        // Claim the next root move; every index is handed out exactly once.
        int i = atomic_fetch_add_explicit(&ctx->next_idx, 1, memory_order_relaxed);
        if (i >= ctx->root_count) break;

        FastMove move = ctx->root_moves[i];

        FastBoard temp;
        fast_board_copy(ctx->root_board, &temp);
        fast_board_execute_move(&temp, move);

        // Full window at the root.
        double eval = fast_bot_evaluate_move(ctx->bot, &temp,
                                             (uint8_t)child_depth,
                                             -MATE_EVAL, MATE_EVAL, 1);

        // Slot i belongs to this worker alone, so no lock is needed here.
        if (ctx->all_evaluations) {
            ctx->all_evaluations[i].move = move;
            ctx->all_evaluations[i].evaluation = eval;
            ctx->all_evaluations[i].depth = ctx->root_depth;
        }

        // The running best is shared between workers and guarded by best_lock.
        // White prefers larger evaluations, black smaller ones.
        pthread_mutex_lock(&ctx->best_lock);
        bool better = (ctx->side_to_move == FAST_WHITE) ? (eval > ctx->best_eval)
                                                        : (eval < ctx->best_eval);
        if (better) {
            ctx->best_eval = eval;
            ctx->best_move = move;
        }
        pthread_mutex_unlock(&ctx->best_lock);
    }

    return NULL;
}

// Convenience wrapper: run the iterative search with LONG_MAX as the time
// budget, i.e. without a practical time limit.
BotResult fast_bot_find_best_move(FastFlangBot* bot, FastBoard* board, bool print_time) {
    const long no_time_limit = LONG_MAX;
    return fast_bot_find_best_move_iterative(bot, board, print_time, no_time_limit);
}

// Iterative-deepening root search.
//
// Searches depths bot->min_depth .. bot->max_depth, splitting the root moves
// of each depth across worker threads (see root_worker). The result of the
// last completed depth is returned.
//
// Parameters:
//   bot         - configured bot; its TT is cleared and its evaluation counter
//                 reset at the start of every call.
//   board       - root position; not modified (workers search copies).
//   print_time  - when true, per-depth and summary statistics go to stdout.
//   max_time_ms - time budget. It only gates whether another depth is started
//                 (a new depth is skipped once a third of the budget has been
//                 used); a depth that is already running is never interrupted.
//
// Returns a BotResult whose all_evaluations array is heap-allocated: the
// caller owns it and must free() it. If that allocation failed the pointer is
// NULL and evaluation_count is 0.
BotResult fast_bot_find_best_move_iterative(FastFlangBot* bot, FastBoard* board, bool print_time, long max_time_ms) {
    enum { MAX_ROOT_THREADS = 64, DEFAULT_ROOT_THREADS = 4 };

    int64_t start_ms = now_ms();

    // Init engine components
    fast_move_generator_init(&bot->move_generator, board, false, false, 1);
    fast_evaluation_init(&bot->evaluator, board);

    bot->total_evaluations = 0;
    tt_clear(&bot->tt);

    BotResult best_result = (BotResult){0};
    best_result.best_move.evaluation = -MATE_EVAL;

    // Per-root-move evaluations of the last completed depth.
    // Ownership passes to the caller through best_result.all_evaluations.
    // A NULL result is tolerated: workers skip recording in that case.
    MoveEvaluation* move_evaluations = malloc(MAX_MOVES * sizeof(MoveEvaluation));
    int final_move_count = 0;

    // Root move list, regenerated for every depth
    MoveBuffer root_buf;
    move_buffer_init(&root_buf);

    // Worker count: configured value, or a default when unset, capped by the
    // size of the handle array.
    int thread_count = bot->threads > 0 ? bot->threads : DEFAULT_ROOT_THREADS;
    if (thread_count > MAX_ROOT_THREADS) thread_count = MAX_ROOT_THREADS;
    pthread_t threads[MAX_ROOT_THREADS];

    for (int depth = bot->min_depth; depth <= bot->max_depth; depth++) {
        int64_t iter_start = now_ms();
        int64_t elapsed = iter_start - start_ms;
        if (depth > bot->min_depth && elapsed * 3 >= max_time_ms) break;

        // Rebuild and order root moves for this position
        move_buffer_clear(&root_buf);
        bot->move_generator.board = board;
        fast_move_generator_load_moves(&bot->move_generator, board->at_move, &root_buf);

        if (root_buf.count == 0) {
            if (print_time) printf("No moves available\n");
            break;
        }

        FastMove ordered[MAX_MOVES];
        int root_count = order_root_moves(bot, board, &root_buf, ordered, MAX_MOVES);

        // Shared state for this depth's workers. best_eval starts at the worst
        // possible value for the side to move.
        RootSearchCtx ctx = {
                .bot = bot,
                .root_board = board,
                .root_moves = ordered,
                .root_count = root_count,
                .root_depth = depth,
                .side_to_move = board->at_move,
                .next_idx = ATOMIC_VAR_INIT(0),
                .stop = ATOMIC_VAR_INIT(false),
                .best_move = NULL_MOVE,
                .best_eval = -MATE_EVAL * get_evaluation_number(board->at_move),
                .all_evaluations = move_evaluations
        };
        pthread_mutex_init(&ctx.best_lock, NULL);

        // Launch workers; there is no point in starting more threads than
        // there are root moves. Only handles of threads that were actually
        // created are kept, so that the join below never touches an
        // indeterminate pthread_t.
        int wanted = thread_count < root_count ? thread_count : root_count;
        int started = 0;
        for (int t = 0; t < wanted; ++t) {
            if (pthread_create(&threads[started], NULL, root_worker, &ctx) == 0) {
                started++;
            }
        }

        // If no thread could be created, search on the calling thread so the
        // depth still completes.
        if (started == 0) {
            (void)root_worker(&ctx);
        }

        // Joining is the synchronisation point: once every worker has
        // returned, all root moves have been searched.
        for (int t = 0; t < started; ++t) {
            pthread_join(threads[t], NULL);
        }
        pthread_mutex_destroy(&ctx.best_lock);

        // Depth result
        best_result.best_move.move = ctx.best_move;
        best_result.best_move.evaluation = ctx.best_eval;
        best_result.best_move.depth = depth;
        best_result.total_evaluations = bot->total_evaluations;

        // The evaluation array now describes this depth
        final_move_count = root_count;

        // Store root in TT
        uint64_t h = zobrist_hash(board);
        tt_store(&bot->tt, h, ctx.best_eval, depth, ctx.best_move, TT_NODE_EXACT);

        if (print_time) {
            long iter_ms = now_ms() - iter_start;
            printf("Iteration depth=%d done in %ldms\n", depth, iter_ms);
        }
    }

    if (print_time) {
        long time_ms = now_ms() - start_ms;
        printf("Evals: %ldK, Depth: %d, Time: %ldms, EPms: %.0f\n",
               bot->total_evaluations / 1000,
               best_result.best_move.depth,
               time_ms,
               bot->total_evaluations / (time_ms > 0 ? (double)time_ms : 1.0));
        printf("TT: %lu/%lu hits (%.1f%%)\n",
               bot->tt.hits, bot->tt.probes,
               bot->tt.probes > 0 ? (100.0 * bot->tt.hits / bot->tt.probes) : 0.0);
    }

    // Hand the collected evaluations to the caller; never report entries that
    // do not exist.
    best_result.all_evaluations = move_evaluations;
    best_result.evaluation_count = move_evaluations ? final_move_count : 0;

    return best_result;
}

// Alpha-beta search of `board` to `depth` plies, scored from white's point of
// view (white maximises, black minimises).
//
// Parameters:
//   bot         - supplies the transposition table, LME settings and the
//                 evaluation counter.
//   board       - position to search; not modified (moves are played on copies).
//   depth       - remaining depth; 0 means "evaluate statically".
//   alpha, beta - current search window.
//   ply         - distance from the root, used for move-ordering heuristics.
//
// Returns the best evaluation found. The result is also stored in the TT
// together with a flag saying whether it is exact or only a bound.
double fast_bot_evaluate_move(FastFlangBot* bot, FastBoard* board, uint8_t depth, double alpha, double beta, int ply) {
    if (depth == 0) {
        // NOTE(review): this counter is shared by all worker threads; unless
        // the field is declared atomic in the header the increment is racy
        // and the reported total is approximate.
        bot->total_evaluations++;
        ensure_tls_search_state(board);
        return fast_evaluation_evaluate(&tls_evaluation);
    }

    // Probe TT
    uint64_t hash = zobrist_hash(board);
    double tt_value;
    FastMove tt_best_move = NULL_MOVE;
    if (tt_probe(&bot->tt, hash, depth, alpha, beta, &tt_value, &tt_best_move)) {
        return tt_value;
    }

    // Generate moves
    MoveBuffer move_buffer;
    move_buffer_init(&move_buffer);
    ensure_tls_search_state(board);
    fast_move_generator_load_moves(&tls_movegen, board->at_move, &move_buffer);

    if (move_buffer.count == 0) {
        // No moves available: fall back to the static evaluation.
        return fast_bot_evaluate_move(bot, board, 0, alpha, beta, ply);
    }

    // Order moves
    FastMove ordered[MAX_MOVES];
    int mcount = mo_order_moves(&move_buffer, tt_best_move, ply, ordered, MAX_MOVES);

    const bool maximizing = (board->at_move == FAST_WHITE);

    // Start from the worst possible score for the side to move.
    double best_eval = -MATE_EVAL * get_evaluation_number(board->at_move);
    FastMove best_move = NULL_MOVE;

    // TT bound flag. Until some move actually improves on the incoming
    // window, the children's scores are only bounds themselves, so the best
    // of them is merely an upper bound for the maximiser (nothing reached
    // alpha) or a lower bound for the minimiser (nothing got below beta).
    // The flag becomes EXACT only when the window really moves.
    uint8_t node_type = maximizing ? TT_NODE_UPPER_BOUND : TT_NODE_LOWER_BOUND;

    for (int i = 0; i < mcount; i++) {
        FastMove move = ordered[i];

        FastBoard temp;
        fast_board_copy(board, &temp);
        fast_board_execute_move(&temp, move);

        double eval;
        if (should_extend_move(bot, move, depth)) {
            // Extension: search the child at `depth` instead of `depth - 1`,
            // i.e. one ply deeper than normal. lme_counter tracks how many
            // extensions are active on the current path.
            lme_counter++;
            eval = fast_bot_evaluate_move(bot, &temp, depth, alpha, beta, ply + 1);
            lme_counter--;
        } else {
            // Normal search at depth - 1
            eval = fast_bot_evaluate_move(bot, &temp, depth - 1, alpha, beta, ply + 1);
        }

        // Mate distance adjustment: shrink large scores by one step per ply
        // so that shorter wins / longer losses are preferred.
        if (eval > 1000)       eval -= MATE_STEP_LOSS;
        else if (eval < -1000) eval += MATE_STEP_LOSS;

        if (maximizing) {
            if (eval > best_eval) {
                best_eval = eval;
                best_move = move;
            }
            if (best_eval > alpha) {
                alpha = best_eval;
                node_type = TT_NODE_EXACT;
            }

            if (alpha >= beta) {
                // Beta cutoff: the true value is at least best_eval.
                node_type = TT_NODE_LOWER_BOUND;
                mo_update_history(move, depth); // depth-based bonus
                mo_update_killer(move, ply);
                break;
            }
        } else {
            if (eval < best_eval) {
                best_eval = eval;
                best_move = move;
            }
            if (best_eval < beta) {
                beta = best_eval;
                node_type = TT_NODE_EXACT;
            }

            if (beta <= alpha) {
                // Alpha cutoff: the true value is at most best_eval.
                node_type = TT_NODE_UPPER_BOUND;
                mo_update_history(move, depth);
                mo_update_killer(move, ply);
                break;
            }
        }
    }

    // Store in TT; mate scores are stored with the dedicated MATE_TT_DEPTH.
    uint8_t store_depth = is_mate_score(best_eval) ? MATE_TT_DEPTH : depth;
    tt_store(&bot->tt, hash, best_eval, store_depth, best_move, node_type);

    return best_eval;
}



// Helper functions
// True when the magnitude of `value` is strictly greater than 5000, the
// threshold this file uses to recognise mate scores.
bool is_mate_score(double value) {
    const double mate_threshold = 5000;
    return value > mate_threshold || value < -mate_threshold;
}

// Accessor for the board's at_move field (the side whose turn it is).
FastColor get_color_at_move(const FastBoard* board) {
    const FastColor side_to_move = board->at_move;
    return side_to_move;
}