#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>  // For pairwise distance calculations

#define BLOCK_SIZE 256          // Threads per block
#define MAX_CLUSTERS 9          // Total clusters
#define MAX_ITERATIONS 1024     // Total iterations
#define MAX_NODES 1024          // Nodes for TSP example
#define INF 1e9                 // Infinity for TSP comparison

// Error checking macro
#define CUDA_CHECK(call)                                 \
    do {                                                 \
        cudaError_t err = call;                          \
        if (err != cudaSuccess) {                        \
            fprintf(stderr, "CUDA Error: %s\n",          \
                    cudaGetErrorString(err));            \
            exit(err);                                   \
        }                                                \
    } while (0)

// Device function for distance calculation
__device__ float calculate_distance(float x1, float y1, float x2, float y2) {
    return sqrtf((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1));
}

// Kernel for logic loop iteration
__global__ void PMLL_LogicLoop_GPU(int *cluster_states, float *node_positions, int num_nodes, int max_iterations, int *iteration_count) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int cluster_id = blockIdx.x;  // Each block handles one cluster

    if (tid < num_nodes) {
        // Loop over the maximum iterations
        for (int iter = 0; iter < max_iterations; ++iter) {
            // Calculate pairwise distances for TSP-like logic
            __shared__ float local_min_distance;  // Shared memory for results
            __shared__ int best_node;
            if (threadIdx.x == 0) {
                local_min_distance = INF;
                best_node = -1;
            }
            __syncthreads();

            for (int i = 0; i < num_nodes; ++i) {
                float distance = calculate_distance(
                    node_positions[2 * tid],     // x-coordinate of current node
                    node_positions[2 * tid + 1], // y-coordinate of current node
                    node_positions[2 * i],       // x-coordinate of node i
                    node_positions[2 * i + 1]    // y-coordinate of node i
                );

                if (distance < local_min_distance && tid != i) {
                    local_min_distance = distance;
                    best_node = i;
                }
            }

            // Update global cluster state atomically
            if (threadIdx.x == 0) {
                atomicAdd(&cluster_states[cluster_id], best_node);
            }
            __syncthreads();  // Synchronize before next iteration

            // Update iteration count
            if (tid == 0) {
                atomicAdd(iteration_count, 1);
            }
        }
    }
}

int main() {
    // Host variables
    int h_cluster_states[MAX_CLUSTERS] = {0};  // State of each cluster
    float h_node_positions[2 * MAX_NODES];    // Node positions (x, y for each node)
    int h_iteration_count = 0;                // Total iterations processed

    // Initialize node positions (example)
    for (int i = 0; i < MAX_NODES; ++i) {
        h_node_positions[2 * i] = rand() % 100;     // x-coordinate
        h_node_positions[2 * i + 1] = rand() % 100; // y-coordinate
    }

    // Device variables
    int *d_cluster_states, *d_iteration_count;
    float *d_node_positions;

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void **)&d_cluster_states, MAX_CLUSTERS * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void **)&d_node_positions, 2 * MAX_NODES * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_iteration_count, sizeof(int)));

    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_cluster_states, h_cluster_states, MAX_CLUSTERS * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_node_positions, h_node_positions, 2 * MAX_NODES * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_iteration_count, 0, sizeof(int)));  // Initialize iteration count

    // Define grid and block dimensions
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize(MAX_CLUSTERS);  // Each cluster is handled by a block

    // Launch kernel
    PMLL_LogicLoop_GPU<<<gridSize, blockSize>>>(d_cluster_states, d_node_positions, MAX_NODES, MAX_ITERATIONS, d_iteration_count);

    // Synchronize device
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy results back to host
    CUDA_CHECK(cudaMemcpy(h_cluster_states, d_cluster_states, MAX_CLUSTERS * sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(&h_iteration_count, d_iteration_count, sizeof(int), cudaMemcpyDeviceToHost));

    // Display results
    printf("Total iterations processed: %d\n", h_iteration_count);
    for (int i = 0; i < MAX_CLUSTERS; ++i) {
        printf("Cluster %d state: %d\n", i, h_cluster_states[i]);
    }

    // Free device memory
    CUDA_CHECK(cudaFree(d_cluster_states));
    CUDA_CHECK(cudaFree(d_node_positions));
    CUDA_CHECK(cudaFree(d_iteration_count));

    return 0;
}
