/*
 * Compute Velocity-Verlet algorithm on GPU.
 *
 * This file is CPU-side of Velocity-Verlet computation on GPU.
 * It follows a few steps:
 * 1. GPU initialization and kernel building.
 * 2. Setting the parameters up.
 * 3. Copying initialization values of parameters.
 * 4. Calling kernel, retrieving values and writing them.
 * 5. Cleaning memory.
 *
 * One could worry a bit about the architecture of this code, as it consists
 * of only one big function. Actually, it is the only way to keep the overhead
 * minimal. Besides, cleaner code would not be much less complicated, as it
 * would consist mainly of functions that are neither generic nor reusable.
 *
 * If you compile with the macro SPEEDRUN set, the output writes are disabled
 * and the elapsed time is printed on stderr instead.
 */
#include "gpu_verlet.h"

int compute_verlet_gpu(int nb_planet, int dimension, int nbstep, double stepsize,
		double planets_position_adim[nb_planet][dimension],
		double planets_velocity_adim[nb_planet][dimension],
		double planets_mass[nb_planet],
		char *basename, int write_every) {
	
	cl_host *host;
	cl_int ret;
	cl_program program;
	cl_kernel step1_vverlet, step2_vverlet, init_vverlet;
	cl_mem planets_position_input, planets_position_output;
	cl_mem planets_velocity_input, planets_velocity_output;
	cl_mem planets_accel_input, planets_accel_output;
	cl_mem planets_mass_input;

	cl_event write_evt[3];
	cl_event output_to_input_evt[2];
	cl_event step1_vverlet_evt;
	cl_event step2_vverlet_evt;
	cl_event init_vverlet_evt;
	cl_event read_evt;

	cl_double3 *planets_position;
	planets_position = redim_coords(dimension, nb_planet, planets_position_adim);
	cl_double3 *planets_velocity;
	planets_velocity = redim_coords(dimension, nb_planet, planets_velocity_adim);
#ifndef SPEEDRUN
	FILE *fpout;
	char fnout[256];
	int planet, k;
	int filenum = 0;
#endif // SPEEDRUN

	int size1d;
	size1d = nb_planet*sizeof(double);
	int sizend;
	sizend = nb_planet*sizeof(cl_double3);

	size_t offset = 0;
	size_t workgroup_size, globalwork_size;

	/* 
	 * 1. Initialize GPU and build kernels.
	 * 
	 * Set the GPU parameters up, with workgroup and global work size.
	 * The number of planets and the stepsize is supplied to GPU through the
	 * use of macro NB_PLANET and STEPSIZE. We must provide them through
	 * compilation options.
	 *
	 * We use 3 kernels:
	 * - init_vverlet must be called before the first iteration.
	 * - step1_vverlet and step2_vverlet must be called in this order at each
	 * iteration.
	 */
	host = initialize_gpu();
	workgroup_size = optimal_workgroup_size(host, nb_planet);
	globalwork_size = optimal_globalwork_size(host, nb_planet);
	char options[512];
	snprintf(options, 512, "-D NB_PLANET=%d -D STEPSIZE=%lf",
			nb_planet, stepsize);
	program = buildProgramFromFile(host, "src/gpu.cl", options);

	step1_vverlet = clCreateKernel(program,"step1_vverlet",&ret); 
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateKernel returned %d on step1_vverlet.\n",ret);
		abort();
	}
	step2_vverlet = clCreateKernel(program,"step2_vverlet",&ret); 
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateKernel returned %d on step2_vverlet.\n",ret);
		abort();
	}
	init_vverlet = clCreateKernel(program,"init_vverlet",&ret); 
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateKernel returned %d on init_vverlet.\n",ret);
		abort();
	}
	

	/*
	 * 2. Preparing device parameters.
	 *
	 * We create a cl_buffer for each parameter.
	 * A cl_buffer is an area of memory on the GPU. We link each cl_buffer with
	 * a kernel parameter.
	 */

	planets_position_input =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_position_input.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 0, sizeof(cl_mem), &planets_position_input));
	CL_CHECK(clSetKernelArg(step2_vverlet, 0, sizeof(cl_mem), &planets_position_input));
	CL_CHECK(clSetKernelArg(init_vverlet, 0, sizeof(cl_mem), &planets_position_input));
	
	planets_velocity_input =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_velocity_input.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 1, sizeof(cl_mem), &planets_velocity_input));
	CL_CHECK(clSetKernelArg(step2_vverlet, 1, sizeof(cl_mem), &planets_velocity_input));
	CL_CHECK(clSetKernelArg(init_vverlet, 1, sizeof(cl_mem), &planets_velocity_input));

	planets_accel_input =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_accel_input.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 2, sizeof(cl_mem), &planets_accel_input));
	CL_CHECK(clSetKernelArg(step2_vverlet, 2, sizeof(cl_mem), &planets_accel_input));
	CL_CHECK(clSetKernelArg(init_vverlet, 2, sizeof(cl_mem), &planets_accel_input));

	planets_position_output =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_position_output.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 3, sizeof(cl_mem), &planets_position_output));
	CL_CHECK(clSetKernelArg(step2_vverlet, 3, sizeof(cl_mem), &planets_position_output));
	CL_CHECK(clSetKernelArg(init_vverlet, 3, sizeof(cl_mem), &planets_position_output));

	planets_velocity_output =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_velocity_output.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 4, sizeof(cl_mem), &planets_velocity_output));
	CL_CHECK(clSetKernelArg(step2_vverlet, 4, sizeof(cl_mem), &planets_velocity_output));
	CL_CHECK(clSetKernelArg(init_vverlet, 4, sizeof(cl_mem), &planets_velocity_output));

	planets_accel_output =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				sizend, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_accel_output.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 5, sizeof(cl_mem), &planets_accel_output));
	CL_CHECK(clSetKernelArg(step2_vverlet, 5, sizeof(cl_mem), &planets_accel_output));
	CL_CHECK(clSetKernelArg(init_vverlet, 5, sizeof(cl_mem), &planets_accel_output));

	planets_mass_input =
		clCreateBuffer(host->context, CL_MEM_READ_WRITE,
				size1d, NULL, &ret);
	if(ret != CL_SUCCESS) {
		fprintf(stderr, "clCreateBuffer returned %d on planets_mass_input.\n", ret);
		abort();
	}
	CL_CHECK(clSetKernelArg(step1_vverlet, 6, sizeof(cl_mem), &planets_mass_input));
	CL_CHECK(clSetKernelArg(step2_vverlet, 6, sizeof(cl_mem), &planets_mass_input));
	CL_CHECK(clSetKernelArg(init_vverlet, 6, sizeof(cl_mem), &planets_mass_input));

	/*
	 * 3. Writing initialization values to the device.
	 * We copy the local values onto the GPU.
	 */

	CL_CHECK(clEnqueueWriteBuffer(host->command_queue, planets_position_output,
				CL_FALSE, 0, sizend, planets_position, 0, NULL, &(write_evt[0])));
	CL_CHECK(clEnqueueWriteBuffer(host->command_queue, planets_velocity_output,
				CL_FALSE, 0, sizend, planets_velocity, 0, NULL, &(write_evt[1])));
	CL_CHECK(clEnqueueWriteBuffer(host->command_queue, planets_mass_input,
				CL_FALSE, 0, size1d, planets_mass, 0, NULL, &(write_evt[2])));
	CL_CHECK(clFlush(host->command_queue));
	CL_CHECK(clWaitForEvents(3, write_evt));


	/*
	 * 4. Execute kernel, retrieve data and print.
	 *
	 * At first, we call the init_vverlet kernel.
	 * Then, we iterate over time. Each iteration is a step for the
	 * integration method.
	 */
#ifdef SPEEDRUN
	fprintf(stderr, "Testing speed of %d iterations over %d planets...\n", nbstep,
			nb_planet);
	struct timespec start_time, end_time;
	clock_gettime(CLOCK_REALTIME, &start_time);
#endif // SPEEDRUN

	CL_CHECK(clEnqueueNDRangeKernel(host->command_queue,
				init_vverlet, 1, &offset, &globalwork_size, &workgroup_size,
				0, NULL, &init_vverlet_evt));
	CL_CHECK(clFlush(host->command_queue));
	CL_CHECK(clWaitForEvents(1, &init_vverlet_evt));
	for(int i = 0; i < nbstep; i++) {
		/* We copy the output buffers of the GPU (position, velocity and
		 * acceleration on the next step) to the input buffers (position,
		 * velocity and acceleration on the current step).
		 */
		CL_CHECK(clEnqueueCopyBuffer(host->command_queue, planets_position_output,
                	planets_position_input, 0, 0, sizend, 0, NULL,
	                &output_to_input_evt[0]));
		CL_CHECK(clEnqueueCopyBuffer(host->command_queue, planets_velocity_output,
                	planets_velocity_input, 0, 0, sizend, 0, NULL,
	                &output_to_input_evt[1]));
		CL_CHECK(clEnqueueCopyBuffer(host->command_queue, planets_accel_output,
                	planets_accel_input, 0, 0, sizend, 0, NULL,
	                &output_to_input_evt[2]));

		/* We compute the next step */
		CL_CHECK(clEnqueueNDRangeKernel(host->command_queue,
					step1_vverlet, 1, &offset, &globalwork_size, &workgroup_size,
					3, output_to_input_evt, &step1_vverlet_evt));
		CL_CHECK(clEnqueueNDRangeKernel(host->command_queue,
					step2_vverlet, 1, &offset, &globalwork_size, &workgroup_size,
					1, &step1_vverlet_evt, &step2_vverlet_evt));
		CL_CHECK(clFlush(host->command_queue));
		CL_CHECK(clWaitForEvents(1,&step2_vverlet_evt));
		
		/* We retrieve the results and write it once every write_every.
		 * Retrieval is done, but not writing if SPEEDRUN macro is set. */
		if(i % write_every == 0) {
			CL_CHECK(clEnqueueReadBuffer(host->command_queue, planets_position_input,
						CL_TRUE, 0, sizend, planets_position,
						0, NULL, &read_evt));
			CL_CHECK(clFlush(host->command_queue));
			CL_CHECK(clWaitForEvents(1, &read_evt));

#ifndef SPEEDRUN
			sprintf(fnout, "%s/%06d.dat",basename,filenum++);
			fpout = fopen(fnout, "w");
			for(planet = 0; planet < nb_planet; planet++) {
				for(k=0; k < dimension; k++) {
					fprintf(fpout, "%lf\t", planets_position[planet].s[k]);
				}
				fprintf(fpout, "\n");
			}
			fclose(fpout);
#endif // SPEEDRUN
		}
	}

	/*
	 * 5. Cleaning memory.
	 */
#ifdef SPEEDRUN
	clock_gettime(CLOCK_REALTIME, &end_time);
	double timediff = difftime(end_time.tv_sec, start_time.tv_sec)
		+ 1E-9 * (double) (end_time.tv_nsec - start_time.tv_sec);
	fprintf(stderr, "Done in %lg sec.\n", timediff);
#endif // SPEEDRUN
	CL_CHECK(clReleaseKernel(step1_vverlet));
	CL_CHECK(clReleaseKernel(step2_vverlet));
	CL_CHECK(clReleaseKernel(init_vverlet));

	CL_CHECK(clReleaseMemObject(planets_position_input));
	CL_CHECK(clReleaseMemObject(planets_position_output));
	CL_CHECK(clReleaseMemObject(planets_velocity_input));
	CL_CHECK(clReleaseMemObject(planets_velocity_output));
	CL_CHECK(clReleaseMemObject(planets_mass_input));

	clReleaseHost(host);

	return 0;
}
