#include "gpusolver.h" #include "time.h" #include "gmp.h" #define LOCAL_SIZE (64) #define GLOBAL_SIZE (2048) #define CHECKASGN (true) #define DEBUG gpusolver* initSolver() { gpusolver* o = calloc(1, sizeof(gpusolver)); if (o == NULL) return NULL; o->platformid = NULL; o->numplatforms = 0; o->deviceid = NULL; o->numdevices = 0; FILE* fp = fopen("../psat.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel\n"); // TODO: Cleanup return NULL; } o->source_str = malloc(0x100000); o->source_size = fread(o->source_str, 1, 0x100000, fp); o->source_str = realloc(o->source_str, o->source_size + 1); if (o->source_str == NULL) { printf("Failed to reallocate source\n"); return NULL; } fclose(fp); cl_int res = clGetPlatformIDs(1, &(o->platformid), &(o->numplatforms)); if (res != CL_SUCCESS) { printf("Failed to retrieve OpenCL platform IDs\n"); // TODO: Cleanup return NULL; } res = clGetDeviceIDs(o->platformid, CL_DEVICE_TYPE_GPU, 1, &(o->deviceid), &(o->numdevices)); if (res != CL_SUCCESS) { printf("Failed to retrieve OpenCL device IDs\n"); // TODO: Cleanup return NULL; } o->ctx = clCreateContext(NULL, 1, &(o->deviceid), NULL, NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL context\n"); // TODO: Cleanup return NULL; } cl_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; o->commqueue = clCreateCommandQueueWithProperties(o->ctx, o->deviceid, properties, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL command queue\n"); // TODO: Cleanup return NULL; } res = clGetDeviceInfo(o->deviceid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &(o->gpuMemoryMax), NULL); if (res != CL_SUCCESS) { printf("Failed to query total GPU memory\n"); // TODO: CLeanup return NULL; } res = clGetDeviceInfo(o->deviceid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &(o->gpuLocalMax), NULL); if (res != CL_SUCCESS) { printf("Failed to query total GPU memory\n"); // TODO: CLeanup return NULL; } res = clGetDeviceInfo(o->deviceid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &(o->gpuAllocMax), NULL); if (res != CL_SUCCESS) { printf("Failed to query total GPU memory\n"); // TODO: CLeanup return NULL; } res = clGetDeviceInfo(o->deviceid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &(o->gpuCUs), NULL); if (res != CL_SUCCESS) { printf("Failed to query total GPU memory\n"); // TODO: Cleanup return NULL; } o->program = clCreateProgramWithSource(o->ctx, 1, (const char**) &(o->source_str), (const size_t*) &(o->source_size), &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL program\n"); // TODO: Cleanup exit(1); } res = clBuildProgram(o->program, 1, &(o->deviceid), NULL, NULL, NULL); if (res != CL_SUCCESS) { printf("Build failed\n"); // TODO: Cleanup exit(1); } size_t loglen = 0; res = clGetProgramBuildInfo(o->program, o->deviceid, CL_PROGRAM_BUILD_LOG, NULL, NULL, &loglen); if (res != CL_SUCCESS) { printf("Failed to retrieve build logs\n"); exit(1); } char* logbuf = malloc(sizeof(char) * loglen); res = clGetProgramBuildInfo(o->program, o->deviceid, CL_PROGRAM_BUILD_LOG, sizeof(char) * loglen, logbuf, &loglen); if (res != CL_SUCCESS) { printf("Failed to retrieve build logs\n"); exit(1); } printf("%*.s\n", (int) loglen, logbuf); free(logbuf); o->kernel = clCreateKernel(o->program, "vectorSAT", &res); if (res != CL_SUCCESS) { printf("Failed to create kernel\n"); printf("%d\n", res); // TODO: Cleanup exit(1); } printf("Initialized solver:\n"); printf("\tCompute Units: %lu\n", o->gpuCUs); printf("\tMax Global Memory: %lu\n", o->gpuMemoryMax); printf("\tMax Local Memory: %lu\n", o->gpuLocalMax); printf("\tMax Alloc Memory: %lu\n", o->gpuAllocMax); return o; } i32 gpusolve2(gpusolver* gs, cnf* c) { u32 wcnt = 1 + (c->cnts[0] >> 5U); u32* solution = calloc((wcnt + 1), sizeof(u32)); if (solution == NULL) { printf("Failed to allocate solution buffer\n"); exit(1); } gs->gpuCUs = 1024; mpz_t gmpmax; mpz_init(gmpmax); mpz_ui_pow_ui(gmpmax, 2, c->cnts[0]); mpz_div_ui(gmpmax, gmpmax, gs->gpuCUs); mpz_export(solution + 1, NULL, -1, sizeof(u32), 0, 0, gmpmax); // mpz_out_str(stdout, 10, gmpmax); // printf("\n\n"); mpz_clear(gmpmax); solution[0] = 0; cl_int res = 2; cl_mem gpuheader = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, 2 * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF header buffer\n"); exit(1); } cl_mem gpulvars = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, 3 * c->cnts[1] * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF lvar buffer\n"); exit(1); } cl_mem gpuvariables = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF variable buffer\n"); exit(1); } cl_mem gpuparities = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uchar), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF parity buffer\n"); exit(1); } cl_mem gpuoutput = clCreateBuffer(gs->ctx, CL_MEM_READ_WRITE, (wcnt + 1) * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create output buffer\n"); exit(1); } cl_mem gpuscratchpad = clCreateBuffer(gs->ctx, CL_MEM_READ_WRITE, 2 * wcnt * gs->gpuCUs * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create output buffer\n"); exit(1); } // Load buffers to GPU res = clEnqueueWriteBuffer(gs->commqueue, gpuheader, CL_TRUE, 0, 2 * sizeof(cl_uint), c->cnts, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF header write\n"); exit(1); } res = clEnqueueWriteBuffer(gs->commqueue, gpulvars, CL_TRUE, 0, 3 * c->cnts[1] * sizeof(cl_uint), c->clausedat, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF lvar write\n"); exit(1); } res = clEnqueueWriteBuffer(gs->commqueue, gpuvariables, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uint), c->variables, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF variable write\n"); exit(1); } res = clEnqueueWriteBuffer(gs->commqueue, gpuparities, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uchar), c->parities, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF parity write\n"); exit(1); } res = clEnqueueWriteBuffer(gs->commqueue, gpuoutput, CL_TRUE, 0, (wcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF parity write\n"); exit(1); } res = clSetKernelArg(gs->kernel, 0, sizeof(cl_mem), (void*) &gpuheader); res = clSetKernelArg(gs->kernel, 1, sizeof(cl_mem), (void*) &gpulvars); res = clSetKernelArg(gs->kernel, 2, sizeof(cl_mem), (void*) &gpuvariables); res = clSetKernelArg(gs->kernel, 3, sizeof(cl_mem), (void*) &gpuparities); res = clSetKernelArg(gs->kernel, 4, sizeof(cl_mem), (void*) &gpuoutput); res = clSetKernelArg(gs->kernel, 5, sizeof(cl_mem), (void*) &gpuscratchpad); size_t deploySize[2] = { gs->gpuCUs, 64 }; res = clEnqueueNDRangeKernel(gs->commqueue, gs->kernel, 1, NULL, &(gs->gpuCUs), &(gs->gpuCUs), 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue kernel for execution\n"); exit(res); } res = clEnqueueReadBuffer(gs->commqueue, gpuoutput, CL_TRUE, 0, (wcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to read kernel output\n"); exit(1); } // u64 endtime = utime(); if (solution[0] == 0) { printf("UNSAT\n"); } else if (solution[0] == 1) { printf("SAT: "); for (u32 k = 0; k < c->cnts[0]; ++k) { u32 vind = (c->cnts[0] - 1) - k; u32 iind = vind >> 5U; u32 bind = vind & 0b11111U; u8 par = (solution[iind + 1] >> bind) & 1U; printf("%u", par); } if (CHECKASGN) { u8 checkres = 0; for (u32 i = 0; i < c->cnts[1]; ++i) { checkres = 0; for (u32 j = 0; j < c->clausedat[3 * i + 1]; ++j) { u32 v = c->variables[c->clausedat[3 * i] + j]; u32 vv = c->cnts[0] - 1; u32 g = (vv - v) >> 5U; u32 h = (vv - v) & 0b11111U; u8 paract = (solution[g + 1] >> h) & 1U; if (c->parities[c->clausedat[3 * i] + j] == paract) { checkres = 1; break; } } if (!checkres) break; } if (checkres) { printf(" \xE2\x9C\x93\n"); } else { printf(" -\n"); } } } else { printf("What the fuck???\n"); solution[0] = 3; } res = clReleaseMemObject(gpuheader); res = clReleaseMemObject(gpulvars); res = clReleaseMemObject(gpuvariables); res = clReleaseMemObject(gpuparities); res = clReleaseMemObject(gpuoutput); res = clReleaseMemObject(gpuscratchpad); i32 retval = (i32) solution[0]; free(solution); return retval; } void freeSolver(gpusolver* gs) { i32 res = 0; res = clFlush(gs->commqueue); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clFinish(gs->commqueue); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clReleaseKernel(gs->kernel); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clReleaseProgram(gs->program); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clReleaseCommandQueue(gs->commqueue); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clReleaseContext(gs->ctx); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } res = clReleaseDevice(gs->deviceid); if (res != CL_SUCCESS) { printf("Failed to release solver\n"); return; } free(gs->source_str); free(gs); } i32 gpusolve(cnf* c) { cl_platform_id platformid = NULL; cl_device_id deviceid = NULL; cl_uint numdevices; cl_uint numplatforms; FILE *fp; char *source_str; size_t source_size; fp = fopen("../psat.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(0x100000); source_size = fread( source_str, 1, 0x100000, fp); fclose( fp ); u32 wordcnt = 1 + ((c->cnts[0]) >> 5U); u32* solution = calloc((wordcnt + 1), sizeof(u32)); if (solution == NULL) { printf("Failed to allocate solution buffer\n"); exit(1); } mpz_t gmpmax; mpz_init(gmpmax); mpz_ui_pow_ui(gmpmax, 2, c->cnts[0]); mpz_div_ui(gmpmax, gmpmax, GLOBAL_SIZE); mpz_export(solution + 1, NULL, -1, sizeof(u32), 0, 0, gmpmax); mpz_clear(gmpmax); // printf("%lu\n", wordcnt); cl_int res = clGetPlatformIDs(1, &platformid, &numplatforms); if (res != CL_SUCCESS) { printf("Failed to retrieve OpenCL platform IDs\n"); exit(1); } // printf("Found %u platforms\n", numplatforms); res = clGetDeviceIDs(platformid, CL_DEVICE_TYPE_GPU, 1, &deviceid, &numdevices); if (res != CL_SUCCESS) { printf("Failed to retrieve OpenCL device IDs\n"); exit(1); } // printf("Found %u devices\n", numdevices); u64 memoryMax = 0; res = clGetDeviceInfo(deviceid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &memoryMax, NULL); if (res != CL_SUCCESS) { printf("Failed to query GPU memory\n"); exit(1); } u64 localMax = 0; res = clGetDeviceInfo(deviceid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMax, NULL); if (res != CL_SUCCESS) { printf("Failed to query GPU memory\n"); exit(1); } u64 allocMax = 0; res = clGetDeviceInfo(deviceid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &allocMax, NULL); if (res != CL_SUCCESS) { printf("Failed to query GPU memory\n"); exit(1); } printf("GPU mem: %lu %lu %lu\n", memoryMax, localMax, allocMax); size_t computeUnits = 0; res = clGetDeviceInfo(deviceid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &computeUnits, NULL); if (res != CL_SUCCESS) { printf("Failed to query GPU memory\n"); exit(1); } printf("Compute Units: %lu\n", computeUnits); cl_context context = clCreateContext(NULL, 1, &deviceid, NULL, NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL context\n"); exit(1); } cl_command_queue commqueue = clCreateCommandQueueWithProperties(context, deviceid, 0, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL command queue\n"); exit(1); } // Device memory buffers: /* For the CNF: * {clausecnt, literalcnt, varcnt) * variable array * clause array * parity array * * Other: * Status * A single counter */ // TODO: Look into DMA, maybe? Could do clause learning CPU-side and just update the GPU buffer cl_mem gpuheader = clCreateBuffer(context, CL_MEM_READ_ONLY, 2 * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF header buffer\n"); exit(1); } cl_mem gpulvars = clCreateBuffer(context, CL_MEM_READ_ONLY, 3 * c->cnts[1] * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF lvar buffer\n"); exit(1); } cl_mem gpuvariables = clCreateBuffer(context, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF variable buffer\n"); exit(1); } cl_mem gpuparities = clCreateBuffer(context, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uchar), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF parity buffer\n"); exit(1); } /* cl_mem gpumaxvals = clCreateBuffer(context, CL_MEM_READ_WRITE, GLOBAL_SIZE * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create CNF maxval buffer\n"); exit(1); } */ cl_mem gpuoutput = clCreateBuffer(context, CL_MEM_READ_WRITE, (wordcnt + 1) * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create output buffer\n"); exit(1); } cl_mem gpuscratchpad = clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * wordcnt * GLOBAL_SIZE * sizeof(cl_uint), NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create output buffer\n"); exit(1); } // Load buffers to GPU res = clEnqueueWriteBuffer(commqueue, gpuheader, CL_TRUE, 0, 2 * sizeof(cl_uint), c->cnts, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF header write\n"); exit(1); } res = clEnqueueWriteBuffer(commqueue, gpulvars, CL_TRUE, 0, 3 * c->cnts[1] * sizeof(cl_uint), c->clausedat, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF lvar write\n"); exit(1); } res = clEnqueueWriteBuffer(commqueue, gpuvariables, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uint), c->variables, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF variable write\n"); exit(1); } res = clEnqueueWriteBuffer(commqueue, gpuparities, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uchar), c->parities, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF parity write\n"); exit(1); } res = clEnqueueWriteBuffer(commqueue, gpuoutput, CL_TRUE, 0, (wordcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue CNF parity write\n"); exit(1); } cl_program satprog = clCreateProgramWithSource(context, 1, (const char**) &source_str, (const size_t*) &source_size, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL program\n"); exit(1); } res = clBuildProgram(satprog, 1, &deviceid, NULL, NULL, NULL); if (res != CL_SUCCESS) { char* logbuf = malloc(sizeof(char) * 65536); size_t loglen = 0; res = clGetProgramBuildInfo(satprog, deviceid, CL_PROGRAM_BUILD_LOG, sizeof(char) * 65536, logbuf, &loglen); if (res != CL_SUCCESS) { printf("Failed to retrieve build logs\n"); exit(1); } printf("Build failed\n"); printf("%s\n", logbuf); free(logbuf); exit(1); } cl_kernel kernel = clCreateKernel(satprog, "vectorSAT", &res); if (res != CL_SUCCESS) { printf("Failed to create kernel\n"); exit(1); } size_t maxworkgrpu = 0; res = clGetKernelWorkGroupInfo(kernel, deviceid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxworkgrpu, NULL); printf("Max work group size: %lu\n", maxworkgrpu); res = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &gpuheader); res = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &gpulvars); res = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*) &gpuvariables); res = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*) &gpuparities); res = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &gpuoutput); res = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &gpuscratchpad); // u64 starttime = utime(); size_t itemsize[2] = {GLOBAL_SIZE, LOCAL_SIZE }; res = clEnqueueNDRangeKernel(commqueue, kernel, 1, NULL, itemsize, itemsize + 1, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to queue kernel for execution\n"); exit(res); } res = clEnqueueReadBuffer(commqueue, gpuoutput, CL_TRUE, 0, (wordcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); if (res != CL_SUCCESS) { printf("Failed to read kernel output\n"); exit(1); } // u64 endtime = utime(); if (solution[0] == 0) { printf("UNSAT\n"); } else if (solution[0] == 1) { printf("SAT: "); for (u32 k = 0; k < c->cnts[0]; ++k) { u32 vind = (c->cnts[0] - 1) - k; u32 iind = vind >> 5U; u32 bind = vind & 0b11111U; u8 par = (solution[iind + 1] >> bind) & 1U; printf("%u", par); } if (CHECKASGN) { u8 checkres = 0; for (u32 i = 0; i < c->cnts[1]; ++i) { checkres = 0; for (u32 j = 0; j < c->clausedat[3 * i + 1]; ++j) { u32 v = c->variables[c->clausedat[3 * i] + j]; u32 vv = c->cnts[0] - 1; u32 g = (vv - v) >> 5U; u32 h = (vv - v) & 0b11111U; u8 paract = (solution[g + 1] >> h) & 1U; if (c->parities[c->clausedat[3 * i] + j] == paract) { checkres = 1; break; } } if (!checkres) break; } if (checkres) { printf(" \xE2\x9C\x93\n"); } else { printf(" -\n"); } } } else { printf("What the fuck???\n"); solution[0] = 3; } // printf("Actual time: %f seconds\n", ((f64) (endtime - starttime)) / 1000000.0); res = clFlush(commqueue); res = clFinish(commqueue); res = clReleaseKernel(kernel); res = clReleaseProgram(satprog); res = clReleaseCommandQueue(commqueue); res = clReleaseContext(context); res = clReleaseMemObject(gpuheader); res = clReleaseMemObject(gpulvars); res = clReleaseMemObject(gpuvariables); res = clReleaseMemObject(gpuparities); res = clReleaseMemObject(gpuoutput); res = clReleaseDevice(deviceid); i32 retval = (i32) solution[0]; free(solution); free(source_str); return retval; }