From e2b633c802d39460ef6c2bba8e57c6e4441a0012 Mon Sep 17 00:00:00 2001 From: gothictomato Date: Sat, 27 Aug 2022 09:11:29 -0400 Subject: [PATCH] Working to make OpenCL recognize the limitations of the device its on, as well as reusing the OpenCL kernel/context for multiple runs. Signed-off-by: gothictomato --- gpusolver.c | 377 ++++++++++++++++++++++++++++++++++++++++++++- gpusolver.h | 23 +++ main.c | 16 +- psat.cl | 12 +- tests/masterTest.c | 12 +- 5 files changed, 424 insertions(+), 16 deletions(-) diff --git a/gpusolver.c b/gpusolver.c index 1709c14..1a84f01 100644 --- a/gpusolver.c +++ b/gpusolver.c @@ -1,16 +1,346 @@ #include "gpusolver.h" -#include #include "time.h" #include "gmp.h" -#define LOCAL_SIZE (128) -#define GLOBAL_SIZE (1024) +#define LOCAL_SIZE (64) +#define GLOBAL_SIZE (2048) #define CHECKASGN (true) #define DEBUG + + + +gpusolver* initSolver() { + gpusolver* o = calloc(1, sizeof(gpusolver)); + if (o == NULL) return NULL; + + o->platformid = NULL; + o->numplatforms = 0; + + o->deviceid = NULL; + o->numdevices = 0; + + FILE* fp = fopen("../psat.cl", "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel\n"); + // TODO: Cleanup + return NULL; + } + o->source_str = malloc(0x100000); + o->source_size = fread(o->source_str, 1, 0x100000, fp); + o->source_str = realloc(o->source_str, o->source_size + 1); + if (o->source_str == NULL) { + printf("Failed to reallocate source\n"); + return NULL; + } + fclose(fp); + + cl_int res = clGetPlatformIDs(1, &(o->platformid), &(o->numplatforms)); + if (res != CL_SUCCESS) { + printf("Failed to retrieve OpenCL platform IDs\n"); + // TODO: Cleanup + return NULL; + } + + res = clGetDeviceIDs(o->platformid, CL_DEVICE_TYPE_GPU, 1, &(o->deviceid), &(o->numdevices)); + if (res != CL_SUCCESS) { + printf("Failed to retrieve OpenCL device IDs\n"); + // TODO: Cleanup + return NULL; + } + + o->ctx = clCreateContext(NULL, 1, &(o->deviceid), NULL, NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create OpenCL context\n"); + // TODO: Cleanup + return NULL; + } + + o->commqueue = clCreateCommandQueueWithProperties(o->ctx, o->deviceid, 0, &res); + if (res != CL_SUCCESS) { + printf("Failed to create OpenCL command queue\n"); + // TODO: Cleanup + return NULL; + } + + + + res = clGetDeviceInfo(o->deviceid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &(o->gpuMemoryMax), NULL); + if (res != CL_SUCCESS) { + printf("Failed to query total GPU memory\n"); + // TODO: CLeanup + return NULL; + } + + res = clGetDeviceInfo(o->deviceid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &(o->gpuLocalMax), NULL); + if (res != CL_SUCCESS) { + printf("Failed to query total GPU memory\n"); + // TODO: CLeanup + return NULL; + } + + res = clGetDeviceInfo(o->deviceid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &(o->gpuAllocMax), NULL); + if (res != CL_SUCCESS) { + printf("Failed to query total GPU memory\n"); + // TODO: CLeanup + return NULL; + } + + res = clGetDeviceInfo(o->deviceid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &(o->gpuCUs), NULL); + if (res != CL_SUCCESS) { + printf("Failed to query total GPU memory\n"); + // TODO: Cleanup + return NULL; + } + + o->program = clCreateProgramWithSource(o->ctx, 1, (const char**) &(o->source_str), (const size_t*) &(o->source_size), &res); + if (res != CL_SUCCESS) { + printf("Failed to create OpenCL program\n"); + // TODO: Cleanup + exit(1); + } + + res = clBuildProgram(o->program, 1, &(o->deviceid), NULL, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Build failed\n"); + // TODO: Cleanup + exit(1); + } + + + size_t loglen = 0; + res = clGetProgramBuildInfo(o->program, o->deviceid, CL_PROGRAM_BUILD_LOG, NULL, NULL, &loglen); + if (res != CL_SUCCESS) { + printf("Failed to retrieve build logs\n"); + exit(1); + } + char* logbuf = malloc(sizeof(char) * loglen); + res = clGetProgramBuildInfo(o->program, o->deviceid, CL_PROGRAM_BUILD_LOG, sizeof(char) * loglen, logbuf, &loglen); + if (res != CL_SUCCESS) { + printf("Failed to retrieve build logs\n"); + exit(1); + } + printf("%*.s\n", (int) loglen, logbuf); + free(logbuf); + + o->kernel = clCreateKernel(o->program, "vectorSAT", &res); + if (res != CL_SUCCESS) { + printf("Failed to create kernel\n"); + printf("%d\n", res); + // TODO: Cleanup + exit(1); + } + + printf("Initialized solver:\n"); + printf("\tCompute Units: %lu\n", o->gpuCUs); + printf("\tMax Global Memory: %lu\n", o->gpuMemoryMax); + printf("\tMax Local Memory: %lu\n", o->gpuLocalMax); + printf("\tMax Alloc Memory: %lu\n", o->gpuAllocMax); + + return o; +} + +i32 gpusolve2(gpusolver* gs, cnf* c) { + u32 wcnt = 1 + (c->cnts[0] >> 5U); + + u32* solution = calloc((wcnt + 1), sizeof(u32)); + if (solution == NULL) { + printf("Failed to allocate solution buffer\n"); + exit(1); + } + + mpz_t gmpmax; + mpz_init(gmpmax); + mpz_ui_pow_ui(gmpmax, 2, c->cnts[0]); + mpz_div_ui(gmpmax, gmpmax, gs->gpuCUs); + mpz_export(solution + 1, NULL, -1, sizeof(u32), 0, 0, gmpmax); + // mpz_out_str(stdout, 10, gmpmax); + // printf("\n\n"); + mpz_clear(gmpmax); + + cl_int res = 2; + + cl_mem gpuheader = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, 2 * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create CNF header buffer\n"); + exit(1); + } + cl_mem gpulvars = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, 3 * c->cnts[1] * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create CNF lvar buffer\n"); + exit(1); + } + cl_mem gpuvariables = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create CNF variable buffer\n"); + exit(1); + } + cl_mem gpuparities = clCreateBuffer(gs->ctx, CL_MEM_READ_ONLY, c->cnts[2] * sizeof(cl_uchar), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create CNF parity buffer\n"); + exit(1); + } + + cl_mem gpuoutput = clCreateBuffer(gs->ctx, CL_MEM_READ_WRITE, (wcnt + 1) * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create output buffer\n"); + exit(1); + } + + cl_mem gpuscratchpad = clCreateBuffer(gs->ctx, CL_MEM_READ_WRITE, 2 * wcnt * gs->gpuCUs * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create output buffer\n"); + exit(1); + } + + // Load buffers to GPU + res = clEnqueueWriteBuffer(gs->commqueue, gpuheader, CL_TRUE, 0, 2 * sizeof(cl_uint), c->cnts, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue CNF header write\n"); + exit(1); + } + res = clEnqueueWriteBuffer(gs->commqueue, gpulvars, CL_TRUE, 0, 3 * c->cnts[1] * sizeof(cl_uint), c->clausedat, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue CNF lvar write\n"); + exit(1); + } + res = clEnqueueWriteBuffer(gs->commqueue, gpuvariables, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uint), c->variables, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue CNF variable write\n"); + exit(1); + } + res = clEnqueueWriteBuffer(gs->commqueue, gpuparities, CL_TRUE, 0, c->cnts[2] * sizeof(cl_uchar), c->parities, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue CNF parity write\n"); + exit(1); + } + + res = clEnqueueWriteBuffer(gs->commqueue, gpuoutput, CL_TRUE, 0, (wcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue CNF parity write\n"); + exit(1); + } + + res = clSetKernelArg(gs->kernel, 0, sizeof(cl_mem), (void*) &gpuheader); + res = clSetKernelArg(gs->kernel, 1, sizeof(cl_mem), (void*) &gpulvars); + res = clSetKernelArg(gs->kernel, 2, sizeof(cl_mem), (void*) &gpuvariables); + res = clSetKernelArg(gs->kernel, 3, sizeof(cl_mem), (void*) &gpuparities); + + res = clSetKernelArg(gs->kernel, 4, sizeof(cl_mem), (void*) &gpuoutput); + + res = clSetKernelArg(gs->kernel, 5, sizeof(cl_mem), (void*) &gpuscratchpad); + + size_t deploySize[2] = { gs->gpuCUs, 1 }; + res = clEnqueueNDRangeKernel(gs->commqueue, gs->kernel, 1, NULL, &(gs->gpuCUs), &(gs->gpuCUs), 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to queue kernel for execution\n"); + exit(res); + } + + res = clEnqueueReadBuffer(gs->commqueue, gpuoutput, CL_TRUE, 0, (wcnt + 1) * sizeof(cl_uint), solution, 0, NULL, NULL); + if (res != CL_SUCCESS) { + printf("Failed to read kernel output\n"); + exit(1); + } + // u64 endtime = utime(); + + if (solution[0] == 0) { + printf("UNSAT\n"); + } else if (solution[0] == 1) { + printf("SAT: "); + for (u32 k = 0; k < c->cnts[0]; ++k) { + u32 vind = (c->cnts[0] - 1) - k; + u32 iind = vind >> 5U; + u32 bind = vind & 0b11111U; + u8 par = (solution[iind + 1] >> bind) & 1U; + printf("%u", par); + } + if (CHECKASGN) { + u8 checkres = 0; + for (u32 i = 0; i < c->cnts[1]; ++i) { + checkres = 0; + for (u32 j = 0; j < c->clausedat[3 * i + 1]; ++j) { + u32 v = c->variables[c->clausedat[3 * i] + j]; + u32 vv = c->cnts[0] - 1; + u32 g = (vv - v) >> 5U; + u32 h = (vv - v) & 0b11111U; + u8 paract = (solution[g + 1] >> h) & 1U; + if (c->parities[c->clausedat[3 * i] + j] == paract) { + checkres = 1; + break; + } + } + if (!checkres) break; + } + if (checkres) { + printf(" \xE2\x9C\x93\n"); + } else { + printf(" -\n"); + } + } + } else { + printf("What the fuck???\n"); + solution[0] = 3; + } + + res = clReleaseMemObject(gpuheader); + res = clReleaseMemObject(gpulvars); + res = clReleaseMemObject(gpuvariables); + res = clReleaseMemObject(gpuparities); + res = clReleaseMemObject(gpuoutput); + res = clReleaseMemObject(gpuscratchpad); + + i32 retval = (i32) solution[0]; + free(solution); + return retval; +} + +void freeSolver(gpusolver* gs) { + i32 res = 0; + res = clFlush(gs->commqueue); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + res = clFinish(gs->commqueue); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + res = clReleaseKernel(gs->kernel); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + res = clReleaseProgram(gs->program); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + + res = clReleaseCommandQueue(gs->commqueue); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + res = clReleaseContext(gs->ctx); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + + res = clReleaseDevice(gs->deviceid); + if (res != CL_SUCCESS) { + printf("Failed to release solver\n"); + return; + } + free(gs->source_str); + free(gs); +} + i32 gpusolve(cnf* c) { cl_platform_id platformid = NULL; cl_device_id deviceid = NULL; @@ -45,6 +375,7 @@ i32 gpusolve(cnf* c) { mpz_export(solution + 1, NULL, -1, sizeof(u32), 0, 0, gmpmax); mpz_clear(gmpmax); + // printf("%lu\n", wordcnt); cl_int res = clGetPlatformIDs(1, &platformid, &numplatforms); if (res != CL_SUCCESS) { printf("Failed to retrieve OpenCL platform IDs\n"); @@ -58,6 +389,36 @@ i32 gpusolve(cnf* c) { } // printf("Found %u devices\n", numdevices); + u64 memoryMax = 0; + res = clGetDeviceInfo(deviceid, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &memoryMax, NULL); + if (res != CL_SUCCESS) { + printf("Failed to query GPU memory\n"); + exit(1); + } + u64 localMax = 0; + res = clGetDeviceInfo(deviceid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMax, NULL); + if (res != CL_SUCCESS) { + printf("Failed to query GPU memory\n"); + exit(1); + } + u64 allocMax = 0; + res = clGetDeviceInfo(deviceid, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &allocMax, NULL); + if (res != CL_SUCCESS) { + printf("Failed to query GPU memory\n"); + exit(1); + } + printf("GPU mem: %lu %lu %lu\n", memoryMax, localMax, allocMax); + + size_t computeUnits = 0; + res = clGetDeviceInfo(deviceid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &computeUnits, NULL); + if (res != CL_SUCCESS) { + printf("Failed to query GPU memory\n"); + exit(1); + } + + printf("Compute Units: %lu\n", computeUnits); + + cl_context context = clCreateContext(NULL, 1, &deviceid, NULL, NULL, &res); if (res != CL_SUCCESS) { printf("Failed to create OpenCL context\n"); @@ -118,6 +479,12 @@ i32 gpusolve(cnf* c) { exit(1); } + cl_mem gpuscratchpad = clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * wordcnt * GLOBAL_SIZE * sizeof(cl_uint), NULL, &res); + if (res != CL_SUCCESS) { + printf("Failed to create output buffer\n"); + exit(1); + } + // Load buffers to GPU res = clEnqueueWriteBuffer(commqueue, gpuheader, CL_TRUE, 0, 2 * sizeof(cl_uint), c->cnts, 0, NULL, NULL); if (res != CL_SUCCESS) { @@ -176,7 +543,7 @@ i32 gpusolve(cnf* c) { size_t maxworkgrpu = 0; res = clGetKernelWorkGroupInfo(kernel, deviceid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxworkgrpu, NULL); - // printf("Max work group size: %lu\n", maxworkgrpu); + printf("Max work group size: %lu\n", maxworkgrpu); res = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &gpuheader); res = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &gpulvars); @@ -185,7 +552,7 @@ i32 gpusolve(cnf* c) { res = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*) &gpuoutput); - res = clSetKernelArg(kernel, 5, 2 * wordcnt * sizeof(cl_uint) * LOCAL_SIZE, NULL); + res = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*) &gpuscratchpad); // u64 starttime = utime(); size_t itemsize[2] = {GLOBAL_SIZE, LOCAL_SIZE }; diff --git a/gpusolver.h b/gpusolver.h index a1d91a1..2bb27d1 100644 --- a/gpusolver.h +++ b/gpusolver.h @@ -1,4 +1,27 @@ #pragma once #include "ncnf.h" +#include + +typedef struct { + cl_platform_id platformid; + cl_device_id deviceid; + cl_uint numdevices; + cl_uint numplatforms; + char* source_str; + size_t source_size; + cl_context ctx; + cl_command_queue commqueue; + cl_program program; + cl_kernel kernel; + u64 gpuMemoryMax; + u64 gpuLocalMax; + u64 gpuAllocMax; + u64 gpuCUs; +} gpusolver; + +gpusolver* initSolver(); +i32 gpusolve2(gpusolver* gs, cnf* c); +void freeSolver(gpusolver* gs); + i32 gpusolve(cnf* c); \ No newline at end of file diff --git a/main.c b/main.c index 21663ee..7c9567c 100644 --- a/main.c +++ b/main.c @@ -250,17 +250,27 @@ int main() { + + /* - cnf* c = readDIMACS("/home/lev/Downloads/logistics/logistics.d.cnf"); + cnf* c = readDIMACS("/home/lev/Downloads/uf20/uf20-03.cnf"); sortlastnum(c); - printf("%u\n", c->cnts[0]); + // gpusolve(c); - gpusolve(c); + + gpusolver* gs = initSolver(); + if (gs == NULL) return -1; + + i32 res = gpusolve2(gs, c); + + freeSolver(gs); freecnf(c); + return 0; */ + runTests(); return 0; /* diff --git a/psat.cl b/psat.cl index 13fd003..9305baa 100644 --- a/psat.cl +++ b/psat.cl @@ -61,7 +61,7 @@ void mul(uint* c, uint len, uint* a, uint b) { } } -__kernel void vectorSAT(__global const uint* cnfhdr, __global const uint* clausedat, __global const uint* vars, __global const uchar* pars, __global uint* output, __local uint* lctrs) { +__kernel void vectorSAT(__global const uint* cnfhdr, __global const uint* clausedat, __global const uint* vars, __global const uchar* pars, __global uint* output, __global uint* lctrs) { uint locid = get_local_id(0); uint locsz = get_local_size(0); // uint grpid = get_group_id(0); @@ -76,21 +76,23 @@ __kernel void vectorSAT(__global const uint* cnfhdr, __global const uint* clause uint addval = 0; - if (globid == 0) output[0] = 0; - uint* ctr = lctrs + wcnt * 2 * locid; - uint* max = lctrs + wcnt * (2 * locid + 1); + output[0] = 0; + + uint* ctr = lctrs + wcnt * 2 * globid; + uint* max = lctrs + wcnt * (2 * globid + 1); for (uint i = 0; i < wcnt; ++i) { ctr[i] = max[i] = 0; } mul(ctr, wcnt, output + 1, globid); - if (globid == globsz) { + if (globid == globsz - 1) { stateaddpow(wcnt, max, cnfhdr[0]); } else { mul(max, wcnt, output + 1, globid + 1); } + // printf("%u %u\n", ctr[0], max[0]); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); diff --git a/tests/masterTest.c b/tests/masterTest.c index db87a35..c079b1e 100644 --- a/tests/masterTest.c +++ b/tests/masterTest.c @@ -24,6 +24,7 @@ i32 runTests() { i32 runuf20() { + gpusolver* gs = initSolver(); // printf("Running against uf20\n"); u32 passed = 0; u64 tottime = 0; @@ -36,7 +37,7 @@ i32 runuf20() { sortlastnum(c); u64 start = utime(); - i32 res = gpusolve(c); + i32 res = gpusolve2(gs, c); u64 stop = utime(); tottime += (stop - start); @@ -46,10 +47,12 @@ i32 runuf20() { // printf("Passed %u / 1000 tests\n", passed); // printf("Took %lf s total, %lf s on avg\n", ((f64) tottime) / 1000000.0, ((f64) tottime) / 1000000000.0); if (passed == 1000) return 0; + freeSolver(gs); return 1; } i32 runuf50() { + gpusolver* gs = initSolver(); // printf("Running against uf50\n"); u32 passed = 0; u64 tottime = 0; @@ -62,7 +65,7 @@ i32 runuf50() { sortlastnum(c); u64 start = utime(); - i32 res = gpusolve(c); + i32 res = gpusolve2(gs, c); u64 stop = utime(); tottime += (stop - start); @@ -72,10 +75,12 @@ i32 runuf50() { // printf("Passed %u / 1000 tests\n", passed); // printf("Took %lf s total, %lf s on avg\n", ((f64) tottime) / 1000000.0, ((f64) tottime) / 1000000000.0); if (passed == 1000) return 0; + freeSolver(gs); return 1; } i32 runuuf50() { + gpusolver* gs = initSolver(); // printf("Running against uuf50\n"); u32 passed = 0; u64 tottime = 0; @@ -88,7 +93,7 @@ i32 runuuf50() { sortlastnum(c); u64 start = utime(); - i32 res = gpusolve(c); + i32 res = gpusolve2(gs, c); u64 stop = utime(); tottime += (stop - start); @@ -98,5 +103,6 @@ i32 runuuf50() { // printf("Passed %u / 1000 tests\n", passed); // printf("Took %lf s total, %lf s on avg\n", ((f64) tottime) / 1000000.0, ((f64) tottime) / 1000000000.0); if (passed == 1000) return 0; + freeSolver(gs); return 1; } \ No newline at end of file