Fix OpenCL implementation
This commit is contained in:
parent
e7ec45ba0a
commit
09fea04022
11
src/clm.c
11
src/clm.c
@ -253,11 +253,20 @@ void clm_freeLinear(clm_Linear linear) {
|
||||
clm_freeMatrix(linear.bias.matrix);
|
||||
}
|
||||
|
||||
void clm_vectorPrint(clm_Vector vec) {
|
||||
printf("[");
|
||||
for(unsigned int i = 0; i < vec.length; i++) {
|
||||
printf("%f", vec.values[i]);
|
||||
if(i != vec.length - 1) printf(" ");
|
||||
}
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
void clm_matrixPrint(clm_Matrix mat) {
|
||||
printf("[\n");
|
||||
for(unsigned int i = 0; i < mat.rows; i++) {
|
||||
for(unsigned int j = 0; j < mat.cols; j++) {
|
||||
printf("%7.3f", matrixAt(mat, i, j));
|
||||
printf("%f ", matrixAt(mat, i, j));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
@ -81,6 +81,7 @@ bool clm_vectorIsInvalid(clm_Vector vec);
|
||||
clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs);
|
||||
clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize);
|
||||
|
||||
void clm_vectorPrint(clm_Vector vec);
|
||||
void clm_matrixPrint(clm_Matrix mat);
|
||||
void clm_matrixPrintShape(clm_Matrix mat);
|
||||
void clm_freeMatrix(clm_Matrix mat);
|
||||
|
@ -49,7 +49,7 @@ int clm_gpuInit(unsigned int mode) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s", buffer);
|
||||
// printf("%s", buffer);
|
||||
|
||||
context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
|
||||
if(!context) {
|
||||
@ -132,13 +132,16 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
|
||||
cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf);
|
||||
|
||||
for(unsigned int i = 0; i < numMats; i++) {
|
||||
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
|
||||
err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
@ -153,20 +156,22 @@ static cl_mem writeNativeMatrix(clm_NativeMatrix matrix) {
|
||||
return mem;
|
||||
}
|
||||
|
||||
static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array) {
|
||||
static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) {
|
||||
clm_Matrix mat = array.matrixes[0];
|
||||
cl_mem mem = array.native->mem;
|
||||
|
||||
// TODO: don't do blocking writes, instead wait once at the end
|
||||
size_t matLength = sizeof(float) * mat.rows * mat.cols;
|
||||
for(unsigned int i = 0; i < array.length; i++) {
|
||||
cl_int err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
|
||||
cl_int err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
@ -175,12 +180,15 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
|
||||
cl_mem mem = nativeBuf->mem;
|
||||
|
||||
for(unsigned int i = 0; i < numMats; i++) {
|
||||
err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
|
||||
err = clEnqueueReadBuffer(queue, mem, CL_FALSE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
}
|
||||
|
||||
static void readNativeMatrix(clm_NativeMatrix matrix) {
|
||||
@ -191,18 +199,21 @@ static void readNativeMatrix(clm_NativeMatrix matrix) {
|
||||
}
|
||||
}
|
||||
|
||||
static void readNativeMatrixArray(clm_NativeMatrixArray array) {
|
||||
static void readNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) {
|
||||
clm_Matrix mat = array.matrixes[0];
|
||||
size_t matLength = sizeof(float) * mat.rows * mat.cols;
|
||||
|
||||
// TODO: don't do blocking reads, instead wait once at the end
|
||||
for(unsigned int i = 0; i < array.length; i++) {
|
||||
cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
|
||||
for(unsigned int i = 0; i < n; i++) {
|
||||
cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
}
|
||||
|
||||
static void clm_nativeAllocMatrix(clm_NativeMatrix *matrix, cl_mem_flags flags) {
|
||||
@ -281,13 +292,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
/*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to read from buffer\n");
|
||||
return;
|
||||
}*/
|
||||
|
||||
readNativeMatrixArray(linear->output);
|
||||
readNativeMatrixArray(linear->output, batchSize);
|
||||
}
|
||||
|
||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) {
|
||||
@ -305,8 +310,8 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
||||
|
||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
|
||||
cl_mem matWeights_values = writeNativeMatrix(linear->weights);
|
||||
cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error);
|
||||
cl_mem matOutput_values = writeNativeMatrixArray(linear->output);
|
||||
cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error, batchSize);
|
||||
cl_mem matOutput_values = writeNativeMatrixArray(linear->output, batchSize);
|
||||
if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
|
||||
printf("Failed to write GPU mats\n");
|
||||
return;
|
||||
@ -333,13 +338,17 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to set kernel args (2): %d\n", err);
|
||||
printf("Failed to set kernel args (2): %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols;
|
||||
size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local;
|
||||
clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
|
||||
err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue: %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
@ -354,9 +363,11 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors);
|
||||
|
||||
if(updateErrors) {
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
|
||||
if(updateErrors) {
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values);
|
||||
} else {
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), NULL);
|
||||
}
|
||||
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors);
|
||||
@ -364,23 +375,23 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to set kernel args (3): %d\n", err);
|
||||
printf("Failed to set kernel args (3): %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols;
|
||||
size_t step2Global = ceil((float) step1WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
|
||||
clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
|
||||
size_t step2Global = ceil((float) step2WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
|
||||
err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue: %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
clm_matrixPrint(linear->weightsError.matrixes[0]);
|
||||
|
||||
readNativeMatrixArray(linear->weightsError);
|
||||
readNativeMatrixArray(linear->gradient);
|
||||
|
||||
clm_matrixPrint(linear->weightsError.matrixes[0]);
|
||||
readNativeMatrixArray(linear->weightsError, batchSize);
|
||||
readNativeMatrixArray(linear->gradient, batchSize);
|
||||
|
||||
if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputErrors);
|
||||
}
|
||||
|
44
src/cltest.c
44
src/cltest.c
@ -37,9 +37,12 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels
|
||||
// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
|
||||
// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
|
||||
float *pred = predict(nn, images[idx]);
|
||||
|
||||
// if(idx < 100) clm_vectorPrint((clm_Vector){.values = pred, .length = labels[0].length});
|
||||
|
||||
unsigned int predDigit = 0;
|
||||
float max = -1;
|
||||
for(unsigned int j = 0; j < 10; j++) {
|
||||
for(unsigned int j = 0; j < labels[0].length; j++) {
|
||||
// printf("%.2f ", pred[j]);
|
||||
if(pred[j] > max || max < 0) {
|
||||
max = pred[j];
|
||||
@ -50,15 +53,17 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels
|
||||
|
||||
unsigned int actDigit = 0;
|
||||
float maxA = -1;
|
||||
for(unsigned int j = 0; j < 10; j++) {
|
||||
for(unsigned int j = 0; j < labels[0].length; j++) {
|
||||
// printf("%.2f ", pred[j]);
|
||||
if(labels[idx].values[j] > maxA || maxA < 0) {
|
||||
maxA = labels[idx].values[j];
|
||||
actDigit = j;
|
||||
}
|
||||
}
|
||||
// if(idx < 100) printf("Actual: %u\n", actDigit);
|
||||
// if(idx < 100) {
|
||||
// printf("Actual: %u\n", actDigit);
|
||||
// printf("\n");
|
||||
// }
|
||||
|
||||
if(predDigit == actDigit) correct++;
|
||||
}
|
||||
@ -233,7 +238,7 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
srand(1);
|
||||
|
||||
unsigned int
|
||||
/*unsigned int
|
||||
i = 784,
|
||||
h = 30,
|
||||
o = 10;
|
||||
@ -241,13 +246,40 @@ int main(int argc, const char *argv[]) {
|
||||
clm_Linear layers[] = {
|
||||
clm_linearCreateRandom(i, h),
|
||||
clm_linearCreateRandom(h, o)};
|
||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);
|
||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);*/
|
||||
|
||||
float v_00[2] = {0, 0};
|
||||
float v_01[2] = {0, 1};
|
||||
float v_10[2] = {1, 0};
|
||||
float v_11[2] = {1, 1};
|
||||
|
||||
images = calloc(4, sizeof(clm_Vector));
|
||||
images[0] = (clm_Vector){.values = v_00, .length = 2};
|
||||
images[1] = (clm_Vector){.values = v_01, .length = 2};
|
||||
images[2] = (clm_Vector){.values = v_10, .length = 2};
|
||||
images[3] = (clm_Vector){.values = v_11, .length = 2};
|
||||
|
||||
labels = calloc(4, sizeof(clm_Vector));
|
||||
labels[0] = (clm_Vector){.values = v_10, .length = 2};
|
||||
labels[1] = (clm_Vector){.values = v_01, .length = 2};
|
||||
labels[2] = (clm_Vector){.values = v_01, .length = 2};
|
||||
labels[3] = (clm_Vector){.values = v_01, .length = 2};
|
||||
|
||||
imageCount = 4;
|
||||
|
||||
unsigned int
|
||||
i = 2,
|
||||
o = 2;
|
||||
|
||||
clm_Linear layers[] = {
|
||||
clm_linearCreateRandom(i, o)};
|
||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.5, 4);
|
||||
|
||||
for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
|
||||
clm_linearInit(&nn.layers[i]);
|
||||
}
|
||||
|
||||
for(unsigned int epoch = 0; epoch < 10; epoch++) {
|
||||
for(unsigned int epoch = 0; epoch < 1000; epoch++) {
|
||||
printf("Epoch %u\n", epoch);
|
||||
/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
|
||||
if(idx % 1000 == 0) {
|
||||
|
28
src/mat.cl
28
src/mat.cl
@ -6,17 +6,6 @@ typedef struct __attribute__((packed)) {
|
||||
|
||||
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
||||
|
||||
#define matrixGetIJ(mat, idx, i, j) \
|
||||
{ \
|
||||
if(!mat.transposed) { \
|
||||
i = idx / mat.cols; \
|
||||
j = idx % mat.cols; \
|
||||
} else { \
|
||||
i = idx % mat.rows; \
|
||||
j = idx / mat.rows; \
|
||||
} \
|
||||
}
|
||||
|
||||
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= matOut.rows * matOut.cols) return;
|
||||
@ -66,7 +55,7 @@ void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= mat.rows * mat.cols) return;
|
||||
|
||||
other_values[idx] *= mat_values[idx];
|
||||
mat_values[idx] *= other_values[idx];
|
||||
}
|
||||
|
||||
void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
|
||||
@ -84,6 +73,16 @@ cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
|
||||
return tr;
|
||||
}
|
||||
|
||||
void clm_matrixPrint(cl_GPUMat mat, __global float *ptr) {
|
||||
printf("THEMATRIX: [");
|
||||
for(unsigned int i = 0; i < mat.rows; i++) {
|
||||
for(unsigned int j = 0; j < mat.cols; j++) {
|
||||
printf("%7.6f", matrixAt(mat, ptr, i, j));
|
||||
}
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
__kernel void linear_forward(unsigned int batchSize,
|
||||
cl_GPUMat input, __global float *input_values,
|
||||
cl_GPUMat weights, __global float *weights_values,
|
||||
@ -93,6 +92,7 @@ __kernel void linear_forward(unsigned int batchSize,
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
__global float *batchInput_values = input_values + b * input.rows * input.cols;
|
||||
__global float *batchOut_values = out_values + b * out.rows * out.cols;
|
||||
|
||||
mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);
|
||||
mat_add(out, batchOut_values, bias, bias_values);
|
||||
mat_sigmoid(out, batchOut_values);
|
||||
@ -136,8 +136,8 @@ __kernel void linear_backprop_2(unsigned int batchSize,
|
||||
mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
|
||||
|
||||
if(updateErrors) {
|
||||
cl_GPUMat weightsT = clm_matrixTranspose(weightsT);
|
||||
// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
|
||||
cl_GPUMat weightsT = clm_matrixTranspose(weights);
|
||||
mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user