More OpenCL debugging

This commit is contained in:
MrLetsplay 2024-02-22 17:42:19 +01:00
parent 707a9bf754
commit d55f7c69ba
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
3 changed files with 62 additions and 69 deletions

View File

@@ -230,6 +230,9 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	}*/
 	readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
+
+	clFlush(queue);
+	clFinish(queue);
 }
 
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
@@ -320,4 +323,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
 	readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
 	if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
+
+	clFlush(queue);
+	clFinish(queue);
 }
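
The clFlush/clFinish pair added after the reads in both functions makes the host block until every queued command has completed before the output matrices are touched (clFinish already implies a flush, so the clFlush is strictly redundant, but harmless while debugging). A minimal sketch of the narrower alternative, assuming readGPUMats currently enqueues non-blocking reads; the helper below is hypothetical and not part of this repo:

#include <CL/cl.h>

/* Hypothetical helper: a blocking read (CL_TRUE) waits for just this one
 * transfer to land in dst, so no separate clFlush/clFinish is needed
 * before the host reads the destination memory. */
static cl_int readBufferBlocking(cl_command_queue queue, cl_mem src,
                                 size_t bytes, void *dst) {
	return clEnqueueReadBuffer(queue, src, CL_TRUE /* blocking */,
	                           0, bytes, dst, 0, NULL, NULL);
}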

View File

@@ -31,6 +31,41 @@ float *predict(clm_NN nn, clm_Vector input) {
 	return xM.values;
 }
 
+float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels) {
+	unsigned int correct = 0;
+	for(unsigned int idx = 0; idx < count; idx++) { // Each train sample
+		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+		float *pred = predict(nn, images[idx]);
+
+		unsigned int predDigit = 0;
+		float max = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(pred[j] > max || max < 0) {
+				max = pred[j];
+				predDigit = j;
+			}
+		}
+		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+
+		unsigned int actDigit = 0;
+		float maxA = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(labels[idx].values[j] > maxA || maxA < 0) {
+				maxA = labels[idx].values[j];
+				actDigit = j;
+			}
+		}
+		// if(idx < 100) printf("Actual: %u\n", actDigit);
+
+		// printf("\n");
+		if(predDigit == actDigit) correct++;
+	}
+	return (float) correct / count * 100;
+}
+
 void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
 	clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix));
 	clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix));
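
Both scans in the new eval() are an argmax over the ten output floats; as a sketch, they could share one helper (argmaxN is hypothetical, not in the repo):

/* Hypothetical helper: index of the largest of n floats, equivalent to the
 * max/maxA loops in eval() above (which start from max = -1 instead). */
static unsigned int argmaxN(const float *v, unsigned int n) {
	unsigned int best = 0;
	for(unsigned int j = 1; j < n; j++) {
		if(v[j] > v[best]) best = j;
	}
	return best;
}

With it, predDigit becomes argmaxN(pred, 10) and actDigit becomes argmaxN(labels[idx].values, 10).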
@@ -73,31 +108,6 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 				clm_matrixAddMatrix(layer->bias, layer->gradient[b]);
 			}
 		}
-
-		/*for(int i = nn.numLayers - 1; i >= 0; i--) {
-			clm_Linear layer = nn.layers[i];
-			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
-			clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
-			clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
-			clm_Matrix error = layer.error;
-
-			if(i == nn.numLayers - 1) {
-				clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y
-			} else {
-				clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-				clm_matrixMultiplyMatrix(weightsT, prevError, error);
-			}
-
-			clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat)
-			clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
-			clm_matrixMultiplyScalar(gradient, nn.learnRate);
-
-			clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]);
-			clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
-
-			clm_matrixAddMatrix(layer.weights, layer.weightsError);
-			clm_matrixAddMatrix(layer.bias, gradient);
-		}*/
 	}
 
 	free(batchInputs);
@@ -216,7 +226,7 @@ int main(int argc, const char *argv[]) {
 	unsigned int imageCount;
 	loadImages(&images, &imageCount);
-	imageCount = 60000;
+	imageCount = 600;
 
 	printf("%f\n", images[0].values[0]);
@@ -230,13 +240,13 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);
 
 	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
 		clm_linearInit(&nn.layers[i]);
 	}
 
-	for(unsigned int epoch = 0; epoch < 1; epoch++) {
+	for(unsigned int epoch = 0; epoch < 10; epoch++) {
 		printf("Epoch %u\n", epoch);
 		/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
@@ -245,45 +255,15 @@ int main(int argc, const char *argv[]) {
 			}
 		}*/
 		train(nn, imageCount, images, labels);
+		printf("Score: %.2f\n", eval(nn, imageCount, images, labels));
 		printf("\n");
 	}
 
 	printf("Train done\n");
 
-	unsigned int correct = 0;
-	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
-		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
-		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
-		float *pred = predict(nn, images[idx]);
-
-		unsigned int predDigit = 0;
-		float max = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(pred[j] > max || max < 0) {
-				max = pred[j];
-				predDigit = j;
-			}
-		}
-		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
-
-		unsigned int actDigit = 0;
-		float maxA = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(labels[idx].values[j] > maxA || maxA < 0) {
-				maxA = labels[idx].values[j];
-				actDigit = j;
-			}
-		}
-		// if(idx < 100) printf("Actual: %u\n", actDigit);
-
-		// printf("\n");
-		if(predDigit == actDigit) correct++;
-	}
-	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
-	printf("\n");
+	float score = eval(nn, imageCount, images, labels);
+	printf("Correct: %.2f\n", score);
 
 	clm_gpuDestroy();
 }

View File

@@ -21,8 +21,9 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
-	uint i, j;
-	matrixGetIJ(matOut, idx, i, j);
+	// TODO: might not work with transposed matOut
+	uint i = idx / matOut.cols;
+	uint j = idx % matOut.cols;
 
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
@@ -75,6 +76,14 @@ void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar
 	mat_values[idx] *= scalar;
 }
 
+cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
+	cl_GPUMat tr = {0};
+	tr.cols = mat.rows;
+	tr.rows = mat.cols;
+	tr.transposed = !mat.transposed;
+	return tr;
+}
+
 __kernel void linear_forward(unsigned int batchSize,
                              cl_GPUMat input, __global float *input_values,
                              cl_GPUMat weights, __global float *weights_values,
@@ -123,14 +132,12 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 		__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
 		__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
 
-		cl_GPUMat inputsT = inputs;
-		inputsT.transposed = true;
+		cl_GPUMat inputsT = clm_matrixTranspose(inputs);
 		mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
 		if(updateErrors) {
-			cl_GPUMat weightsT = weights;
-			weightsT.transposed = true;
-			mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
+			cl_GPUMat weightsT = clm_matrixTranspose(weights);
+			// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
 		}
 	}
 }
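
The new clm_matrixTranspose never moves data: it swaps the row and column counts and toggles the transposed flag, so the values keep their original row-major layout and every indexing site must honor the flag. The TODO added to mat_multiply marks exactly this gap for matOut, whose i/j are now computed as if it were never transposed. A sketch of flag-aware indexing under that row-major assumption (matrixGetAt is hypothetical, not part of this file):

/* Hypothetical helper: element (i, j) of the logical matrix. With the flag
 * set, the storage still has the pre-transpose layout, so the row stride is
 * mat.rows (the original column count) and i/j swap roles. */
float matrixGetAt(cl_GPUMat mat, __global const float *values, uint i, uint j) {
	uint stride = mat.transposed ? mat.rows : mat.cols;
	return mat.transposed ? values[j * stride + i] : values[i * stride + j];
}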