Update library structure

This commit is contained in:
MrLetsplay 2024-01-25 19:56:19 +01:00
parent 7668a249d9
commit 4c70af2496
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
12 changed files with 118 additions and 552 deletions

View File

@ -2,24 +2,35 @@ SRC=src
BUILD=build
CFLAGS=-Wall -g
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c
.PHONY: all
all:
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
all: libclm_cpu libclm_opencl cltest
.PHONY: run
.PHONY: run_cpu
run: all
$(BUILD)/cltest
cd $(BUILD) && ./cltest cpu
.PHONY: cl
cl:
# Build everything, then run the test binary from inside $(BUILD) with the
# "opencl" mode argument. The rule name must match the .PHONY declaration;
# it was previously spelled `run`, which redefined the CPU run rule.
.PHONY: run_opencl
run_opencl: all
	cd $(BUILD) && ./cltest opencl
.PHONY: cltest
cltest: libclm
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
gcc -lOpenCL -lm -Wl,-rpath -Wl,'$$ORIGIN' -L$(BUILD) -lclm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c
.PHONY: cl_run
cl_run: cl
$(BUILD)/cl
# Core shared library built from $(CLM_SOURCES).
# Libraries are listed after the sources so the linker resolves them even when
# --as-needed is the toolchain default. -ldl is required for dlopen/dlsym on
# glibc versions older than 2.34 and is harmless on newer ones.
.PHONY: libclm
libclm:
	mkdir -p $(BUILD)
	gcc -shared -fpic $(CFLAGS) -Wl,-rpath -Wl,'$$ORIGIN' -o $(BUILD)/libclm.so $(CLM_SOURCES) -lOpenCL -lm -ldl
# CPU backend library. Links against libclm.so in $(BUILD), so libclm must be
# built first; libraries follow the source file so link order is correct.
.PHONY: libclm_cpu
libclm_cpu: libclm
	mkdir -p $(BUILD)
	gcc -shared -fpic $(CFLAGS) -o $(BUILD)/libclm_cpu.so $(SRC)/clm_cpu.c -L$(BUILD) -lclm -lOpenCL -lm
# OpenCL backend library. Links against libclm.so in $(BUILD), so libclm must
# be built first; libraries follow the source file so link order is correct.
.PHONY: libclm_opencl
libclm_opencl: libclm
	mkdir -p $(BUILD)
	gcc -shared -fpic $(CFLAGS) -o $(BUILD)/libclm_opencl.so $(SRC)/clm_opencl.c -L$(BUILD) -lclm -lOpenCL -lm

141
src/cl.c
View File

@ -1,141 +0,0 @@
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl_platform.h>
#include <math.h>
#include "clm.h"
#include "util.h"
#include <CL/cl.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
// Matrix descriptor that main() passes by value as a kernel argument
// (rows, cols and a transposed flag copied from a clm_Matrix).
// Declared packed so the compiler inserts no padding between the fields.
// NOTE(review): the kernel-side struct in mat.cl must use the same layout — confirm.
typedef struct __attribute__((packed)) {
cl_uint rows;
cl_uint cols;
cl_char transposed;
} cl_GPUMat;
int main() {
// Connect to a compute device
int useGPU = true;
cl_device_id deviceID;
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
if(err != CL_SUCCESS) {
printf("Error: Failed to create a device group!\n");
return 1;
}
char *buffer = loadFile("src/mat.cl");
printf("%s", buffer);
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
if(!context) {
printf("Failed to create context\n");
return 1;
}
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
if(!queue) {
printf("Failed to create command queue\n");
return 1;
}
size_t length = strlen(buffer);
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
if(!program) {
printf("Failed to create program\n");
return 1;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to build program\n");
// clGetProgramBuildInfo...
return 1;
}
cl_kernel kernel = clCreateKernel(program, "mat_multiply", &err);
if(!kernel) {
printf("Failed to create kernel\n");
return 1;
}
clm_Matrix a = clm_createMatrixRandom(5, 10);
clm_Matrix b = clm_createMatrixRandom(10, 3);
clm_Matrix out = clm_createMatrixRandom(5, 3);
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
cl_GPUMat matOut = {.rows = out.rows, .cols = out.cols, .transposed = out.transposed};
size_t inputSize = out.rows * out.cols;
clm_matrixMultiplyMatrix(a, b, out);
clm_matrixPrint(out);
clm_matrixZero(out);
cl_mem matA_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * a.rows * a.cols, NULL, &err);
cl_mem matB_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * b.rows * b.cols, NULL, &err);
cl_mem matOut_values = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * out.rows * out.cols, NULL, &err);
if(!matA_values || !matB_values || !matOut_values) {
printf("Failed to allocate input/output buffer\n");
return 1;
}
err = clEnqueueWriteBuffer(queue, matA_values, CL_TRUE, 0, sizeof(float) * a.rows * a.cols, a.values, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, matB_values, CL_TRUE, 0, sizeof(float) * b.rows * b.cols, b.values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to write to buffer\n");
return 1;
}
err = 0;
err |= clSetKernelArg(kernel, 0, sizeof(matA), &matA);
err |= clSetKernelArg(kernel, 1, sizeof(matA_values), &matA_values);
err |= clSetKernelArg(kernel, 2, sizeof(matB), &matB);
err |= clSetKernelArg(kernel, 3, sizeof(matB_values), &matB_values);
err |= clSetKernelArg(kernel, 4, sizeof(matOut), &matOut);
err |= clSetKernelArg(kernel, 5, sizeof(matOut_values), &matOut_values);
// err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args\n");
return 1;
}
/*char *info = calloc(1, 1024);
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
printf("INFO: %s\n", info);*/
size_t local;
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group size\n");
return 1;
}
printf("Group size is %zu\n", local);
size_t global = ceil((float) inputSize / local) * local;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue: %d\n", err);
return 1;
}
clFinish(queue);
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * inputSize, out.values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to read from buffer\n");
return 1;
}
clm_matrixPrint(out);
/*for(unsigned int i = 0; i < inputSize; i++) {
if(i % 1000 != 0) continue;
printf("%f: %f\n", inputData[i], outputData[i]);
}*/
}

View File

@ -9,7 +9,6 @@ const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL};
const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
printf("CREATING MATRIX\n");
clm_Matrix mat = {0};
mat.rows = rows;
mat.cols = cols;

View File

@ -1,6 +1,8 @@
#ifndef _CLM_H_
#define _CLM_H_
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl_platform.h>
#include <stdbool.h>
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
@ -26,6 +28,7 @@ typedef struct {
clm_Matrix *error;
clm_Matrix *weightsError;
clm_Matrix *gradient;
clm_NativeBuf *nativeWeights;
clm_NativeBuf *nativeBias;
clm_NativeBuf *nativeOutput;

View File

@ -5,7 +5,7 @@
struct clm_NativeBuf {};
int clm_gpuInit() {
int clm_gpuInit(unsigned int mode) {
return 0;
}

58
src/clm_funcs.c Normal file
View File

@ -0,0 +1,58 @@
#include "clm_gpu.h"
#include <dlfcn.h>
#include <stdio.h>
// Function-pointer types for the four entry points looked up in a backend
// library. Their signatures match the public functions of the same purpose
// defined in this file.
typedef int (*clm_gpuInitFunc)(unsigned int);
typedef void (*clm_gpuDestroyFunc)();
typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
// Handle returned by dlopen() for the backend library; set in clm_gpuInit.
static void *lib;
// Backend entry points resolved with dlsym() in clm_gpuInit.
static clm_gpuInitFunc initFunc;
static clm_gpuDestroyFunc destroyFunc;
static clm_linearForwardFunc linearForwardFunc;
static clm_linearBackpropFunc linearBackpropFunc;
int clm_gpuInit(unsigned int mode) {
const char *libName = NULL;
if(mode & CLM_MODE_CPU) {
libName = "libclm_cpu.so";
} else if(mode & CLM_MODE_OPENCL) {
libName = "libclm_opencl.so";
} else {
printf("No valid mode supplied. Must set either CLM_MODE_CPU or CLM_MODE_GPU");
return 1;
}
lib = dlopen(libName, RTLD_NOW);
if(!lib) {
printf("Failed to load lib: %s\n", dlerror());
return 1;
}
initFunc = (clm_gpuInitFunc) dlsym(lib, "clm_gpuInit");
destroyFunc = (clm_gpuDestroyFunc) dlsym(lib, "clm_gpuDestroy");
linearForwardFunc = (clm_linearForwardFunc) dlsym(lib, "clm_linearForward");
linearBackpropFunc = (clm_linearBackpropFunc) dlsym(lib, "clm_linearBackprop");
if(!initFunc || !destroyFunc || !linearForwardFunc || !linearBackpropFunc) {
printf("Failed to load functions from library: %s\n", dlerror());
return 1;
}
return initFunc(mode);
}
void clm_gpuDestroy() {
destroyFunc();
dlclose(lib);
}
// Public entry point: forwards all arguments unchanged to the backend's
// clm_linearForward resolved in clm_gpuInit.
// linearForwardFunc is only assigned by clm_gpuInit, so that call must have
// succeeded before this function is used.
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
linearForwardFunc(linear, batchSize, inputs, outputs);
}
// Public entry point: forwards all arguments unchanged to the backend's
// clm_linearBackprop resolved in clm_gpuInit.
// linearBackpropFunc is only assigned by clm_gpuInit, so that call must have
// succeeded before this function is used.
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
linearBackpropFunc(linear, learnRate, batchSize, inputs, outputs, inputErrors, updateErrors, outputErrors, outputWeightsErrors, outputGradients);
}

View File

@ -3,7 +3,10 @@
#include "clm.h"
int clm_gpuInit();
#define CLM_MODE_CPU 1
#define CLM_MODE_OPENCL 2
int clm_gpuInit(unsigned int mode);
void clm_gpuDestroy();
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);

View File

@ -33,7 +33,7 @@ typedef struct __attribute__((packed)) {
#define gpuMat(mat) \
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
int clm_gpuInit() {
int clm_gpuInit(unsigned int mode) {
// Connect to a compute device
int useGPU = true;
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);

View File

@ -1,119 +0,0 @@
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
int main()
{
// Connect to a compute device
//
int gpu = 1;
cl_device_id deviceID;
cl_int err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
if (err != CL_SUCCESS) {
printf("Error: Failed to create a device group!\n");
return 1;
}
FILE *file = fopen("src/test.cl", "r");
fseek(file, 0, SEEK_END);
size_t length = ftell(file);
fseek(file, 0, SEEK_SET);
char *buffer = calloc(1, length + 1);
fread(buffer, length, 1, file);
printf("%s", buffer);
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
if(!context) {
printf("Failed to create context\n");
return 1;
}
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
if(!queue) {
printf("Failed to create command queue\n");
return 1;
}
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
if(!program) {
printf("Failed to create program\n");
return 1;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to build program\n");
// clGetProgramBuildInfo...
return 1;
}
cl_kernel kernel = clCreateKernel(program, "do_stuff", &err);
if(!kernel) {
printf("Failed to create kernel\n");
return 1;
}
unsigned int inputSize = 256000000;
float *inputData = calloc(inputSize, sizeof(float));
for(unsigned int i = 0; i < inputSize; i++) {
inputData[i] = i;
}
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err);
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err);
if(!input || !output) {
printf("Failed to allocate input/output buffer\n");
return 1;
}
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to write to buffer\n");
return 1;
}
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(input), &input);
err |= clSetKernelArg(kernel, 1, sizeof(output), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args\n");
return 1;
}
/*char *info = calloc(1, 1024);
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
printf("INFO: %s\n", info);*/
size_t local;
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group size\n");
return 1;
}
printf("Group size is %zu\n", local);
size_t global = inputSize;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue\n");
return 1;
}
clFinish(queue);
float *outputData = calloc(inputSize, sizeof(float));
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to read from buffer\n");
return 1;
}
for(unsigned int i = 0; i < inputSize; i++) {
if(i % 1000 != 0) continue;
printf("%f: %f\n", inputData[i], outputData[i]);
}
}

View File

@ -1,264 +0,0 @@
#include <dlfcn.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include "clm.h"
// Four two-element input samples covering every combination of 0 and 1.
// Only referenced from commented-out code in this file.
float train_data_x[4][2] = {
{0, 0},
{0, 1},
{1, 0},
{1, 1}};
// Expected output for the sample at the same index in train_data_x:
// 1 when exactly one input is 1, otherwise 0 (the XOR truth table).
float train_data_y[4][1] = {
{0},
{1},
{1},
{0}};
// Runs a forward pass of `nn` over the `length` floats in `x`.
// Each layer computes sigmoid(weights * input + bias) into its own output
// matrix, which then becomes the next layer's input.
// Returns the value array of the final activation matrix (or `x` wrapped
// unchanged when the network has no layers), or NULL if a matrix
// multiplication yields an invalid matrix.
float *predict(clm_NN nn, float *x, unsigned int length) {
	clm_Matrix activation = clm_matrixWrapArray(x, length);

	unsigned int layerIndex = 0;
	while(layerIndex < nn.numLayers) {
		const clm_Linear *current = &nn.layers[layerIndex];

		clm_Matrix product = clm_matrixMultiplyMatrix(current->weights, activation, current->output);
		if(clm_matrixIsInvalid(product)) {
			printf("Failed to predict\n");
			return NULL;
		}

		clm_matrixAddMatrix(product, current->bias);
		clm_matrixSigmoid(product);

		activation = product;
		layerIndex++;
	}

	return activation.values;
}
// Performs one training step of `nn` on a single sample.
// x / xL: input values and their count; y / yL: target values and their count.
// Runs a forward pass through every layer, then walks the layers from last
// to first, updating each layer's weights and bias in place.
// Returns early (after printing a message) if a forward multiplication
// yields an invalid matrix.
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
clm_Matrix xM = clm_matrixWrapArray(x, xL);
clm_Matrix yM = clm_matrixWrapArray(y, yL);
// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
// Forward pass
clm_Matrix currentX = xM;
for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i];
// layer.output is passed as the third argument; presumably the result is stored there — confirm in clm.c
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
if(clm_matrixIsInvalid(newX)) {
printf("Forward pass failed\n");
return;
}
clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX);
currentX = newX;
}
// Backward pass: last layer first. `i` is a signed int so the loop can stop below zero.
for(int i = nn.numLayers - 1; i >= 0; i--) {
clm_Linear layer = nn.layers[i];
// The first layer's input is the sample itself; later layers use the previous layer's output
clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
clm_Matrix outputOfThisLayer = nn.layers[i].output;
// Error of the layer after this one (already computed, since we iterate backwards); unused for the last layer
clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
clm_Matrix error = layer.error;
if(i == nn.numLayers - 1) {
// Output layer: copy the target into `error`, then subtract this layer's output.
// NOTE(review): argument order suggests this is y - yhat rather than yhat - y — confirm against clm_matrixSubtractMatrix
clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
} else {
// Hidden layer: error = transpose(next layer's weights) * next layer's error
clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
clm_matrixMultiplyMatrix(weightsT, prevError, error);
}
clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
clm_matrixMultiplyScalar(gradient, nn.learnRate);
// Weight delta = gradient * transpose(input), written to layer.weightsError
clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
// Apply the update: the deltas are added to the weights and the gradient to the bias
clm_matrixAddMatrix(layer.weights, layer.weightsError);
clm_matrixAddMatrix(layer.bias, gradient);
}
}
// Reads a 4-byte big-endian unsigned integer from `file` into `*out`.
// Returns 1 on success, 0 if the file ended early or a read error occurred.
static int readBigEndianU32(FILE *file, uint32_t *out) {
	unsigned char bytes[4];
	if(fread(bytes, sizeof(bytes), 1, file) != 1) {
		return 0;
	}

	// Cast before shifting: shifting a promoted (signed) int by 24 overflows for bytes >= 0x80
	*out = ((uint32_t) bytes[0] << 24) | ((uint32_t) bytes[1] << 16) | ((uint32_t) bytes[2] << 8) | (uint32_t) bytes[3];
	return 1;
}

// Loads the label file "data/train-labels.idx1-ubyte".
//
// The file starts with two big-endian 32-bit values (a magic number and the
// label count, both printed), followed by one byte per label. Each label is
// converted to a 10-element vector holding 1 at the label's index and 0
// elsewhere.
//
// labelsOut:      receives a calloc'd array of vectors, or NULL on failure.
// labelsCountOut: receives the number of labels actually loaded, or 0 on
//                 failure. If the file is truncated this is smaller than the
//                 count announced in the header.
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
	// Give the caller defined values on every failure path
	*labelsOut = NULL;
	*labelsCountOut = 0;

	FILE *file = fopen("data/train-labels.idx1-ubyte", "rb");
	if(!file) {
		perror("Failed to open labels");
		return;
	}

	uint32_t magic = 0;
	uint32_t length = 0;
	if(!readBigEndianU32(file, &magic) || !readBigEndianU32(file, &length)) {
		printf("Failed to read labels header\n");
		fclose(file);
		return;
	}
	printf("%" PRIu32 "\n", magic);
	printf("%" PRIu32 "\n", length);

	if(length == 0) {
		fclose(file);
		return;
	}

	clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
	if(!vectors) {
		printf("Failed to allocate labels\n");
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		unsigned char label;
		if(fread(&label, sizeof(unsigned char), 1, file) != 1) {
			printf("Labels file is truncated: read %u of %" PRIu32 "\n", loaded, length);
			break;
		}

		clm_Vector vector = clm_vectorCreate(10);
		for(unsigned int j = 0; j < 10; j++) {
			vector.values[j] = label == j ? 1 : 0;
		}
		vectors[loaded] = vector;
	}

	fclose(file);

	*labelsOut = vectors;
	*labelsCountOut = loaded;
}

// Loads the image file "data/train-images.idx3-ubyte".
//
// The file starts with four big-endian 32-bit values (magic number, image
// count, rows, cols, all printed), followed by rows*cols bytes per image.
// Each image is converted to a vector of rows*cols floats holding the raw
// byte values.
//
// imagesOut:     receives a calloc'd array of vectors, or NULL on failure.
// imageCountOut: receives the number of images actually loaded, or 0 on
//                failure. If the file is truncated this is smaller than the
//                count announced in the header.
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
	// Give the caller defined values on every failure path
	*imagesOut = NULL;
	*imageCountOut = 0;

	FILE *file = fopen("data/train-images.idx3-ubyte", "rb");
	if(!file) {
		perror("Failed to open images");
		return;
	}

	uint32_t magic = 0;
	uint32_t length = 0;
	uint32_t rows = 0;
	uint32_t cols = 0;
	if(!readBigEndianU32(file, &magic) || !readBigEndianU32(file, &length) ||
	   !readBigEndianU32(file, &rows) || !readBigEndianU32(file, &cols)) {
		printf("Failed to read images header\n");
		fclose(file);
		return;
	}
	printf("%" PRIu32 "\n", magic);
	printf("%" PRIu32 "\n", length);
	printf("%" PRIu32 "\n", rows);
	printf("%" PRIu32 "\n", cols);

	// Compute the pixel count in 64 bits so a hostile header cannot wrap it
	uint64_t pixelCount64 = (uint64_t) rows * cols;
	if(length == 0 || pixelCount64 == 0 || pixelCount64 > UINT32_MAX) {
		printf("Invalid image dimensions\n");
		fclose(file);
		return;
	}
	uint32_t pixelCount = (uint32_t) pixelCount64;

	clm_Vector *images = calloc(length, sizeof(clm_Vector));
	// Heap buffer instead of a variable-length array: the size comes from the file
	unsigned char *image = malloc(pixelCount);
	if(!images || !image) {
		printf("Failed to allocate images\n");
		free(image);
		free(images);
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		if(fread(image, pixelCount, 1, file) != 1) {
			printf("Images file is truncated: read %u of %" PRIu32 "\n", loaded, length);
			break;
		}

		clm_Vector vec = clm_vectorCreate(pixelCount);
		for(uint32_t j = 0; j < pixelCount; j++) {
			vec.values[j] = (float) image[j];
		}
		images[loaded] = vec;
	}

	free(image);
	fclose(file);

	*imagesOut = images;
	*imageCountOut = loaded;
}
// Pointer type matching the signature of calloc.
typedef void *(*callocFunc)(size_t, size_t);
// The next calloc implementation in library search order; assigned in main via
// dlsym(RTLD_NEXT, "calloc") and called by the calloc override in this file.
callocFunc oldCalloc;
// Trains a two-layer network (784 -> 30 -> 10) on the loaded images and
// labels for one epoch, then measures accuracy on the same samples.
//
// At most 60000 samples are used, and never more than were actually loaded
// for both images and labels. Returns 0 on success, 1 if the training data
// could not be loaded.
int main() {
	// Must happen first: the calloc override in this file forwards through oldCalloc
	oldCalloc = dlsym(RTLD_NEXT, "calloc");

	clm_Vector *labels = NULL;
	unsigned int labelCount = 0; // Stays 0 if loadLabels bails out early
	loadLabels(&labels, &labelCount);
	printf("LENGTH: %u\n", labelCount);

	clm_Vector *images = NULL;
	unsigned int imageCount = 0; // Stays 0 if loadImages bails out early
	loadImages(&images, &imageCount);

	if(!labels || !images || labelCount == 0 || imageCount == 0) {
		printf("Failed to load training data\n");
		return 1;
	}

	// Clamp the sample count instead of forcing it, so neither array is
	// indexed past what was really loaded
	const unsigned int maxSamples = 60000;
	if(imageCount > labelCount) imageCount = labelCount;
	if(imageCount > maxSamples) imageCount = maxSamples;

	printf("%f\n", images[0].values[0]);

	srand(1);

	unsigned int
		i = 784,
		h = 30,
		o = 10;

	clm_Linear layer1 = clm_linearCreateRandom(i, h);
	clm_Linear layer2 = clm_linearCreateRandom(h, o);
	clm_Linear layers[] = {layer1, layer2};
	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};

	for(unsigned int epoch = 0; epoch < 1; epoch++) {
		printf("Epoch %u\n", epoch);
		for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
			if(idx % 1000 == 0) {
				printf("\r%.2f%%", idx / (float) imageCount * 100);
				fflush(stdout);
			}

			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
		}
		printf("\n");
	}

	printf("Train done\n");

	unsigned int correct = 0;
	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
		float *pred = predict(nn, images[idx].values, images[idx].length);
		if(!pred) {
			// predict already printed a message; count the sample as incorrect
			continue;
		}

		// Index of the largest predicted value
		unsigned int predDigit = 0;
		float max = -1;
		for(unsigned int j = 0; j < 10; j++) {
			if(pred[j] > max || max < 0) {
				max = pred[j];
				predDigit = j;
			}
		}
		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);

		// Index of the largest label value, i.e. the expected digit
		unsigned int actDigit = 0;
		float maxA = -1;
		for(unsigned int j = 0; j < 10; j++) {
			if(labels[idx].values[j] > maxA || maxA < 0) {
				maxA = labels[idx].values[j];
				actDigit = j;
			}
		}
		if(idx < 100) printf("Actual: %u\n", actDigit);

		if(predDigit == actDigit) correct++;
	}

	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
	printf("\n");
	return 0;
}
// Replaces the C library's calloc for this program and forwards every call to
// oldCalloc, which main assigns via dlsym(RTLD_NEXT, "calloc").
// NOTE(review): oldCalloc is NULL until main has performed that lookup, so any
// calloc call made before then (possibly from within dlsym itself) would call
// through a null pointer — confirm this cannot happen on the target libc.
void *calloc(size_t nmemb, size_t size) {
// printf("CALLOC\n");
return oldCalloc(nmemb, size);
}

View File

@ -1,7 +1,9 @@
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "clm.h"
#include "clm_gpu.h"
@ -105,7 +107,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
if(!file) {
perror("Failed to open labels\n");
printf("Failed to open labels: %s\n", strerror(errno));
return;
}
@ -141,7 +143,7 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
if(!file) {
perror("Failed to open images\n");
printf("Failed to open images: %s\n", strerror(errno));
return;
}
@ -180,8 +182,23 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
*imageCountOut = length;
}
int main() {
if(clm_gpuInit() != 0) {
int main(int argc, const char *argv[]) {
if(argc < 2) {
printf("Usage: %s [cpu | opencl]\n", argv[0]);
return 1;
}
unsigned int mode = 0;
if(strcmp(argv[1], "cpu") == 0) {
mode = CLM_MODE_CPU;
} else if(strcmp(argv[1], "opencl") == 0) {
mode = CLM_MODE_OPENCL;
} else {
printf("Invalid mode\n");
return 1;
}
if(clm_gpuInit(mode) != 0) {
printf("Failed to init GPU\n");
return 1;
}
@ -190,6 +207,10 @@ int main() {
unsigned int labelCount;
loadLabels(&labels, &labelCount);
printf("LENGTH: %u\n", labelCount);
if(labelCount == 0) {
printf("Loaded no labels\n");
return 1;
}
clm_Vector *images = NULL;
unsigned int imageCount;
@ -209,7 +230,7 @@ int main() {
clm_Linear layers[] = {
clm_linearCreateRandom(i, h),
clm_linearCreateRandom(h, o)};
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
for(unsigned int epoch = 0; epoch < 1; epoch++) {
printf("Epoch %u\n", epoch);

View File

@ -1,5 +0,0 @@
// Squares each element: output[i] = input[i] * input[i] for every i < count.
// Work-items whose global id is at or beyond `count` do nothing, so the host
// may launch a global size that is rounded up past the element count without
// reading or writing out of bounds.
__kernel void do_stuff(__global float *input, __global float *output, unsigned int count) {
	size_t i = get_global_id(0);
	if(i >= count) {
		return;
	}

	output[i] = input[i] * input[i];
}