Update library structure
This commit is contained in:
parent
7668a249d9
commit
4c70af2496
39
Makefile
39
Makefile
@ -2,24 +2,35 @@ SRC=src
|
|||||||
BUILD=build
|
BUILD=build
|
||||||
CFLAGS=-Wall -g
|
CFLAGS=-Wall -g
|
||||||
|
|
||||||
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
|
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c
|
||||||
CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
|
|
||||||
CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
|
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all:
|
all: libclm_cpu libclm_opencl cltest
|
||||||
mkdir -p $(BUILD)
|
|
||||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
|
|
||||||
|
|
||||||
.PHONY: run
|
.PHONY: run_cpu
|
||||||
run: all
|
# Target name matches the .PHONY declaration above (run_cpu)
run_cpu: all
|
||||||
$(BUILD)/cltest
|
cd $(BUILD) && ./cltest cpu
|
||||||
|
|
||||||
.PHONY: cl
|
.PHONY: run_opencl
|
||||||
cl:
|
# Target name matches the .PHONY declaration above (run_opencl)
run_opencl: all
|
||||||
|
cd $(BUILD) && ./cltest opencl
|
||||||
|
|
||||||
|
.PHONY: cltest
|
||||||
|
cltest: libclm
|
||||||
mkdir -p $(BUILD)
|
mkdir -p $(BUILD)
|
||||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
|
gcc -lOpenCL -lm -Wl,-rpath -Wl,'$$ORIGIN' -L$(BUILD) -lclm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c
|
||||||
|
|
||||||
.PHONY: cl_run
|
.PHONY: libclm
|
||||||
cl_run: cl
|
libclm:
|
||||||
$(BUILD)/cl
|
mkdir -p $(BUILD)
|
||||||
|
gcc -lOpenCL -lm -Wl,-rpath -Wl,'$$ORIGIN' -shared -fpic $(CFLAGS) -o $(BUILD)/libclm.so $(CLM_SOURCES)
|
||||||
|
|
||||||
|
.PHONY: libclm_cpu
|
||||||
|
# The recipe links with -L$(BUILD) -lclm, so libclm.so has to be built first
libclm_cpu: libclm
|
||||||
|
mkdir -p $(BUILD)
|
||||||
|
gcc -lOpenCL -lm -L$(BUILD) -lclm -shared -fpic $(CFLAGS) -o $(BUILD)/libclm_cpu.so $(SRC)/clm_cpu.c
|
||||||
|
|
||||||
|
.PHONY: libclm_opencl
|
||||||
|
# The recipe links with -L$(BUILD) -lclm, so libclm.so has to be built first
libclm_opencl: libclm
|
||||||
|
mkdir -p $(BUILD)
|
||||||
|
gcc -lOpenCL -lm -L$(BUILD) -lclm -shared -fpic $(CFLAGS) -o $(BUILD)/libclm_opencl.so $(SRC)/clm_opencl.c
|
||||||
|
141
src/cl.c
141
src/cl.c
@ -1,141 +0,0 @@
|
|||||||
#define CL_TARGET_OPENCL_VERSION 200
|
|
||||||
#include <CL/cl_platform.h>
|
|
||||||
#include <math.h>
|
|
||||||
|
|
||||||
#include "clm.h"
|
|
||||||
#include "util.h"
|
|
||||||
|
|
||||||
#include <CL/cl.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
typedef struct __attribute__((packed)) {
|
|
||||||
cl_uint rows;
|
|
||||||
cl_uint cols;
|
|
||||||
cl_char transposed;
|
|
||||||
} cl_GPUMat;
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
// Connect to a compute device
|
|
||||||
int useGPU = true;
|
|
||||||
cl_device_id deviceID;
|
|
||||||
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Error: Failed to create a device group!\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *buffer = loadFile("src/mat.cl");
|
|
||||||
printf("%s", buffer);
|
|
||||||
|
|
||||||
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
|
|
||||||
if(!context) {
|
|
||||||
printf("Failed to create context\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
|
|
||||||
if(!queue) {
|
|
||||||
printf("Failed to create command queue\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t length = strlen(buffer);
|
|
||||||
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
|
|
||||||
if(!program) {
|
|
||||||
printf("Failed to create program\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to build program\n");
|
|
||||||
// clGetProgramBuildInfo...
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_kernel kernel = clCreateKernel(program, "mat_multiply", &err);
|
|
||||||
if(!kernel) {
|
|
||||||
printf("Failed to create kernel\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_Matrix a = clm_createMatrixRandom(5, 10);
|
|
||||||
clm_Matrix b = clm_createMatrixRandom(10, 3);
|
|
||||||
clm_Matrix out = clm_createMatrixRandom(5, 3);
|
|
||||||
|
|
||||||
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
|
|
||||||
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
|
|
||||||
cl_GPUMat matOut = {.rows = out.rows, .cols = out.cols, .transposed = out.transposed};
|
|
||||||
|
|
||||||
size_t inputSize = out.rows * out.cols;
|
|
||||||
|
|
||||||
clm_matrixMultiplyMatrix(a, b, out);
|
|
||||||
|
|
||||||
clm_matrixPrint(out);
|
|
||||||
clm_matrixZero(out);
|
|
||||||
|
|
||||||
cl_mem matA_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * a.rows * a.cols, NULL, &err);
|
|
||||||
cl_mem matB_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * b.rows * b.cols, NULL, &err);
|
|
||||||
cl_mem matOut_values = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * out.rows * out.cols, NULL, &err);
|
|
||||||
if(!matA_values || !matB_values || !matOut_values) {
|
|
||||||
printf("Failed to allocate input/output buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = clEnqueueWriteBuffer(queue, matA_values, CL_TRUE, 0, sizeof(float) * a.rows * a.cols, a.values, 0, NULL, NULL);
|
|
||||||
err |= clEnqueueWriteBuffer(queue, matB_values, CL_TRUE, 0, sizeof(float) * b.rows * b.cols, b.values, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to write to buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = 0;
|
|
||||||
err |= clSetKernelArg(kernel, 0, sizeof(matA), &matA);
|
|
||||||
err |= clSetKernelArg(kernel, 1, sizeof(matA_values), &matA_values);
|
|
||||||
err |= clSetKernelArg(kernel, 2, sizeof(matB), &matB);
|
|
||||||
err |= clSetKernelArg(kernel, 3, sizeof(matB_values), &matB_values);
|
|
||||||
err |= clSetKernelArg(kernel, 4, sizeof(matOut), &matOut);
|
|
||||||
err |= clSetKernelArg(kernel, 5, sizeof(matOut_values), &matOut_values);
|
|
||||||
// err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to set kernel args\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*char *info = calloc(1, 1024);
|
|
||||||
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
|
|
||||||
printf("INFO: %s\n", info);*/
|
|
||||||
|
|
||||||
size_t local;
|
|
||||||
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to get work group size\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("Group size is %zu\n", local);
|
|
||||||
|
|
||||||
size_t global = ceil((float) inputSize / local) * local;
|
|
||||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to enqueue: %d\n", err);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
clFinish(queue);
|
|
||||||
|
|
||||||
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * inputSize, out.values, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to read from buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_matrixPrint(out);
|
|
||||||
|
|
||||||
/*for(unsigned int i = 0; i < inputSize; i++) {
|
|
||||||
if(i % 1000 != 0) continue;
|
|
||||||
printf("%f: %f\n", inputData[i], outputData[i]);
|
|
||||||
}*/
|
|
||||||
}
|
|
@ -9,7 +9,6 @@ const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL};
|
|||||||
const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
|
const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
|
||||||
|
|
||||||
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
|
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
|
||||||
printf("CREATING MATRIX\n");
|
|
||||||
clm_Matrix mat = {0};
|
clm_Matrix mat = {0};
|
||||||
mat.rows = rows;
|
mat.rows = rows;
|
||||||
mat.cols = cols;
|
mat.cols = cols;
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
#ifndef _CLM_H_
|
#ifndef _CLM_H_
|
||||||
#define _CLM_H_
|
#define _CLM_H_
|
||||||
|
|
||||||
|
#define CL_TARGET_OPENCL_VERSION 200
|
||||||
|
#include <CL/cl_platform.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
||||||
@ -26,6 +28,7 @@ typedef struct {
|
|||||||
clm_Matrix *error;
|
clm_Matrix *error;
|
||||||
clm_Matrix *weightsError;
|
clm_Matrix *weightsError;
|
||||||
clm_Matrix *gradient;
|
clm_Matrix *gradient;
|
||||||
|
|
||||||
clm_NativeBuf *nativeWeights;
|
clm_NativeBuf *nativeWeights;
|
||||||
clm_NativeBuf *nativeBias;
|
clm_NativeBuf *nativeBias;
|
||||||
clm_NativeBuf *nativeOutput;
|
clm_NativeBuf *nativeOutput;
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
struct clm_NativeBuf {};
|
struct clm_NativeBuf {};
|
||||||
|
|
||||||
int clm_gpuInit() {
|
int clm_gpuInit(unsigned int mode) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
58
src/clm_funcs.c
Normal file
58
src/clm_funcs.c
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
#include "clm_gpu.h"
|
||||||
|
|
||||||
|
#include <dlfcn.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
typedef int (*clm_gpuInitFunc)(unsigned int);
|
||||||
|
typedef void (*clm_gpuDestroyFunc)();
|
||||||
|
typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
||||||
|
typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
|
||||||
|
|
||||||
|
static void *lib;
|
||||||
|
static clm_gpuInitFunc initFunc;
|
||||||
|
static clm_gpuDestroyFunc destroyFunc;
|
||||||
|
static clm_linearForwardFunc linearForwardFunc;
|
||||||
|
static clm_linearBackpropFunc linearBackpropFunc;
|
||||||
|
|
||||||
|
int clm_gpuInit(unsigned int mode) {
|
||||||
|
const char *libName = NULL;
|
||||||
|
if(mode & CLM_MODE_CPU) {
|
||||||
|
libName = "libclm_cpu.so";
|
||||||
|
} else if(mode & CLM_MODE_OPENCL) {
|
||||||
|
libName = "libclm_opencl.so";
|
||||||
|
} else {
|
||||||
|
printf("No valid mode supplied. Must set either CLM_MODE_CPU or CLM_MODE_GPU");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
lib = dlopen(libName, RTLD_NOW);
|
||||||
|
if(!lib) {
|
||||||
|
printf("Failed to load lib: %s\n", dlerror());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
initFunc = (clm_gpuInitFunc) dlsym(lib, "clm_gpuInit");
|
||||||
|
destroyFunc = (clm_gpuDestroyFunc) dlsym(lib, "clm_gpuDestroy");
|
||||||
|
linearForwardFunc = (clm_linearForwardFunc) dlsym(lib, "clm_linearForward");
|
||||||
|
linearBackpropFunc = (clm_linearBackpropFunc) dlsym(lib, "clm_linearBackprop");
|
||||||
|
|
||||||
|
if(!initFunc || !destroyFunc || !linearForwardFunc || !linearBackpropFunc) {
|
||||||
|
printf("Failed to load functions from library: %s\n", dlerror());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return initFunc(mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
void clm_gpuDestroy() {
|
||||||
|
destroyFunc();
|
||||||
|
dlclose(lib);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards the call, with all arguments unchanged, to the clm_linearForward
// implementation of the loaded backend.
//
// When no backend entry point is available the call prints an error and
// returns without touching any argument, instead of jumping through a NULL
// function pointer.
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
	if(!linearForwardFunc) {
		printf("clm_linearForward called without a loaded backend\n");
		return;
	}

	linearForwardFunc(linear, batchSize, inputs, outputs);
}
|
||||||
|
|
||||||
|
// Forwards the call, with all arguments unchanged, to the clm_linearBackprop
// implementation of the loaded backend.
//
// When no backend entry point is available the call prints an error and
// returns without touching any argument, instead of jumping through a NULL
// function pointer.
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
	if(!linearBackpropFunc) {
		printf("clm_linearBackprop called without a loaded backend\n");
		return;
	}

	linearBackpropFunc(linear, learnRate, batchSize, inputs, outputs, inputErrors, updateErrors, outputErrors, outputWeightsErrors, outputGradients);
}
|
@ -3,7 +3,10 @@
|
|||||||
|
|
||||||
#include "clm.h"
|
#include "clm.h"
|
||||||
|
|
||||||
int clm_gpuInit();
|
#define CLM_MODE_CPU 1
|
||||||
|
#define CLM_MODE_OPENCL 2
|
||||||
|
|
||||||
|
int clm_gpuInit(unsigned int mode);
|
||||||
void clm_gpuDestroy();
|
void clm_gpuDestroy();
|
||||||
|
|
||||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
||||||
|
@ -33,7 +33,7 @@ typedef struct __attribute__((packed)) {
|
|||||||
#define gpuMat(mat) \
|
#define gpuMat(mat) \
|
||||||
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
|
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
|
||||||
|
|
||||||
int clm_gpuInit() {
|
int clm_gpuInit(unsigned int mode) {
|
||||||
// Connect to a compute device
|
// Connect to a compute device
|
||||||
int useGPU = true;
|
int useGPU = true;
|
||||||
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
119
src/cltest.bak.c
119
src/cltest.bak.c
@ -1,119 +0,0 @@
|
|||||||
#define CL_TARGET_OPENCL_VERSION 300
|
|
||||||
|
|
||||||
#include <CL/cl.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
int main()
|
|
||||||
{
|
|
||||||
// Connect to a compute device
|
|
||||||
//
|
|
||||||
int gpu = 1;
|
|
||||||
cl_device_id deviceID;
|
|
||||||
cl_int err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
|
||||||
if (err != CL_SUCCESS) {
|
|
||||||
printf("Error: Failed to create a device group!\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
FILE *file = fopen("src/test.cl", "r");
|
|
||||||
fseek(file, 0, SEEK_END);
|
|
||||||
size_t length = ftell(file);
|
|
||||||
fseek(file, 0, SEEK_SET);
|
|
||||||
char *buffer = calloc(1, length + 1);
|
|
||||||
fread(buffer, length, 1, file);
|
|
||||||
|
|
||||||
printf("%s", buffer);
|
|
||||||
|
|
||||||
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
|
|
||||||
if(!context) {
|
|
||||||
printf("Failed to create context\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
|
|
||||||
if(!queue) {
|
|
||||||
printf("Failed to create command queue\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
|
|
||||||
if(!program) {
|
|
||||||
printf("Failed to create program\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to build program\n");
|
|
||||||
// clGetProgramBuildInfo...
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_kernel kernel = clCreateKernel(program, "do_stuff", &err);
|
|
||||||
if(!kernel) {
|
|
||||||
printf("Failed to create kernel\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int inputSize = 256000000;
|
|
||||||
float *inputData = calloc(inputSize, sizeof(float));
|
|
||||||
for(unsigned int i = 0; i < inputSize; i++) {
|
|
||||||
inputData[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err);
|
|
||||||
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err);
|
|
||||||
if(!input || !output) {
|
|
||||||
printf("Failed to allocate input/output buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to write to buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = 0;
|
|
||||||
err = clSetKernelArg(kernel, 0, sizeof(input), &input);
|
|
||||||
err |= clSetKernelArg(kernel, 1, sizeof(output), &output);
|
|
||||||
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to set kernel args\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*char *info = calloc(1, 1024);
|
|
||||||
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
|
|
||||||
printf("INFO: %s\n", info);*/
|
|
||||||
|
|
||||||
size_t local;
|
|
||||||
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to get work group size\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("Group size is %zu\n", local);
|
|
||||||
|
|
||||||
size_t global = inputSize;
|
|
||||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to enqueue\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
clFinish(queue);
|
|
||||||
|
|
||||||
float *outputData = calloc(inputSize, sizeof(float));
|
|
||||||
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
|
||||||
printf("Failed to read from buffer\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(unsigned int i = 0; i < inputSize; i++) {
|
|
||||||
if(i % 1000 != 0) continue;
|
|
||||||
printf("%f: %f\n", inputData[i], outputData[i]);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,264 +0,0 @@
|
|||||||
#include <dlfcn.h>
|
|
||||||
#include <inttypes.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
#include "clm.h"
|
|
||||||
|
|
||||||
float train_data_x[4][2] = {
|
|
||||||
{0, 0},
|
|
||||||
{0, 1},
|
|
||||||
{1, 0},
|
|
||||||
{1, 1}};
|
|
||||||
|
|
||||||
float train_data_y[4][1] = {
|
|
||||||
{0},
|
|
||||||
{1},
|
|
||||||
{1},
|
|
||||||
{0}};
|
|
||||||
|
|
||||||
float *predict(clm_NN nn, float *x, unsigned int length) {
|
|
||||||
clm_Matrix xM = clm_matrixWrapArray(x, length);
|
|
||||||
|
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
|
||||||
clm_Linear layer = nn.layers[i];
|
|
||||||
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
|
|
||||||
|
|
||||||
if(clm_matrixIsInvalid(newX)) {
|
|
||||||
printf("Failed to predict\n");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_matrixAddMatrix(newX, layer.bias);
|
|
||||||
clm_matrixSigmoid(newX);
|
|
||||||
xM = newX;
|
|
||||||
}
|
|
||||||
|
|
||||||
return xM.values;
|
|
||||||
}
|
|
||||||
|
|
||||||
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
|
|
||||||
clm_Matrix xM = clm_matrixWrapArray(x, xL);
|
|
||||||
clm_Matrix yM = clm_matrixWrapArray(y, yL);
|
|
||||||
|
|
||||||
// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
|
|
||||||
|
|
||||||
// Forward pass
|
|
||||||
clm_Matrix currentX = xM;
|
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
|
||||||
clm_Linear layer = nn.layers[i];
|
|
||||||
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
|
|
||||||
if(clm_matrixIsInvalid(newX)) {
|
|
||||||
printf("Forward pass failed\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_matrixAddMatrix(newX, layer.bias);
|
|
||||||
clm_matrixSigmoid(newX);
|
|
||||||
currentX = newX;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
|
||||||
clm_Linear layer = nn.layers[i];
|
|
||||||
clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
|
|
||||||
clm_Matrix outputOfThisLayer = nn.layers[i].output;
|
|
||||||
clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
|
|
||||||
clm_Matrix error = layer.error;
|
|
||||||
|
|
||||||
if(i == nn.numLayers - 1) {
|
|
||||||
clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
|
|
||||||
} else {
|
|
||||||
clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
|
|
||||||
clm_matrixMultiplyMatrix(weightsT, prevError, error);
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
|
|
||||||
clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
|
|
||||||
clm_matrixMultiplyScalar(gradient, nn.learnRate);
|
|
||||||
|
|
||||||
clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
|
|
||||||
clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
|
|
||||||
|
|
||||||
clm_matrixAddMatrix(layer.weights, layer.weightsError);
|
|
||||||
clm_matrixAddMatrix(layer.bias, gradient);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
|
|
||||||
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
|
|
||||||
if(!file) {
|
|
||||||
perror("Failed to open labels\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned char magicBytes[4];
|
|
||||||
fread(magicBytes, sizeof(magicBytes), 1, file);
|
|
||||||
|
|
||||||
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
|
|
||||||
|
|
||||||
unsigned char lengthBytes[4];
|
|
||||||
fread(lengthBytes, sizeof(lengthBytes), 1, file);
|
|
||||||
|
|
||||||
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
|
|
||||||
(lengthBytes[2] << 8) | lengthBytes[3];
|
|
||||||
printf("%" PRId32 "\n", length);
|
|
||||||
|
|
||||||
clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
|
|
||||||
|
|
||||||
for(unsigned int i = 0; i < length; i++) {
|
|
||||||
unsigned char label;
|
|
||||||
fread(&label, sizeof(unsigned char), 1, file);
|
|
||||||
|
|
||||||
clm_Vector vector = clm_vectorCreate(10);
|
|
||||||
for(unsigned int j = 0; j < 10; j++) {
|
|
||||||
vector.values[j] = label == j ? 1 : 0;
|
|
||||||
}
|
|
||||||
vectors[i] = vector;
|
|
||||||
}
|
|
||||||
|
|
||||||
*labelsOut = vectors;
|
|
||||||
*labelsCountOut = length;
|
|
||||||
}
|
|
||||||
|
|
||||||
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
|
||||||
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
|
|
||||||
if(!file) {
|
|
||||||
perror("Failed to open images\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned char magicBytes[4];
|
|
||||||
fread(magicBytes, sizeof(magicBytes), 1, file);
|
|
||||||
|
|
||||||
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
|
|
||||||
|
|
||||||
unsigned char lengthBytes[4];
|
|
||||||
fread(lengthBytes, sizeof(lengthBytes), 1, file);
|
|
||||||
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
|
|
||||||
printf("%" PRId32 "\n", length);
|
|
||||||
|
|
||||||
unsigned char rowsBytes[4];
|
|
||||||
fread(rowsBytes, sizeof(rowsBytes), 1, file);
|
|
||||||
uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
|
|
||||||
printf("%" PRId32 "\n", rows);
|
|
||||||
|
|
||||||
unsigned char colsBytes[4];
|
|
||||||
fread(colsBytes, sizeof(colsBytes), 1, file);
|
|
||||||
uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
|
|
||||||
printf("%" PRId32 "\n", cols);
|
|
||||||
|
|
||||||
clm_Vector *images = calloc(length, sizeof(clm_Vector));
|
|
||||||
for(unsigned int i = 0; i < length; i++) {
|
|
||||||
clm_Vector vec = clm_vectorCreate(cols * rows);
|
|
||||||
unsigned char image[cols * rows];
|
|
||||||
fread(image, sizeof(image), 1, file);
|
|
||||||
for(unsigned int j = 0; j < cols * rows; j++) {
|
|
||||||
vec.values[j] = (float) image[j];
|
|
||||||
}
|
|
||||||
images[i] = vec;
|
|
||||||
}
|
|
||||||
|
|
||||||
*imagesOut = images;
|
|
||||||
*imageCountOut = length;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef void *(*callocFunc)(size_t, size_t);
|
|
||||||
|
|
||||||
callocFunc oldCalloc;
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
oldCalloc = dlsym(RTLD_NEXT, "calloc");
|
|
||||||
|
|
||||||
clm_Vector *labels = NULL;
|
|
||||||
unsigned int labelCount;
|
|
||||||
loadLabels(&labels, &labelCount);
|
|
||||||
printf("LENGTH: %u\n", labelCount);
|
|
||||||
|
|
||||||
clm_Vector *images = NULL;
|
|
||||||
unsigned int imageCount;
|
|
||||||
loadImages(&images, &imageCount);
|
|
||||||
|
|
||||||
imageCount = 60000;
|
|
||||||
|
|
||||||
printf("%f\n", images[0].values[0]);
|
|
||||||
|
|
||||||
srand(1);
|
|
||||||
|
|
||||||
unsigned int
|
|
||||||
i = 784,
|
|
||||||
h = 30,
|
|
||||||
o = 10;
|
|
||||||
|
|
||||||
clm_Linear layer1 = clm_linearCreateRandom(i, h);
|
|
||||||
clm_Linear layer2 = clm_linearCreateRandom(h, o);
|
|
||||||
clm_Linear layers[] = {layer1, layer2};
|
|
||||||
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
|
|
||||||
|
|
||||||
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
|
||||||
printf("Epoch %u\n", epoch);
|
|
||||||
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
|
|
||||||
if(idx % 1000 == 0) {
|
|
||||||
printf("\r%.2f%%", idx / (float) imageCount * 100);
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
// printf("%u\n", idx);
|
|
||||||
// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
|
|
||||||
/*for(unsigned int f = 0; f < images[idx].length; f++) {
|
|
||||||
printf("%.2f ", images[idx].values[f]);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
for(unsigned int f = 0; f < labels[idx].length; f++) {
|
|
||||||
printf("%.2f ", labels[idx].values[f]);
|
|
||||||
}
|
|
||||||
printf("\n");*/
|
|
||||||
// printf("%.2f\n", labels.values[idx]);
|
|
||||||
|
|
||||||
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
|
|
||||||
// train(nn, test, 784, target, 10);
|
|
||||||
// predict(nn, test, 784);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("Train done\n");
|
|
||||||
|
|
||||||
unsigned int correct = 0;
|
|
||||||
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
|
|
||||||
// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
|
|
||||||
// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
|
|
||||||
float *pred = predict(nn, images[idx].values, images[idx].length);
|
|
||||||
unsigned int predDigit = 0;
|
|
||||||
float max = -1;
|
|
||||||
for(unsigned int j = 0; j < 10; j++) {
|
|
||||||
// printf("%.2f ", pred[j]);
|
|
||||||
if(pred[j] > max || max < 0) {
|
|
||||||
max = pred[j];
|
|
||||||
predDigit = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
|
|
||||||
|
|
||||||
unsigned int actDigit = 0;
|
|
||||||
float maxA = -1;
|
|
||||||
for(unsigned int j = 0; j < 10; j++) {
|
|
||||||
// printf("%.2f ", pred[j]);
|
|
||||||
if(labels[idx].values[j] > maxA || maxA < 0) {
|
|
||||||
maxA = labels[idx].values[j];
|
|
||||||
actDigit = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(idx < 100) printf("Actual: %u\n", actDigit);
|
|
||||||
// printf("\n");
|
|
||||||
|
|
||||||
if(predDigit == actDigit) correct++;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
|
|
||||||
|
|
||||||
printf("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
void *calloc(size_t nmemb, size_t size) {
|
|
||||||
// printf("CALLOC\n");
|
|
||||||
return oldCalloc(nmemb, size);
|
|
||||||
}
|
|
31
src/cltest.c
31
src/cltest.c
@ -1,7 +1,9 @@
|
|||||||
|
#include <errno.h>
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
#include "clm.h"
|
#include "clm.h"
|
||||||
#include "clm_gpu.h"
|
#include "clm_gpu.h"
|
||||||
@ -105,7 +107,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
|
|||||||
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
|
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
|
||||||
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
|
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
|
||||||
if(!file) {
|
if(!file) {
|
||||||
perror("Failed to open labels\n");
|
printf("Failed to open labels: %s\n", strerror(errno));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -141,7 +143,7 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
|
|||||||
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
||||||
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
|
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
|
||||||
if(!file) {
|
if(!file) {
|
||||||
perror("Failed to open images\n");
|
printf("Failed to open images: %s\n", strerror(errno));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,8 +182,23 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
|||||||
*imageCountOut = length;
|
*imageCountOut = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main() {
|
int main(int argc, const char *argv[]) {
|
||||||
if(clm_gpuInit() != 0) {
|
if(argc < 2) {
|
||||||
|
printf("Usage: %s [cpu | opencl]\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int mode = 0;
|
||||||
|
if(strcmp(argv[1], "cpu") == 0) {
|
||||||
|
mode = CLM_MODE_CPU;
|
||||||
|
} else if(strcmp(argv[1], "opencl") == 0) {
|
||||||
|
mode = CLM_MODE_OPENCL;
|
||||||
|
} else {
|
||||||
|
printf("Invalid mode\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(clm_gpuInit(mode) != 0) {
|
||||||
printf("Failed to init GPU\n");
|
printf("Failed to init GPU\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -190,6 +207,10 @@ int main() {
|
|||||||
unsigned int labelCount;
|
unsigned int labelCount;
|
||||||
loadLabels(&labels, &labelCount);
|
loadLabels(&labels, &labelCount);
|
||||||
printf("LENGTH: %u\n", labelCount);
|
printf("LENGTH: %u\n", labelCount);
|
||||||
|
if(labelCount == 0) {
|
||||||
|
printf("Loaded no labels\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
clm_Vector *images = NULL;
|
clm_Vector *images = NULL;
|
||||||
unsigned int imageCount;
|
unsigned int imageCount;
|
||||||
@ -209,7 +230,7 @@ int main() {
|
|||||||
clm_Linear layers[] = {
|
clm_Linear layers[] = {
|
||||||
clm_linearCreateRandom(i, h),
|
clm_linearCreateRandom(i, h),
|
||||||
clm_linearCreateRandom(h, o)};
|
clm_linearCreateRandom(h, o)};
|
||||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
|
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
|
||||||
|
|
||||||
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
||||||
printf("Epoch %u\n", epoch);
|
printf("Epoch %u\n", epoch);
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
__kernel void do_stuff(__global float *input, __global float *output, unsigned int count) {
|
|
||||||
int i = get_global_id(0);
|
|
||||||
//printf("Task %d\n", i);
|
|
||||||
output[i] = input[i] * input[i];
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user