Improving parallelism in fragment shader leading to more than 20x the framerate
This commit is contained in:
@@ -27,7 +27,7 @@ const char* fragment_shader_kernel_source =
|
|||||||
"} \n"
|
"} \n"
|
||||||
" \n"
|
" \n"
|
||||||
"__kernel void fragment_shader ( \n"
|
"__kernel void fragment_shader ( \n"
|
||||||
" __global int* faces, \n"
|
" __global int3* faces, \n"
|
||||||
" __global float* vertices, \n"
|
" __global float* vertices, \n"
|
||||||
" __global int* pixels, \n"
|
" __global int* pixels, \n"
|
||||||
" __global int* screen_width, \n"
|
" __global int* screen_width, \n"
|
||||||
@@ -39,8 +39,9 @@ const char* fragment_shader_kernel_source =
|
|||||||
" __global float* norms_buff, \n"
|
" __global float* norms_buff, \n"
|
||||||
" __global uchar* diffuse_map \n"
|
" __global uchar* diffuse_map \n"
|
||||||
") { \n"
|
") { \n"
|
||||||
" int index = get_global_id(0); \n"
|
" int GROUP_ID = get_group_id(0); \n"
|
||||||
" \n"
|
" int GROUP_SIZE = get_local_size(0); \n"
|
||||||
|
" int LOCAL_ID = get_local_id(0); \n"
|
||||||
" \n"
|
" \n"
|
||||||
" bool out = true; \n"
|
" bool out = true; \n"
|
||||||
" float3 vertices3[3]; \n"
|
" float3 vertices3[3]; \n"
|
||||||
@@ -48,18 +49,18 @@ const char* fragment_shader_kernel_source =
|
|||||||
" float3 norms[3]; \n"
|
" float3 norms[3]; \n"
|
||||||
" \n"
|
" \n"
|
||||||
" for(int i = 0; i < 3; i++) { \n"
|
" for(int i = 0; i < 3; i++) { \n"
|
||||||
" float4 vertex = vertices [faces[(index * 9) + (3 * i) + (0)]]; \n"
|
" float4 vertex;// = vertices[faces[GROUP_ID * 3 + i ][0]]; \n"
|
||||||
" for(int j = 0; j < 4; j ++) { \n"
|
" for(int j = 0; j < 4; j ++) { \n"
|
||||||
" vertex[j] = vertices[4 * faces[(index*9) + (3 * i) + (0)] + j]; \n"
|
" vertex[j] = vertices[4 * faces[GROUP_ID*3 + i][0] + j]; \n"
|
||||||
" } \n"
|
" } \n"
|
||||||
" \n"
|
" \n"
|
||||||
" for(int j = 0; j < 2; j++) { \n"
|
" for(int j = 0; j < 2; j++) { \n"
|
||||||
" uv_coords[i][j] = uv_buffer [2 * faces[(index * 9) + (3 * i) + (1)] + j]; \n"
|
" uv_coords[i][j] = uv_buffer [2 * faces[GROUP_ID * 3 + i][1] + j]; \n"
|
||||||
" } \n"
|
" } \n"
|
||||||
" \n"
|
" \n"
|
||||||
" for( int j = 0; j < 3; j++ ) { \n"
|
" for( int j = 0; j < 3; j++ ) { \n"
|
||||||
" vertices3[j][i] = (vertex[j]/vertex[3]); \n"
|
" vertices3[j][i] = (vertex[j]/vertex[3]); \n"
|
||||||
" norms[j][i] = norms_buff[3 * (faces[(index * 9) + (3 * i) + (2)]) + j]; \n"
|
" norms[j][i] = norms_buff[3 * (faces[GROUP_ID * 3 + i][2]) + j]; \n"
|
||||||
" } \n"
|
" } \n"
|
||||||
" \n"
|
" \n"
|
||||||
" if ( vertices3[0][i] > 0 && vertices3[0][i] < *screen_width \n"
|
" if ( vertices3[0][i] > 0 && vertices3[0][i] < *screen_width \n"
|
||||||
@@ -82,10 +83,21 @@ const char* fragment_shader_kernel_source =
|
|||||||
" } \n"
|
" } \n"
|
||||||
" } \n"
|
" } \n"
|
||||||
" \n"
|
" \n"
|
||||||
" float3 point; \n"
|
" if(bounding_box_min[0] > *screen_width || bounding_box_max[0] < 0 || bounding_box_min[1] > *screen_width || bounding_box_max[1] < 0) return; \n"
|
||||||
" for(point[0] = bounding_box_min[0]; point[0] <= bounding_box_max[0]; point[0]++) { \n"
|
|
||||||
" for(point[1] = bounding_box_min[1]; point[1] <= bounding_box_max[1]; point[1]++) { \n"
|
|
||||||
" \n"
|
" \n"
|
||||||
|
" \n"
|
||||||
|
" int X_PER_ITEM = (int)(ceil((float)(bounding_box_max[0] - bounding_box_min[0]) / 16.f)); \n"
|
||||||
|
" int Y_PER_ITEM = (int)(ceil((float)(bounding_box_max[1] - bounding_box_min[1]) / 16.f)); \n"
|
||||||
|
" float STARTING_X = bounding_box_min[0] + X_PER_ITEM * (LOCAL_ID % 16); \n"
|
||||||
|
" float ENDING_X = STARTING_X + X_PER_ITEM; \n"
|
||||||
|
" float STARTING_Y = bounding_box_min[1] + Y_PER_ITEM * (LOCAL_ID / 16); \n"
|
||||||
|
" float ENDING_Y = STARTING_Y + Y_PER_ITEM; \n"
|
||||||
|
" \n"
|
||||||
|
" \n"
|
||||||
|
" float3 point; \n"
|
||||||
|
" for(point[0] = STARTING_X; point[0] <= ENDING_X; point[0]++) { \n"
|
||||||
|
" for(point[1] = STARTING_Y; point[1] <= ENDING_Y; point[1]++) { \n"
|
||||||
|
" if(point[1] >= *screen_width || point[1] >= *screen_width) break; \n"
|
||||||
" float3 bc_coord = barycentric(vertices3, point); \n"
|
" float3 bc_coord = barycentric(vertices3, point); \n"
|
||||||
" if (bc_coord[0] < 0 || bc_coord[1] < 0 || bc_coord[2] < 0) continue; \n"
|
" if (bc_coord[0] < 0 || bc_coord[1] < 0 || bc_coord[2] < 0) continue; \n"
|
||||||
" \n"
|
" \n"
|
||||||
@@ -114,13 +126,18 @@ const char* fragment_shader_kernel_source =
|
|||||||
" float3 normalized_norm = normalize(normal); \n"
|
" float3 normalized_norm = normalize(normal); \n"
|
||||||
" \n"
|
" \n"
|
||||||
" \n"
|
" \n"
|
||||||
" float intensity = clamp((dot(normalized_norm , (float3){light_dir[0], light_dir[1], light_dir[2]})), 0.f, 1.f); \n"
|
" float intensity = clamp((dot(normalized_norm , (float3){light_dir[0], light_dir[1], light_dir[2]})), 0.f, 1.f) + 0.2; \n"
|
||||||
" \n"
|
" \n"
|
||||||
" int color = 0; \n"
|
" int color = 0; \n"
|
||||||
" color |= ((int)fmin((float)(diffuse_map[col_index + 0]) * intensity, (float) 0xff)) << 16; \n"
|
" color |= ((int)fmin((float)(diffuse_map[col_index + 0]) * intensity, (float) 0xff)) << 16; \n"
|
||||||
" color |= ((int)fmin((float)(diffuse_map[col_index + 1]) * intensity, (float) 0xff)) << 8; \n"
|
" color |= ((int)fmin((float)(diffuse_map[col_index + 1]) * intensity, (float) 0xff)) << 8; \n"
|
||||||
" color |= ((int)fmin((float)(diffuse_map[col_index + 2]) * intensity, (float) 0xff)) << 0; \n"
|
" color |= ((int)fmin((float)(diffuse_map[col_index + 2]) * intensity, (float) 0xff)) << 0; \n"
|
||||||
" \n"
|
" \n"
|
||||||
|
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 16; \n"
|
||||||
|
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 8; \n"
|
||||||
|
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 0; \n"
|
||||||
|
" \n"
|
||||||
|
" \n"
|
||||||
" z_buffer[(int) (point[0] + point[1] * *screen_width)] = point[2]; \n"
|
" z_buffer[(int) (point[0] + point[1] * *screen_width)] = point[2]; \n"
|
||||||
" pixels [(int) (point[0] + point[1] * *screen_width)] = color;// & 0x00ffffff; \n"
|
" pixels [(int) (point[0] + point[1] * *screen_width)] = color;// & 0x00ffffff; \n"
|
||||||
" } \n"
|
" } \n"
|
||||||
|
|||||||
@@ -107,7 +107,7 @@ void vertex_shader(float* z, float* vertices, int vertex_count, float* new_verti
|
|||||||
{
|
{
|
||||||
mat_z = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE , NULL, &err);
|
mat_z = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE , NULL, &err);
|
||||||
vertices_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * vertex_count * 3, NULL, &err);
|
vertices_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * vertex_count * 3, NULL, &err);
|
||||||
new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * vertex_count * 4, NULL, &err);
|
new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * vertex_count, NULL, &err);
|
||||||
|
|
||||||
err = clSetKernelArg(vertex_shader_kernel, 0, sizeof(cl_mem), &mat_z);
|
err = clSetKernelArg(vertex_shader_kernel, 0, sizeof(cl_mem), &mat_z);
|
||||||
err = clSetKernelArg(vertex_shader_kernel, 1, sizeof(cl_mem), &vertices_mem);
|
err = clSetKernelArg(vertex_shader_kernel, 1, sizeof(cl_mem), &vertices_mem);
|
||||||
@@ -134,7 +134,7 @@ void clear(cl_mem* buffer, size_t size, const int pattern) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void fragment_shader(
|
void fragment_shader(
|
||||||
int* faces,
|
cl_int3* faces,
|
||||||
int nfaces,
|
int nfaces,
|
||||||
float* uv,
|
float* uv,
|
||||||
size_t uv_size,
|
size_t uv_size,
|
||||||
@@ -149,7 +149,7 @@ void fragment_shader(
|
|||||||
) {
|
) {
|
||||||
if (!fragment_shader_buffers_initialized) {
|
if (!fragment_shader_buffers_initialized) {
|
||||||
|
|
||||||
faces_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) * 3 * 3 * nfaces , NULL, &err);
|
faces_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_int3) * 3 * nfaces , NULL, &err);
|
||||||
pixel_data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * screen_height * screen_width , NULL, &err);
|
pixel_data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * screen_height * screen_width , NULL, &err);
|
||||||
screen_width_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) , NULL, &err);
|
screen_width_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) , NULL, &err);
|
||||||
z_buffer_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * screen_height * screen_width, NULL, &err);
|
z_buffer_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * screen_height * screen_width, NULL, &err);
|
||||||
@@ -174,7 +174,7 @@ void fragment_shader(
|
|||||||
err = clSetKernelArg(fragment_shader_kernel, 10, sizeof(cl_mem), &diffuse_map_buffer);
|
err = clSetKernelArg(fragment_shader_kernel, 10, sizeof(cl_mem), &diffuse_map_buffer);
|
||||||
|
|
||||||
err = clEnqueueWriteBuffer(commands, screen_width_mem , CL_FALSE, 0, sizeof(int) , &screen_width , 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(commands, screen_width_mem , CL_FALSE, 0, sizeof(int) , &screen_width , 0, NULL, NULL);
|
||||||
err = clEnqueueWriteBuffer(commands, faces_buffer , CL_FALSE, 0, sizeof(int) * 3 * 3 * nfaces , faces , 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(commands, faces_buffer , CL_FALSE, 0, sizeof(cl_int3) * 3 * nfaces , faces , 0, NULL, NULL);
|
||||||
err = clEnqueueWriteBuffer(commands, nfaces_mem , CL_FALSE, 0, sizeof(int) , &nfaces , 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(commands, nfaces_mem , CL_FALSE, 0, sizeof(int) , &nfaces , 0, NULL, NULL);
|
||||||
err = clEnqueueWriteBuffer(commands, uv_buffer , CL_FALSE, 0, uv_size , uv , 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(commands, uv_buffer , CL_FALSE, 0, uv_size , uv , 0, NULL, NULL);
|
||||||
err = clEnqueueWriteBuffer(commands, map_size_buffer , CL_FALSE, 0, sizeof(int) * 2 , map_size , 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(commands, map_size_buffer , CL_FALSE, 0, sizeof(int) * 2 , map_size , 0, NULL, NULL);
|
||||||
@@ -190,7 +190,8 @@ void fragment_shader(
|
|||||||
clear(&z_buffer_mem, sizeof(float) * screen_width * screen_height, 0);
|
clear(&z_buffer_mem, sizeof(float) * screen_width * screen_height, 0);
|
||||||
|
|
||||||
|
|
||||||
size_t fragment_shader_global[] = { nfaces };
|
size_t fragment_shader_global[] = { nfaces * 256 };
|
||||||
|
size_t framgent_shader_local[] = { 256 };
|
||||||
|
|
||||||
err = clEnqueueNDRangeKernel(commands, fragment_shader_kernel, 1, NULL, fragment_shader_global, NULL, 0, NULL, NULL);
|
err = clEnqueueNDRangeKernel(commands, fragment_shader_kernel, 1, NULL, fragment_shader_global, NULL, 0, NULL, NULL);
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ extern const char* fragment_shader_kernel_source;
|
|||||||
void init_kernels();
|
void init_kernels();
|
||||||
void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices);
|
void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices);
|
||||||
void fragment_shader(
|
void fragment_shader(
|
||||||
int* faces,
|
cl_int3* faces,
|
||||||
int nfaces,
|
int nfaces,
|
||||||
float* uv,
|
float* uv,
|
||||||
size_t uv_size,
|
size_t uv_size,
|
||||||
|
|||||||
+2
-2
@@ -75,14 +75,14 @@ bool HandleButtonPressed() {
|
|||||||
|
|
||||||
void CALLBACK FixedUpdate(HWND hwnd, UINT message, UINT uInt, DWORD dWord)
|
void CALLBACK FixedUpdate(HWND hwnd, UINT message, UINT uInt, DWORD dWord)
|
||||||
{
|
{
|
||||||
TIME += 0.167;
|
TIME += 0.03333333;
|
||||||
HandleButtonPressed();
|
HandleButtonPressed();
|
||||||
camera.ApplyChanges();
|
camera.ApplyChanges();
|
||||||
clock_t start = clock();
|
clock_t start = clock();
|
||||||
render();
|
render();
|
||||||
|
Update();
|
||||||
clock_t end = clock();
|
clock_t end = clock();
|
||||||
printf("FPS: %f\n", 1/((float)(end-start)/CLOCKS_PER_SEC));
|
printf("FPS: %f\n", 1/((float)(end-start)/CLOCKS_PER_SEC));
|
||||||
Update();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LRESULT CALLBACK WndProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
|
LRESULT CALLBACK WndProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
|
||||||
|
|||||||
+11
-8
@@ -14,9 +14,9 @@
|
|||||||
#define VERTICAL_CAMERA_CLAMP_UP 90
|
#define VERTICAL_CAMERA_CLAMP_UP 90
|
||||||
#define VERTICAL_CAMERA_CLAMP_DOWN -90
|
#define VERTICAL_CAMERA_CLAMP_DOWN -90
|
||||||
#define NEAR_CLIP_PLANE 1.f
|
#define NEAR_CLIP_PLANE 1.f
|
||||||
#define FAR_CLIP_PLANE 200.0f
|
#define FAR_CLIP_PLANE 2000.0f
|
||||||
#define FOV 50
|
#define FOV 50
|
||||||
#define CAMERA_MOVEMENT_SPEED .1f
|
#define CAMERA_MOVEMENT_SPEED .7f
|
||||||
#define DEFAULT_CAMERA_POS Vec3f(0, 0, 5)
|
#define DEFAULT_CAMERA_POS Vec3f(0, 0, 5)
|
||||||
#define DEFAULT_CAMERA_ROT Vec3f(0, 0, 0)
|
#define DEFAULT_CAMERA_ROT Vec3f(0, 0, 0)
|
||||||
#define LIGHT_INTENSITY 1.5
|
#define LIGHT_INTENSITY 1.5
|
||||||
@@ -30,13 +30,13 @@ Matrix ViewPort = Matrix::identity();
|
|||||||
Matrix ModelView = Matrix::identity();
|
Matrix ModelView = Matrix::identity();
|
||||||
Matrix Projection = Matrix::identity();
|
Matrix Projection = Matrix::identity();
|
||||||
|
|
||||||
Model* model = new Model("african_head.obj");
|
Model* model = new Model("sakura.obj");
|
||||||
Camera camera;
|
Camera camera;
|
||||||
|
|
||||||
Vec3f light_dir = Vec3f(1, 1, 1).normalize();
|
Vec3f light_dir = Vec3f(1, 1, 1).normalize();
|
||||||
|
|
||||||
float* new_verts = (float*)malloc(4 * sizeof(float) * model->nverts());
|
float* new_verts = (float*)malloc(4 * sizeof(float) * model->nverts());
|
||||||
int* faces = (int*)malloc(3 * 3 * sizeof(int) * model->nfaces());
|
cl_int3* faces = (cl_int3*)malloc(3 * sizeof(cl_int3) * model->nfaces());
|
||||||
|
|
||||||
bool init_flag = false;
|
bool init_flag = false;
|
||||||
|
|
||||||
@@ -96,10 +96,13 @@ void render()
|
|||||||
if (!init_flag) {
|
if (!init_flag) {
|
||||||
//light_dir = camera.GetForward().normalize() * -1;
|
//light_dir = camera.GetForward().normalize() * -1;
|
||||||
viewport(0, 0, screen_width, screen_height, FAR_CLIP_PLANE, NEAR_CLIP_PLANE);
|
viewport(0, 0, screen_width, screen_height, FAR_CLIP_PLANE, NEAR_CLIP_PLANE);
|
||||||
for (int i = 0; i < model->nfaces(); i++)
|
for (int i = 0; i < model->nfaces(); i++) {
|
||||||
for (int j = 0; j < 3; j++)
|
for (int j = 0; j < 3; j++) {
|
||||||
for (int k = 0; k < 3; k++)
|
faces[i * 3 + j].x = model->faces_[i][j][0];
|
||||||
faces[i * 9 + j * 3 + k] = model->faces_[i][j][k];
|
faces[i * 3 + j].y = model->faces_[i][j][1];
|
||||||
|
faces[i * 3 + j].z = model->faces_[i][j][2];
|
||||||
|
}
|
||||||
|
}
|
||||||
init_flag = true;
|
init_flag = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Reference in New Issue
Block a user