diff --git a/OpenWindow/OpenWindow.vcxproj b/OpenWindow/OpenWindow.vcxproj
index 92d4ff0..d6e46ec 100644
--- a/OpenWindow/OpenWindow.vcxproj
+++ b/OpenWindow/OpenWindow.vcxproj
@@ -78,9 +78,11 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(ProjectDir)\include</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4996;</DisableSpecificWarnings>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <AdditionalLibraryDirectories>$(ProjectDir)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <SubSystem>NotSet</SubSystem>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -91,9 +93,11 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(ProjectDir)\include</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4996;</DisableSpecificWarnings>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <AdditionalLibraryDirectories>$(ProjectDir)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <SubSystem>NotSet</SubSystem>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -106,11 +110,13 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(ProjectDir)\include</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4996;</DisableSpecificWarnings>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <AdditionalLibraryDirectories>$(ProjectDir)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <SubSystem>NotSet</SubSystem>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -123,11 +129,13 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(ProjectDir)\include</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4996;</DisableSpecificWarnings>
+      <PreprocessorDefinitions>_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
       <OptimizeReferences>true</OptimizeReferences>
       <AdditionalLibraryDirectories>$(ProjectDir)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <SubSystem>NotSet</SubSystem>
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
diff --git a/OpenWindow/kernel_sources.cpp b/OpenWindow/kernel_sources.cpp
index dcda827..f1e480a 100644
--- a/OpenWindow/kernel_sources.cpp
+++ b/OpenWindow/kernel_sources.cpp
@@ -1,46 +1,16 @@
 #include "kernels.h"
 
-const char* matrix_mul_kernel_source =
-"__kernel void mmul (                                       \n"
-"    __global float* A,                                \n"
-"    __global float* B,                                \n"
-"    __global float* C)                                \n"
-"{                                                     \n"
-"    int k;                                            \n"
-"    int i = get_global_id(0);                         \n"
-"    int j = get_global_id(1);                         \n"
-"    float tmp;                                        \n"
-"    if( (i < 4) && (j < 4) )                          \n"
-"    {                                                 \n"
-"        tmp = 0.0;                                    \n"
-"        for(k = 0; k < 4; k++)                        \n"
-"            tmp += A[i*4+k] * B[k*4+j];               \n"
-"        C[i*4+j] = tmp;                               \n"
-"    }                                                 \n"
-"}                                                     \n" ;
-
 const char* vertex_shader_kernel_source =
-"__kernel                                                           \n"
-"void vertex_shader( __global float* z_matrix,                      \n"
-"                    __global float* VertexBuffer,                  \n"
-"                    __global float* NewVertexBuffer)               \n"
-"{                                                                  \n"
-"    int index = 3 * get_global_id(0);                              \n"
-"    float new_x = 0;                                               \n"
-"    float new_y = 0;                                               \n"
-"    float new_z = 0;                                               \n"
-"    float new_w = 0;                                               \n"
-"    for(int i = 0; i < 3; i++) {                                   \n"
-"        new_x = new_x + VertexBuffer[index+i]*z_matrix[0*4 + i];   \n"
-"        new_y = new_y + VertexBuffer[index+i]*z_matrix[1*4 + i];   \n"
-"        new_z = new_z + VertexBuffer[index+i]*z_matrix[2*4 + i];   \n"
-"        new_w = new_w + VertexBuffer[index+i]*z_matrix[3*4 + i];   \n"
-"    }                                                              \n"
-"    new_x = new_x + z_matrix[0*4 +3];                              \n"
-"    new_y = new_y + z_matrix[1*4 +3];                              \n"
-"    new_z = new_z + z_matrix[2*4 +3];                              \n"
-"    new_w = new_w + z_matrix[3*4 +3];                              \n"
-"    NewVertexBuffer[index+0] = new_x / new_w;                      \n"
-"    NewVertexBuffer[index+1] = new_y / new_w;                      \n"
-"    NewVertexBuffer[index+2] = new_z / new_w;                      \n"
-"}                                                                  \n";
+"__kernel                                                                                                                    \n"
+"void vertex_shader( __global float* m,                                                                                      \n"
+"                    __global float* VertexBuffer,                                                                           \n"
+"                    __global float* NewVertexBuffer)                                                                        \n"
+"{                                                                                                                           \n"
+"    int local_index = get_local_id(0);                                                                                      \n"
+"    int global_index = get_group_id(0);                                                                                     \n"
+"    NewVertexBuffer[4*global_index+local_index] =                                                                           \n"
+"						  m[local_index*4]*VertexBuffer[3*global_index]                                                      \n"
+"	                    + m[local_index*4 + 1]*VertexBuffer[3*global_index+1]                                                \n"
+"	                    + m[local_index*4 + 2]*VertexBuffer[3*global_index+2]                                                \n"
+"	                    + m[local_index*4 + 3];                                                                              \n"
+"}                                                                                                                           \n";
diff --git a/OpenWindow/kernels.cpp b/OpenWindow/kernels.cpp
index 1e4b9f8..c2e3f48 100644
--- a/OpenWindow/kernels.cpp
+++ b/OpenWindow/kernels.cpp
@@ -11,12 +11,7 @@ cl_context context;
 
 
 // Matrix Multiplication Variables
-#define ORDER 4
 #define MATRIX_SIZE 16
-int Mdim, Pdim, Ndim;
-cl_mem mat_a;
-cl_mem mat_b;
-cl_mem mat_c;
 
 // vertex shader
 cl_mem mat_z;
@@ -27,15 +22,10 @@ cl_mem new_vertices_mem;
 cl_platform_id* platforms = NULL;
 cl_device_id* devices = NULL;
 
-cl_program mat_mul_prog;
 cl_program vertex_shader_prog;
-cl_kernel mat_mul_kernel;
 cl_kernel vertex_shader_kernel;
 cl_command_queue commands;
 
-size_t mat_mul_global[DIM];
-size_t mat_mul_local[DIM];
-
 int err;
 
 
@@ -53,78 +43,43 @@ void init_kernels() {
 
 	commands = clCreateCommandQueue(context, devices[0], 0, &err);
 
-	mat_a = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE, NULL, &err);
-	mat_b = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE, NULL, &err);
-	mat_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * MATRIX_SIZE, NULL, &err);
-
-
-	mat_mul_prog = clCreateProgramWithSource(context, 1, (const char **)&matrix_mul_kernel_source, NULL, &err);
 	vertex_shader_prog = clCreateProgramWithSource(context, 1, (const char **)&vertex_shader_kernel_source, NULL, &err);
-	err = clBuildProgram(mat_mul_prog, 1, devices, NULL, NULL, NULL);
 	err = clBuildProgram(vertex_shader_prog, 1, devices, NULL, NULL, NULL);
 
 	if (err != CL_SUCCESS)
 	{
 		size_t len;
 		char buffer[2048];
-		OutputDebugString("Error: Failed to build program executable!\n");
-		clGetProgramBuildInfo(mat_mul_prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
-		OutputDebugString(buffer);
+		printf("Error: Failed to build program executable!\n");
+		if (clGetProgramBuildInfo(vertex_shader_prog, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len) == CL_SUCCESS) // buffer is only valid when the query succeeds
+			printf("%s\n", buffer); // print the log as data, never as a format string
 		return;
 	}
 
-	mat_mul_kernel = clCreateKernel(mat_mul_prog, "mmul", &err);
 	vertex_shader_kernel = clCreateKernel(vertex_shader_prog, "vertex_shader", &err);
 	err = 0;
-	err |= clSetKernelArg(mat_mul_kernel, 0, sizeof(cl_mem), &mat_a);
-	err |= clSetKernelArg(mat_mul_kernel, 1, sizeof(cl_mem), &mat_b);
-	err |= clSetKernelArg(mat_mul_kernel, 2, sizeof(cl_mem), &mat_c);
-	
 }
 
-void mat4_mul(float* A, float* B, float* C)
-{
-	err = clEnqueueWriteBuffer(commands, mat_a, CL_TRUE, 0, sizeof(float) * MATRIX_SIZE, A, 0, NULL, NULL);
-	err = clEnqueueWriteBuffer(commands, mat_b, CL_TRUE, 0, sizeof(float) * MATRIX_SIZE, B, 0, NULL, NULL);
-
-	mat_mul_global[0] = ORDER;
-	mat_mul_global[1] = ORDER;
-
-	err = clEnqueueNDRangeKernel(commands, mat_mul_kernel, 2, NULL, mat_mul_global, NULL, 0, NULL, NULL);
-	
-	clFlush(commands);
-	clFinish(commands);
-
-	err = clEnqueueReadBuffer(commands, mat_c, CL_TRUE, 0, sizeof(float) * MATRIX_SIZE, C, 0, NULL, NULL);
-}
-
-void vertex_shader(Matrix* ViewPort, Matrix* Projection, Matrix* ModelView, Matrix* ModelTransform, float* vertices, int vertex_count, float* new_vertices) {
-
-	mat_z = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * MATRIX_SIZE, NULL, &err);
-	vertices_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * vertex_count * 3, NULL, &err);
-	new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * vertex_count * 3, NULL, &err);
+void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices) {
+	mat_z            = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE     , NULL, &err);
+	vertices_mem     = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * vertex_count * 3, NULL, &err);
+	new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * vertex_count * 4, NULL, &err);
 
 	err = clSetKernelArg(vertex_shader_kernel, 0, sizeof(cl_mem), &mat_z);
 	err = clSetKernelArg(vertex_shader_kernel, 1, sizeof(cl_mem), &vertices_mem);
 	err = clSetKernelArg(vertex_shader_kernel, 2, sizeof(cl_mem), &new_vertices_mem);
 
-
-	Matrix intermediate_z;
-	{ // Calculate the Z to be multiplied by each vertex
-		mat4_mul((float*)ViewPort       , (float*)Projection    , (float*) &intermediate_z);
-		mat4_mul((float*)&intermediate_z, (float*)ModelView     , (float*) &intermediate_z);
-		mat4_mul((float*)&intermediate_z, (float*)ModelTransform, (float*) &intermediate_z);
-	}
-
-	err = clEnqueueWriteBuffer(commands, mat_z, CL_TRUE, 0, sizeof(float) * MATRIX_SIZE, (float*)&intermediate_z, 0, NULL, NULL);
+	err = clEnqueueWriteBuffer(commands, mat_z       , CL_TRUE, 0, sizeof(float) * MATRIX_SIZE, z, 0, NULL, NULL);
 	err = clEnqueueWriteBuffer(commands, vertices_mem, CL_TRUE, 0, sizeof(float) * vertex_count * 3, vertices, 0, NULL, NULL);
 
-	size_t vertex_shader_global[] = { vertex_count };
-	size_t vertex_shader_local[] = { 16 };
+	size_t vertex_shader_global[] = { static_cast<size_t>(vertex_count) * 4 }; // widen before multiplying; avoids int->size_t narrowing inside braces
+	size_t vertex_shader_local[] = { 4 };
 
-	err = clEnqueueNDRangeKernel(commands, vertex_shader_kernel, 1, NULL, vertex_shader_global, NULL, 0, NULL, NULL);
+	err = clEnqueueNDRangeKernel(commands, vertex_shader_kernel, 1, NULL, vertex_shader_global, vertex_shader_local, 0, NULL, NULL);
 
-	err = clEnqueueReadBuffer(commands, new_vertices_mem, CL_TRUE, 0, sizeof(float) * vertex_count * 3, new_vertices, 0, NULL, NULL);
+	clFlush(commands);
+	clFinish(commands);
+	err = clEnqueueReadBuffer(commands, new_vertices_mem, CL_TRUE, 0, sizeof(float) * vertex_count * 4, new_vertices, 0, NULL, NULL);
 
 	clReleaseMemObject(mat_z);
 	clReleaseMemObject(vertices_mem);
@@ -133,13 +88,8 @@ void vertex_shader(Matrix* ViewPort, Matrix* Projection, Matrix* ModelView, Matr
 
 void destroy_kernels()
 {
-	clReleaseProgram(mat_mul_prog);
 	clReleaseProgram(vertex_shader_prog);
-	clReleaseKernel(mat_mul_kernel);
 	clReleaseKernel(vertex_shader_kernel);
-	clReleaseMemObject(mat_a);
-	clReleaseMemObject(mat_b);
-	clReleaseMemObject(mat_c);
 //	clReleaseMemObject(mat_z);
 //	clReleaseMemObject(vertices_mem);
 //	clReleaseMemObject(new_vertices_mem);
diff --git a/OpenWindow/kernels.h b/OpenWindow/kernels.h
index f354680..da5d50d 100644
--- a/OpenWindow/kernels.h
+++ b/OpenWindow/kernels.h
@@ -39,7 +39,7 @@ extern const char* vertex_shader_kernel_source;
 
 void init_kernels();
 void mat4_mul(float* A, float* B, float* C);
-void vertex_shader(Matrix* ViewPort, Matrix* Projection, Matrix* ModelView, Matrix* ModelTransform, float* vertices, int vertex_count, float* new_vertices);
+void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices);
 void destroy_kernels();
 
 
diff --git a/OpenWindow/main.cpp b/OpenWindow/main.cpp
index 66261ca..8f79185 100644
--- a/OpenWindow/main.cpp
+++ b/OpenWindow/main.cpp
@@ -21,6 +21,12 @@ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine
 
    hwnd = create_window(hInstance);
    ShowCursor(false);
+
+    AllocConsole();
+	freopen("CONIN$", "r",stdin);
+	freopen("CONOUT$", "w",stdout);
+	freopen("CONOUT$", "w",stderr);
+
    ShowWindow(hwnd, nCmdShow);
 
    init_camera();
diff --git a/OpenWindow/renderer.cpp b/OpenWindow/renderer.cpp
index 6f4fa72..c59352b 100644
--- a/OpenWindow/renderer.cpp
+++ b/OpenWindow/renderer.cpp
@@ -5,6 +5,7 @@
 #include "util_renderer.h"
 #include "CL/cl.h"
 #include "kernels.h"
+#include <ctime>
 
 #pragma comment (lib, "x86_64/opencl.lib")
 
@@ -34,6 +35,8 @@ Camera camera;
 
 Vec3f light_dir = Vec3f(1, 1, 1).normalize();
 
+float* new_verts = (float*)malloc(4 * sizeof(float) * model->nverts());
+
 void init_camera() {
 	camera.SetPosition(DEFAULT_CAMERA_POS);
 	camera.SetRotation(DEFAULT_CAMERA_ROT);
@@ -98,28 +101,20 @@ void render()
 	}
 	
 	{
-		model->rotate(Vec3f(0, 0, 90));
-		model->ApplyTransform();
+		//model->rotate(Vec3f(0, 0, 90));
+		//model->ApplyTransform();
 	}
 
-	//Matrix z = ViewPort * Projection * ModelView * model->Transform;
-
 	clear_zbuffer();
 	TextureShader shader;
 	shader.uniform_m =   (Projection);
 	shader.uniform_mit = (Projection).invert_transpose();
 
-	//Matrix intermediate_z;
-	//mat4_mul((float*) &ViewPort, (float*) &Projection, (float*) &intermediate_z);
-	//mat4_mul((float*)&intermediate_z, (float*)&ModelView, (float*) &intermediate_z);
-	//mat4_mul((float*)&intermediate_z, (float*)&model->Transform, (float*) &intermediate_z);
-	//shader.z = intermediate_z;
+	Matrix z = ViewPort * Projection * ModelView * model->Transform;
 
 
-	//std::vector<Vec3f> new_verts = model->verts_;
-	Vec3f* new_verts = (Vec3f*)malloc(3 * sizeof(float) * model->nverts());
-	vertex_shader(&ViewPort, &Projection, &ModelView, &model->Transform, (float*)&model->verts_, model->nverts(), (float*)new_verts);
-
+	// Vertex Shader: Should be called per model (vertex storage is read through data(), not the container's internal layout)
+	vertex_shader((float*)&z, (float*)model->verts_.data(), model->nverts(), new_verts);
 
 	#pragma omp parallel for
 	for (int i = 0; i < model->nfaces(); i++) {
@@ -127,9 +122,7 @@ void render()
 		bool out = true;
 		#pragma omp parallel for
 		for (int j = 0; j < 3; j++) {
-			//screen_coords[j] = model->vert(i, j);
-			//screen_coords[j] = shader.vertex(i, j);
-			screen_coords[j] = new_verts[model->faces_[i][j][0]];
+			screen_coords[j] = ((Vec4f*)new_verts)[model->faces_[i][j][0]];
 			Vec3f screen3(screen_coords[j]);
 
 			shader.varying_uv_coords.set_col(j, model->uv(i, j));
@@ -138,7 +131,5 @@ void render()
 		if(!out)
 			triangle(screen_coords, shader);
 	}
-
-	free(new_verts);
 }