the engineer in me!
My Projects
2. Pager (ESP32 + OLED)
Embedded C, circuit design & power design
- Communicates using Morse code
- Wi-Fi connected
- OLED display
- 4 buttons only!
3. Missile Command in Linux Kernel
x86, Assembly, C, Linux, RTC Interrupt request & Synchronization
- Text-Video Mode version of arcade game Missile Command.
- Driven by the RTC interrupt.
- Runs on Linux Kernel in VM.
4. Tiled Matrix Multiplication in CUDA
C, NVIDIA CUDA, Parallel Programming
#include <wb.h>

// Abort main() with -1 and log the failing statement on any CUDA error.
#define wbCheck(stmt)                                                   \
  do {                                                                  \
    cudaError_t err = stmt;                                             \
    if (err != cudaSuccess) {                                           \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));     \
      return -1;                                                        \
    }                                                                   \
  } while (0)

// Square tile edge; the launch below uses TILE_WIDTH x TILE_WIDTH blocks.
#define TILE_WIDTH 8

// Compute C = A * B using shared-memory tiling.
// Expects a 2-D launch with blockDim = (TILE_WIDTH, TILE_WIDTH, 1) and
// gridDim = (ceil(numCColumns/TILE_WIDTH), ceil(numCRows/TILE_WIDTH), 1).
__global__ void matrixMultiply(float *A, float *B, float *C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
  // One tile of A and one tile of B staged in shared memory per iteration.
  __shared__ float subTileA[TILE_WIDTH][TILE_WIDTH];
  __shared__ float subTileB[TILE_WIDTH][TILE_WIDTH];

  int threadX = threadIdx.x;
  int threadY = threadIdx.y;
  int curRow  = blockIdx.y * TILE_WIDTH + threadY; // row of C this thread owns
  int curCol  = blockIdx.x * TILE_WIDTH + threadX; // column of C this thread owns

  float mySum = 0.0f;
  // Exact integer ceil-division: number of tiles along the shared dimension.
  int numTiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;

  for (int i = 0; i < numTiles; i++) {
    // Stage one tile of A; zero-pad out-of-range elements so the inner
    // product below needs no per-element bounds checks.
    if (curRow < numARows && (i * TILE_WIDTH + threadX) < numAColumns) {
      subTileA[threadY][threadX] =
          A[curRow * numAColumns + i * TILE_WIDTH + threadX];
    } else {
      subTileA[threadY][threadX] = 0.0f;
    }
    // Stage one tile of B, likewise zero-padded.
    if (curCol < numBColumns && (i * TILE_WIDTH + threadY) < numBRows) {
      subTileB[threadY][threadX] =
          B[(i * TILE_WIDTH + threadY) * numBColumns + curCol];
    } else {
      subTileB[threadY][threadX] = 0.0f;
    }
    __syncthreads(); // both tiles fully written before any thread reads them

    for (int j = 0; j < TILE_WIDTH; j++) {
      mySum += subTileA[threadY][j] * subTileB[j][threadX];
    }
    __syncthreads(); // all reads done before the next iteration overwrites
  }

  // Only threads mapped to a valid C element write a result.
  if (curRow < numCRows && curCol < numCColumns) {
    C[curRow * numCColumns + curCol] = mySum;
  }
}

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostA;   // The A matrix
  float *hostB;   // The B matrix
  float *hostC;   // The output C matrix
  float *deviceA;
  float *deviceB;
  float *deviceC;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;    // number of rows in the matrix C
  int numCColumns; // number of columns in the matrix C

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows,
                            &numAColumns);
  hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows,
                            &numBColumns);
  // C = A * B, so C is (rows of A) x (columns of B).
  numCRows    = numARows;
  numCColumns = numBColumns;
  hostC = (float *)malloc(numCRows * numCColumns * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
  wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

  wbTime_start(GPU, "Allocating GPU memory.");
  size_t sizeA = numARows * numAColumns * sizeof(float);
  size_t sizeB = numBRows * numBColumns * sizeof(float);
  size_t sizeC = numCRows * numCColumns * sizeof(float);
  wbCheck(cudaMalloc((void **)&deviceA, sizeA));
  wbCheck(cudaMalloc((void **)&deviceB, sizeB));
  wbCheck(cudaMalloc((void **)&deviceC, sizeC));
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  wbCheck(cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  // One thread per element of C; round the grid up to cover the edges.
  dim3 DimGrid(ceil(numCColumns / (float)TILE_WIDTH),
               ceil(numCRows / (float)TILE_WIDTH), 1);
  dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

  wbTime_start(Compute, "Performing CUDA computation");
  matrixMultiply<<<DimGrid, DimBlock>>>(deviceA, deviceB, deviceC,
                                        numARows, numAColumns,
                                        numBRows, numBColumns,
                                        numCRows, numCColumns);
  wbCheck(cudaGetLastError());        // catch bad launch configuration
  wbCheck(cudaDeviceSynchronize());   // catch asynchronous kernel errors
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  wbCheck(cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  wbCheck(cudaFree(deviceC));
  wbCheck(cudaFree(deviceB));
  wbCheck(cudaFree(deviceA));
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostC, numCRows, numCColumns);

  free(hostA);
  free(hostB);
  free(hostC);
  return 0;
}
4. Custom VPS set-up for testing
Linux, Git, Networking
- Installed Linux Ubuntu 18.04 Bionic Beaver
- Using Apache
- Running Node.js
- Testing for Flutter Web