基于CPU SIMD和Winograd的卷积计算加速技术
OpenCV findContours函数原理，以及Python NumPy实现
microsoft/DirectXMath GitHub SIMD
/**
 * Wraps a coordinate pair onto the image in place (toroidal addressing).
 *
 * After the call, x lies in [0, image.numRows()) and y lies in
 * [0, image.numCols()), for any (possibly negative) input values.
 *
 * @param image Matrix whose dimensions define the wrap-around period.
 * @param x     Row coordinate, wrapped in place.
 * @param y     Column coordinate, wrapped in place.
 *
 * If either dimension is not positive there is no valid pixel to map to; the
 * coordinates are left untouched instead of dividing by zero.
 */
void GetPixelWrapAround(const Matrix<double>& image, int& x, int& y)
{
    const int w = image.numRows();
    const int h = image.numCols();
    if (w <= 0 || h <= 0)
    {
        return;
    }

    // `%` keeps the sign of the dividend in C++, so add the period once and
    // reduce again to obtain a non-negative result.
    x = (x % w + w) % w;
    y = (y % h + h) % h;
}

/**
 * Convolves srcIamge with kernel using wrap-around borders and writes the
 * result to desImage.
 *
 * For every pixel (i, j) of the source, the kernel is centred on the pixel and
 * each tap is multiplied with the source value at the wrapped coordinate; the
 * sum of the products is stored at desImage(i, j).
 *
 * @param srcIamge Source image (read through numRows/numCols/get).
 * @param desImage Destination image; element (i, j) is set for every source
 *                 pixel. NOTE(review): its size is not checked here - it is
 *                 presumably at least as large as the source; confirm at the
 *                 call site.
 * @param kernel   Convolution kernel. Only numCols() is queried, so the kernel
 *                 is treated as square (numCols x numCols).
 */
void gaussianConvolution(Matrix<double>& srcIamge, Matrix<double>& desImage, Matrix<double>& kernel)
{
    const int kernelSize = kernel.numCols();

    // Convolution padding: offset from the output pixel to the first kernel tap.
    const int startOffset = -1 * int(kernelSize / 2);

    for (int i = 0; i < srcIamge.numRows(); i++)
    {
        for (int j = 0; j < srcIamge.numCols(); j++)
        {
            double blurredPixel = 0.0;
            for (int kx = 0; kx < kernelSize; kx++)
            {
                for (int ky = 0; ky < kernelSize; ky++)
                {
                    int x = i + startOffset + kx;
                    int y = j + startOffset + ky;
                    // Taps that fall outside the image wrap to the opposite edge.
                    GetPixelWrapAround(srcIamge, x, y);
                    blurredPixel += kernel.get(kx, ky) * srcIamge.get(x, y);
                }
            }
            desImage.set(i, j, blurredPixel);
        }
    }
}
/**
 * Wrap-around convolution of srcImage with kernel, processing four kernel
 * columns per step with 256-bit vector multiplies.
 *
 * Despite the "SSE" in the name, the implementation uses the AVX intrinsics
 * (__m256d / _mm256_*), so the translation unit must be built for an
 * AVX-capable target.
 *
 * @param srcImage Source image (read through numRows/numCols/get).
 * @param desImage Destination image; element (i, j) is set for every source
 *                 pixel. NOTE(review): its size is not checked here.
 * @param kernel   Convolution kernel. Only numCols() is queried, so the kernel
 *                 is treated as square (numCols x numCols).
 *
 * Border handling is toroidal. The wrap is computed with a true non-negative
 * modulo, so it stays in range even when half the kernel is wider than the
 * image (the previous `(v + n) % n` form produced negative indices then).
 */
void greenNoise::gaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{
    const int kernelSize = kernel.numCols();
    const int width = srcImage.numRows();
    const int height = srcImage.numCols();
    const int startOffset = -1 * static_cast<int>(kernelSize / 2);

    // Kernel columns [0, vectorEnd) are handled four at a time; the rest one by one.
    const int vectorEnd = kernelSize - kernelSize % 4;

    // Non-negative modulo; only called with n > 0 (the loops below do not run otherwise).
    const auto wrap = [](int v, int n) { return ((v % n) + n) % n; };

    double lanes[4];
    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < height; j++)
        {
            double blurredPixel = 0.0;
            for (int kx = 0; kx < kernelSize; kx++)
            {
                const int x = wrap(i + startOffset + kx, width);

                for (int ky = 0; ky < vectorEnd; ky += 4)
                {
                    const int yBase = j + startOffset + ky;
                    const int y0 = wrap(yBase, height);
                    const int y1 = wrap(yBase + 1, height);
                    const int y2 = wrap(yBase + 2, height);
                    const int y3 = wrap(yBase + 3, height);

                    // _mm256_set_pd takes its arguments from the highest lane down;
                    // both operands use the same order, so lanes pair up correctly.
                    const __m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));
                    const __m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky + 1), kernel.get(kx, ky + 2), kernel.get(kx, ky + 3));
                    const __m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);

                    // Horizontal sum of the four products.
                    _mm256_storeu_pd(lanes, resultVec);
                    blurredPixel += lanes[0] + lanes[1] + lanes[2] + lanes[3];
                }

                // Process the remaining kernel columns (if any) without vector instructions.
                for (int ky = vectorEnd; ky < kernelSize; ++ky)
                {
                    const int y = wrap(j + startOffset + ky, height);
                    blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);
                }
            }
            desImage.set(i, j, blurredPixel);
        }
    }
}
/**
 * Multi-threaded wrap-around convolution of srcImage with kernel, processing
 * four kernel columns per step with 256-bit vector multiplies.
 *
 * The image rows are split into contiguous bands, one std::thread per band;
 * all threads are joined before the function returns. Despite the "SSE" in the
 * name, the implementation uses the AVX intrinsics (__m256d / _mm256_*).
 *
 * @param srcImage Source image (read through numRows/numCols/get).
 * @param desImage Destination image; element (i, j) is set for every source
 *                 pixel. Each thread writes a disjoint set of rows and no lock
 *                 is taken. NOTE(review): this assumes Matrix::set on distinct
 *                 elements is safe to call concurrently - confirm.
 * @param kernel   Convolution kernel. Only numCols() is queried, so the kernel
 *                 is treated as square (numCols x numCols).
 *
 * Border handling is toroidal, using a true non-negative modulo so indices
 * stay in range even when half the kernel is wider than the image.
 */
void greenNoise::parallelGaussianConvolutionSSE(Matrix<double>& srcImage, Matrix<double>& desImage, Matrix<double>& kernel)
{
    const int kernelSize = kernel.numCols();
    const int width = srcImage.numRows();
    const int height = srcImage.numCols();
    const int startOffset = -1 * static_cast<int>(kernelSize / 2);

    // Nothing to compute for an empty image.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // Kernel columns [0, vectorEnd) are handled four at a time; the rest one by one.
    const int vectorEnd = kernelSize - kernelSize % 4;

    // hardware_concurrency() may return 0 when the value cannot be determined;
    // fall back to a single worker so the division below is well defined, and
    // never use more workers than there are rows.
    const int hardwareThreads = static_cast<int>(std::thread::hardware_concurrency());
    const int numThreads = std::max(1, std::min(hardwareThreads, width));
    const int rowsPerThread = (width + numThreads - 1) / numThreads; // ceil(width / numThreads)

    std::vector<std::thread> threads;
    threads.reserve(numThreads);

    for (int t = 0; t < numThreads; ++t)
    {
        const int rowBegin = t * rowsPerThread;
        if (rowBegin >= width)
        {
            break; // rounding up can leave the last workers without rows
        }
        const int rowEnd = std::min(width, rowBegin + rowsPerThread);

        threads.emplace_back([&srcImage, &desImage, &kernel, rowBegin, rowEnd, kernelSize, width, height, startOffset, vectorEnd]()
        {
            // Non-negative modulo; width and height are > 0 here.
            const auto wrap = [](int v, int n) { return ((v % n) + n) % n; };

            double lanes[4];
            for (int i = rowBegin; i < rowEnd; i++)
            {
                for (int j = 0; j < height; j++)
                {
                    double blurredPixel = 0.0;
                    for (int kx = 0; kx < kernelSize; kx++)
                    {
                        const int x = wrap(i + startOffset + kx, width);

                        for (int ky = 0; ky < vectorEnd; ky += 4)
                        {
                            const int yBase = j + startOffset + ky;
                            const int y0 = wrap(yBase, height);
                            const int y1 = wrap(yBase + 1, height);
                            const int y2 = wrap(yBase + 2, height);
                            const int y3 = wrap(yBase + 3, height);

                            // _mm256_set_pd takes its arguments from the highest lane down;
                            // both operands use the same order, so lanes pair up correctly.
                            const __m256d srcValues = _mm256_set_pd(srcImage.get(x, y0), srcImage.get(x, y1), srcImage.get(x, y2), srcImage.get(x, y3));
                            const __m256d kernelValues = _mm256_set_pd(kernel.get(kx, ky), kernel.get(kx, ky + 1), kernel.get(kx, ky + 2), kernel.get(kx, ky + 3));
                            const __m256d resultVec = _mm256_mul_pd(srcValues, kernelValues);

                            // Horizontal sum of the four products.
                            _mm256_storeu_pd(lanes, resultVec);
                            blurredPixel += lanes[0] + lanes[1] + lanes[2] + lanes[3];
                        }

                        // Process the remaining kernel columns (if any) without vector instructions.
                        for (int ky = vectorEnd; ky < kernelSize; ++ky)
                        {
                            const int y = wrap(j + startOffset + ky, height);
                            blurredPixel += kernel.get(kx, ky) * srcImage.get(x, y);
                        }
                    }
                    desImage.set(i, j, blurredPixel);
                }
            }
        });
    }

    // The workers capture the matrices by reference, so they must all finish
    // before this function returns.
    for (auto& thread : threads)
    {
        thread.join();
    }
}