日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 人文社科 > 生活经验 >内容正文

生活经验

windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤

發布時間:2023/11/27 生活经验 31 豆豆
生活随笔 收集整理的這篇文章主要介紹了 windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

一般有兩種方法可以在vs2013上添加運行cuda8.0程序:

一、直接新建一個基于CUDA8.0的項目:如下圖所示,


點擊確定后即可生成test_cuda項目;默認會自動生成一個kernel.cu文件;默認已經配置好Debug/Release, Win32/x64環境,直接編譯運行,結果如下圖所示:函數執行的是兩個數組的加操作。移除kernel.cu文件,加入自己需要的cuda文件即可進行實際操作了,非常方便。


二、實際情況下,多是在已有的項目中添加一些cuda文件,用于加速,下面說下具體的操作步驟:

1、新建一個CUDA_Test x64控制臺空工程;

2、新建CUDA_Test.cpp文件;

3、選中CUDA_Test項目,右鍵單擊-->生成依賴項-->生成自定義,勾選CUDA8.0,點擊確定,如下圖所示:


4、完成第3步后,再次打開工程的屬性配置,會多出兩項,CUDA C/C++和CUDA Linker,如下圖所示:


5、新建或添加幾個已有的文件,包括common.hpp、simple.hpp、simple.cpp、simple.cu,各個文件內容如下:

common.hpp:

#ifndef FBC_CUDA_TEST_COMMON_HPP_
#define FBC_CUDA_TEST_COMMON_HPP_

#include <stdio.h> // fprintf, stderr: used by PRINT_ERROR_INFO below

// Prints a diagnostic to stderr (the stringified argument plus the current
// file, function and line) and then returns -1 from the ENCLOSING function.
//
// Only usable inside functions whose return type accepts -1.
//
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement. This keeps
//     if (cond) PRINT_ERROR_INFO(x); else ...
// well-formed, which a bare { ... } block followed by ';' would not.
#define PRINT_ERROR_INFO(info) \
    do { \
        fprintf(stderr, "Error: %s, file: %s, func: %s, line: %d\n", #info, __FILE__, __FUNCTION__, __LINE__); \
        return -1; \
    } while (0)

#endif // FBC_CUDA_TEST_COMMON_HPP_
simple.hpp:

#ifndef FBC_CUDA_TEST_SIMPLE_HPP_
#define FBC_CUDA_TEST_SIMPLE_HPP_

// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple

// Host implementation of element-wise addition: C[i] = A[i] + B[i] for
// i in [0, numElements). Returns 0.
int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements);

// CUDA implementation of the same addition. A, B and C are host pointers;
// device buffers are handled internally. Returns 0 on success and -1 when a
// CUDA call fails.
int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements);

// Self test: fills two random vectors, adds them with both implementations
// above and compares the results element by element.
// Returns 0 when they agree, -1 otherwise.
int test_vectorAdd();

#endif // FBC_CUDA_TEST_SIMPLE_HPP_
simple.cpp:

#include "simple.hpp"
#include <stdlib.h>
#include <iostream>
#include "common.hpp"// =========================== vector add =============================
int test_vectorAdd()
{// Vector addition: C = A + B, implements element by element vector additionconst int numElements{ 50000 };float* A = new float[numElements];float* B = new float[numElements];float* C1 = new float[numElements];float* C2 = new float[numElements];// Initialize vectorfor (int i = 0; i < numElements; ++i) {A[i] = rand() / (float)RAND_MAX;B[i] = rand() / (float)RAND_MAX;}int ret = vectorAdd_cpu(A, B, C1, numElements);if (ret != 0) PRINT_ERROR_INFO(vectorAdd_cpu);ret = vectorAdd_gpu(A, B, C2, numElements);if (ret != 0) PRINT_ERROR_INFO(vectorAdd_gpu);for (int i = 0; i < numElements; ++i) {if (fabs(C1[i] - C2[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);return -1;}}delete[] A;delete[] B;delete[] C1;delete[] C2;return 0;
}int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements)
{for (int i = 0; i < numElements; ++i) {C[i] = A[i] + B[i];}return 0;
}
simple.cu:

#include "simple.hpp"
#include <iostream>
#include <cuda_runtime.h> // For the CUDA runtime routines (prefixed with "cuda_")
#include <device_launch_parameters.h>// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple// =========================== vector add =============================
// Element-wise addition kernel: C[i] = A[i] + B[i].
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The launch
// must supply at least numElements threads in total; threads whose index is
// past the end do nothing. No shared memory is used.
// A, B and C must point to device memory holding at least numElements floats.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

// Computes C = A + B on the GPU.
//
// A, B        : host input vectors, numElements floats each
// C           : host output vector, numElements floats
// numElements : number of elements; 0 is accepted and is a no-op
//
// Returns 0 on success and -1 on failure (negative count, null pointer, or
// any failing CUDA call; the CUDA error string is printed to stderr).
// All device buffers allocated here are released on every path.
int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements)
{
    if (numElements < 0) {
        fprintf(stderr, "Invalid number of elements: %d\n", numElements);
        return -1;
    }
    // Nothing to add; also avoids launching a grid with zero blocks, which
    // is an invalid launch configuration.
    if (numElements == 0) return 0;

    if (A == nullptr || B == nullptr || C == nullptr) {
        fprintf(stderr, "Null host pointer passed to vectorAdd_gpu\n");
        return -1;
    }

    // Error code to check return values for CUDA calls
    cudaError_t err{ cudaSuccess };
    const size_t length{ static_cast<size_t>(numElements) * sizeof(float) };
    // size_t is printed through unsigned long long: "%d" is wrong for a
    // 64-bit size_t and "%zu" is not available in the VS2013 runtime.
    fprintf(stderr, "Length: %llu\n", static_cast<unsigned long long>(length));

    float* d_A{ nullptr };
    float* d_B{ nullptr };
    float* d_C{ nullptr };

    // Common failure path: report the error, then release whatever has been
    // allocated so far (cudaFree on a null pointer is a no-op).
    auto fail = [&d_A, &d_B, &d_C](const char* what, cudaError_t code) -> int {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(code));
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        d_A = d_B = d_C = nullptr;
        return -1;
    };

    err = cudaMalloc(&d_A, length);
    if (err != cudaSuccess) return fail("Failed to allocate device vector A", err);

    err = cudaMalloc(&d_B, length);
    if (err != cudaSuccess) return fail("Failed to allocate device vector B", err);

    err = cudaMalloc(&d_C, length);
    if (err != cudaSuccess) return fail("Failed to allocate device vector C", err);

    err = cudaMemcpy(d_A, A, length, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return fail("Failed to copy vector A from host to device", err);

    err = cudaMemcpy(d_B, B, length, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return fail("Failed to copy vector B from host to device", err);

    // Launch the Vector Add CUDA kernel
    const int threadsPerBlock = 256;
    // Ceiling division written so it cannot overflow int for large counts
    // (numElements > 0 is guaranteed above).
    const int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
    fprintf(stderr, "CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess) return fail("Failed to launch vectorAdd kernel", err);

    // Copy the device result vector in device memory to the host result vector in host memory.
    // This blocking copy also waits for the kernel to finish.
    err = cudaMemcpy(C, d_C, length, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) return fail("Failed to copy vector C from device to host", err);

    // Attempt all three frees before reporting, so one failing free does not
    // leave the remaining buffers allocated.
    const cudaError_t freeErrA = cudaFree(d_A);
    const cudaError_t freeErrB = cudaFree(d_B);
    const cudaError_t freeErrC = cudaFree(d_C);

    if (freeErrA != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(freeErrA));
        return -1;
    }
    if (freeErrB != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(freeErrB));
        return -1;
    }
    if (freeErrC != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(freeErrC));
        return -1;
    }

    return 0;
}
CUDA_Test.cpp:
#include <iostream>
#include "simple.hpp"int main()
{int ret = test_vectorAdd();if (ret == 0) fprintf(stderr, "***** test success *****\n");else fprintf(stderr, "===== test fail =====\n");return 0;
}
6、調整屬性配置項:

(1)、CUDA C/C++-->Common中的Target Machine Platform默認是32-bit(--machine 32),因為本工程是x64,所以需將其調整為64-bit(--machine 64);

(2)、添加附加庫:鏈接器-->輸入-->附加依賴項:cudart.lib;

(3)、消除nvcc warning: The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release:CUDA C/C++-->Device: Code Generation:由compute_20,sm_20修改為compute_30,sm_30; compute_35,sm_35; compute_37,sm_37; compute_50,sm_50; compute_52,sm_52; compute_60,sm_60

以上code是參考NVIDIA Corporation\CUDA Samples\v8.0\0_Simple中vectorAdd例子進行的改寫,輸出結果如下:



GitHub:https://github.com/fengbingchun/CUDA_Test

總結

以上是生活随笔為你收集整理的windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。